20 #include <cuComplex.h>    33 #ifndef FINITE_DIFFERENCE    47     #define ANY(X) (__any((X)))    48     #define ALL(X) (__all((X)))    55 #define Max_no_steps (200000)    56 #define NewtonMaxit (8)    58 #define StartNewton (true)    62 #define Roundoff (EPS)    72 #define ThetaMin (0.001)    74 #define NewtonTol (0.03)    82 #define Max_consecutive_errs (5)    84 #ifdef DIVERGENCE_TEST    96 void scale (
double const * 
const __restrict__ y0,
    97             double const * 
const __restrict__ y,
    98             double * 
const __restrict__ sc) {
   100     for (
int i = 0; i < 
NSP; ++i) {
   111 void scale_init (
double const * 
const __restrict__ y0,
   112                  double * 
const __restrict__ sc) {
   114     for (
int i = 0; i < 
NSP; ++i) {
   125 void safe_memcpy(
double * 
const __restrict__ dest,
   126                  double const * 
const __restrict__ source)
   129     for (
int i = 0; i < 
NSP; i++)
   143 void safe_memset3(
double * 
const __restrict__ dest1,
   144                   double * 
const __restrict__ dest2,
   145                   double * 
const __restrict__ dest3, 
const double val)
   148     for (
int i = 0; i < 
NSP; i++)
   150         dest1[
INDEX(i)] = val;
   151         dest2[
INDEX(i)] = val;
   152         dest3[
INDEX(i)] = val;
   162 void safe_memset(
double * 
const __restrict__ dest1, 
const double val)
   165     for (
int i = 0; i < 
NSP; i++)
   167         dest1[
INDEX(i)] = val;
   177 void safe_memset_jac(
double * 
const __restrict__ dest1, 
const double val)
   180     for (
int i = 0; i < 
NSP * 
NSP; i++)
   182         dest1[
INDEX(i)] = val;
   191 __constant__ 
double rkA[3][3] = { {
   192      1.968154772236604258683861429918299e-1,
   193     -6.55354258501983881085227825696087e-2,
   194      2.377097434822015242040823210718965e-2
   196      3.944243147390872769974116714584975e-1,
   197      2.920734116652284630205027458970589e-1,
   198     -4.154875212599793019818600988496743e-2
   200      3.764030627004672750500754423692808e-1,
   201      5.124858261884216138388134465196080e-1,
   202      1.111111111111111111111111111111111e-1
   206 __constant__ 
double rkB[3] = {
   207 3.764030627004672750500754423692808e-1,
   208 5.124858261884216138388134465196080e-1,
   209 1.111111111111111111111111111111111e-1
   212 __constant__ 
double rkC[3] = {
   213 1.550510257216821901802715925294109e-1,
   214 6.449489742783178098197284074705891e-1,
   229 __constant__ 
double rkGamma = 3.637834252744495732208418513577775e0;
   230 __constant__ 
double rkAlpha = 2.681082873627752133895790743211112e0;
   231 __constant__ 
double rkBeta  = 3.050430199247410569426377624787569e0;
   233 __constant__ 
double rkT[3][3] = {
   234 {9.443876248897524148749007950641664e-2,
   235 -1.412552950209542084279903838077973e-1,
   236 -3.00291941051474244918611170890539e-2},
   237 {2.502131229653333113765090675125018e-1,
   238 2.041293522937999319959908102983381e-1,
   239 3.829421127572619377954382335998733e-1},
   245 __constant__ 
double rkTinv[3][3] = {
   246 {4.178718591551904727346462658512057e0,
   247 3.27682820761062387082533272429617e-1,
   248 5.233764454994495480399309159089876e-1},
   249 {-4.178718591551904727346462658512057e0,
   250 -3.27682820761062387082533272429617e-1,
   251 4.766235545005504519600690840910124e-1},
   252 {-5.02872634945786875951247343139544e-1,
   253 2.571926949855605429186785353601676e0,
   254 -5.960392048282249249688219110993024e-1}
   258 {1.520148562492775501049204957366528e+1,
   259 1.192055789400527921212348994770778e0,
   260 1.903956760517560343018332287285119e0},
   261 {-9.669512977505946748632625374449567e0,
   262 -8.724028436822336183071773193986487e0,
   263 3.096043239482439656981667712714881e0},
   264 {-1.409513259499574544876303981551774e+1,
   265 5.895975725255405108079130152868952e0,
   266 -1.441236197545344702389881889085515e-1}
   269 __constant__ 
double rkAinvT[3][3] = {
   270 {0.3435525649691961614912493915818282e0,
   271 -0.4703191128473198422370558694426832e0,
   272 0.3503786597113668965366406634269080e0},
   273 {0.9102338692094599309122768354288852e0,
   274 1.715425895757991796035292755937326e0,
   275 0.4040171993145015239277111187301784e0},
   276 {3.637834252744495732208418513577775e0,
   277 2.681082873627752133895790743211112e0,
   278 -3.050430199247410569426377624787569e0}
   283 __constant__ 
double rkE[4] = {
   285 -10.04880939982741556246032950764708*0.05,
   286 1.382142733160748895793662840980412*0.05,
   287 -0.3333333333333333333333333333333333*0.05
   297 __constant__ 
double rkELO = 4;
   309 __device__ 
void RK_Decomp(
double H, 
const double* 
const __restrict__ Jac,
   311                             int* __restrict__ info) {
   312     double* 
const __restrict__ E1 = solver->E1;
   313     cuDoubleComplex* 
const __restrict__ E2 = solver->E2;
   314     int* 
const __restrict__ ipiv1 = solver->ipiv1;
   315     int* 
const __restrict__ ipiv2 = solver->ipiv2;
   316     cuDoubleComplex temp = make_cuDoubleComplex(
rkAlpha/H, 
rkBeta/H);
   318     for (
int i = 0; i < 
NSP; i++)
   321         for(
int j = 0; j < 
NSP; j++)
   336 __device__ 
void RK_Make_Interpolate(
const double* __restrict__ Z1, 
const double* __restrict__ Z2,
   337                                         const double* __restrict__ Z3, 
double* __restrict__ CONT) {
   340     for (
int i = 0; i < 
NSP; i++) {
   344                     /den) - Z3[
INDEX(i)];
   352 __device__ 
void RK_Interpolate(
double H, 
double Hold, 
double* __restrict__ Z1,
   353                                 double* __restrict__ Z2, 
double* __restrict__ Z3, 
const double* __restrict__ CONT) {
   355     register double x1 = 1.0 + 
rkC[0] * r;
   356     register double x2 = 1.0 + 
rkC[1] * r;
   357     register double x3 = 1.0 + 
rkC[2] * r;
   359     for (
int i = 0; i < 
NSP; i++) {
   367 __device__ 
void WADD(
const double* __restrict__ X, 
const double* __restrict__ Y, 
double* __restrict__ Z) {
   369     for (
int i = 0; i < 
NSP; i++)
   375 __device__ 
void DAXPY3(
double DA1, 
double DA2, 
double DA3,
   376                         const double* __restrict__ DX, 
double* __restrict__ DY1,
   377                         double* __restrict__ DY2, 
double* __restrict__ DY3) {
   379     for (
int i = 0; i < 
NSP; i++) {
   391                                 double const * 
const __restrict__ Y,
   393                                 const mechanism_memory* __restrict__ mech,
   394                                 double* __restrict__ TMP,
   395                                 double* __restrict__ F) {
   396     double const * 
const __restrict__ Z1 = solver->Z1;
   397     double const * 
const __restrict__ Z2 = solver->Z2;
   398     double const * 
const __restrict__ Z3 = solver->Z3;
   399     double * 
const __restrict__ R1 = solver->DZ1;
   400     double * 
const __restrict__ R2 = solver->DZ2;
   401     double * 
const __restrict__ R3 = solver->DZ3;
   404     for (
int i = 0; i < 
NSP; i++) {
   412     dydt(t + 
rkC[0] * H, pr, TMP, F, mech);
   418     dydt(t + 
rkC[1] * H, pr, TMP, F, mech);
   424     dydt(t + 
rkC[2] * H, pr, TMP, F, mech);
   429 __device__ 
void dlaswp(
double * 
const __restrict__ A,
   430                        int const * 
const __restrict__ ipiv) {
   432     for (
int i = 0; i < 
NSP; i++) {
   433         int ip = ipiv[
INDEX(i)];
   435             double temp = A[
INDEX(i)];
   444 __device__ 
void dtrsm(
bool upper, 
bool nounit,
   445                       double const * 
const __restrict__ A,
   446                       double * 
const __restrict__ b) {
   449         for (
int k = 
NSP - 1; k >= 0; --k)
   455             for (
int i = 0; i < k; i++)
   463         for (
int k = 0; k < 
NSP; k++) {
   464             if (fabs(b[
INDEX(k)]) > 0) {
   469                 for (
int i = k + 1; i < 
NSP; i++)
   478 __device__ 
void dgetrs(
double * 
const __restrict__ A,
   479                        double * 
const __restrict__ B,
   480                        int const * 
const __restrict__ ipiv) {
   482     dtrsm(
false, 
false, A, B);
   483     dtrsm(
true, 
true, A, B);
   486 __device__ 
void zlaswp(cuDoubleComplex * 
const __restrict__ A, 
int const * 
const __restrict__ ipiv) {
   488     for (
int i = 0; i < 
NSP; i++) {
   489         int ip = ipiv[
INDEX(i)];
   491             cuDoubleComplex temp = A[
INDEX(i)];
   500 __device__ 
void ztrsm(
bool upper, 
bool nounit,
   501                       cuDoubleComplex 
const * 
const __restrict__ A,
   502                       cuDoubleComplex * 
const __restrict__ b) {
   505         for (
int k = 
NSP - 1; k >= 0; --k)
   511             for (
int i = 0; i < k; i++)
   519         for (
int k = 0; k < 
NSP; k++) {
   520             if (cuCabs(b[
INDEX(k)]) > 0) {
   525                 for (
int i = k + 1; i < 
NSP; i++)
   534 __device__ 
void zgetrs(cuDoubleComplex * 
const __restrict__ A,
   535                        cuDoubleComplex * 
const __restrict__ B,
   536                        int const * 
const __restrict__ ipiv) {
   538     ztrsm(
false, 
false, A, B);
   539     ztrsm(
true, 
true, A, B);
   542 __device__ 
void RK_Solve(
const double H,
   544                                 cuDoubleComplex * 
const __restrict__ temp) {
   546     double* 
const __restrict__ E1 = solver->E1;
   547     cuDoubleComplex * 
const __restrict__ E2 = solver->E2;
   548     double * 
const __restrict__ R1 = solver->DZ1;
   549     double * 
const __restrict__ R2 = solver->DZ2;
   550     double * 
const __restrict__ R3 = solver->DZ3;
   551     int* 
const __restrict__ ipiv1 = solver->ipiv1;
   552     int* 
const __restrict__ ipiv2 = solver->ipiv2;
   556     for(
int i = 0; i < 
NSP; i++)
   558         double x1 = R1[
INDEX(i)] / H;
   559         double x2 = R2[
INDEX(i)] / H;
   560         double x3 = R3[
INDEX(i)] / H;
   565     dgetrs(E1, R1, ipiv1);
   567     for (
int i = 0; i < 
NSP; ++i)
   571     zgetrs(E2, temp, ipiv2);
   573     for (
int i = 0; i < 
NSP; ++i)
   581     for (
int i = 0; i < 
NSP; ++i) {
   582         double x1 = R1[
INDEX(i)];
   583         double x2 = R2[
INDEX(i)];
   584         double x3 = R3[
INDEX(i)];
   592                                double const * 
const __restrict__ DY) {
   595     for (
int i = 0; i < 
NSP; ++i){
   598     return fmax(sqrt(sum / ((
double)
NSP)), 1e-10);
   603                                              double const * 
const __restrict__ Y,
   605                                              mechanism_memory 
const * 
const __restrict__ mech,
   606                                              const bool FirstStep, 
const bool Reject) {
   608     double HrkE1  = 
rkE[1]/H;
   609     double HrkE2  = 
rkE[2]/H;
   610     double HrkE3  = 
rkE[3]/H;
   612     double * 
const __restrict__ E1 = mech->jac;
   613     const double * 
const __restrict__ F0 = mech->dy;
   614     double * 
const __restrict__ F1 = solver->work1;
   615     double * 
const __restrict__ F2 = solver->work2;
   616     double * 
const __restrict__ TMP = solver->work3;
   617     double const * 
const __restrict__ Z1 = solver->Z1;
   618     double const * 
const __restrict__ Z2 = solver->Z2;
   619     double const * 
const __restrict__ Z3 = solver->Z3;
   620     int const * __restrict__ ipiv1 = solver->ipiv1;
   621     double const * __restrict__ 
scale = solver->scale;
   624     for (
int i = 0; i < 
NSP; ++i) {
   628     for (
int i = 0; i < 
NSP; ++i) {
   631     dgetrs(E1, TMP, ipiv1);
   633     if (Err >= 1.0 && (FirstStep || Reject)) {
   635         for (
int i = 0; i < 
NSP; i++) {
   638         dydt(t, pr, TMP, F1, mech);
   640         for (
int i = 0; i < 
NSP; i++) {
   643         dgetrs(E1, TMP, ipiv1);
   653 __device__ 
void integrate (
const double t_start,
   656                             double * 
const __restrict__ y,
   657                             mechanism_memory 
const * 
const __restrict__ mech,
   665 #ifdef CONST_TIME_STEP   666     double H = t_end - t_start;
   668     double H = fmin(5e-7, t_end - t_start);
   673     bool FirstStep = 
true;
   674     bool SkipJac = 
false;
   677     double * 
const __restrict__ A = mech->jac;
   678     double * 
const __restrict__ sc = solver->scale;
   679     double * 
const __restrict__ y0 = solver->y0;
   680     double * 
const __restrict__ F0 = mech->dy;
   681     double * 
const __restrict__ work1 = solver->work1;
   682     double * 
const __restrict__ work2 = solver->work2;
   683     cuDoubleComplex * 
const __restrict__ work4 = solver->work4;
   684     double * 
const __restrict__ Z1 = solver->Z1;
   685     double * 
const __restrict__ Z2 = solver->Z2;
   686     double * 
const __restrict__ Z3 = solver->Z3;
   687     double * 
const __restrict__ DZ1 = solver->DZ1;
   688     double * 
const __restrict__ DZ2 = solver->DZ2;
   689     double * 
const __restrict__ DZ3 = solver->DZ3;
   690     double * 
const __restrict__ CONT = solver->CONT;
   691     int * 
const __restrict__ result = solver->result;
   696     safe_memset(F0, 0.0);
   699     int Nconsecutive = 0;
   701     double NewtonRate = pow(2.0, 1.25);
   703         #ifdef DIVERGENCE_TEST   707             dydt (t, var, y, F0, mech);
   712 #ifndef FINITE_DIFFERENCE   715                 eval_jacob (t, var, y, A, mech, work1, work2);
   721                 if (Nconsecutive >= 5)
   743         if (0.1 * fabs(H) <= fabs(t) * 
Roundoff)
   749             safe_memset3(Z1, Z2, Z3, 0.0);
   753         bool NewtonDone = 
false;
   754         double NewtonIncrementOld = 0;
   760         NewtonRate = pow(fmax(NewtonRate, 
EPS), 0.8);
   768             double NewtonIncrement = sqrt((d1 * d1 + d2 * d2 + d3 * d3) / 3.0);
   773                 Theta = NewtonIncrement / NewtonIncrementOld;
   777                     NewtonRate = Theta / (1.0 - Theta);
   779                 double NewtonPredictedErr = (NewtonIncrement * pow(Theta, (
NewtonMaxit - NewtonIter - 1))) / (1.0 - Theta);
   782                     double Qnewton = fmin(10.0, NewtonPredictedErr / 
NewtonTol);
   783                     Fac = 0.8 * pow(Qnewton, -1.0/((
double)(
NewtonMaxit-NewtonIter)));
   788             NewtonIncrementOld = fmax(NewtonIncrement, 
Roundoff);
   791             for (
int i = 0; i < 
NSP; i++)
   798             NewtonDone = (NewtonRate * NewtonIncrement <= 
NewtonTol);
   799             if (NewtonDone) 
break;
   806 #ifndef CONST_TIME_STEP   816                         solver, mech, FirstStep, Reject);
   826                 double FacGus = 
FacSafe * (H / Hacc) * pow(Err * Err / ErrOld, -0.25);
   828                 Fac = fmin(Fac, FacGus);
   832             ErrOld = fmax(1e-2, Err);
   838             for (
int i = 0; i < 
NSP; i++) {
   847             Hnew = fmin(fmax(Hnew, Hmin), t_end - t);
   849                 Hnew = fmin(Hnew, H);
   852             if (t + Hnew / 
Qmin - t_end >= 0.0) {
   855                 double Hratio = Hnew / H;
   857                 SkipLU = (Theta <= ThetaMin) && (Hratio>=
Qmin) && (Hratio<=
Qmax);
   858                 if (!SkipLU) H = Hnew;
   861             SkipJac = NewtonIter == 1 || NewtonRate <= 
ThetaMin;
   864             if (FirstStep || Reject) {
   878         for (
int i = 0; i < 
NSP; i++) {
 static void DAXPY3(const double DA1, const double DA2, const double DA3, const double *__restrict__ DX, double *__restrict__ DY1, double *__restrict__ DY2, double *__restrict__ DY3)
Sepcialization of DAXPY with unrolled (or at least bounds known at compile time) loops. 
 
#define T_ID
The global CUDA thread index. 
 
__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
Get scaling for weighted norm for the initial timestep (used in krylov process) 
 
Defines some simple macros to simplify GPU indexing. 
 
static const double rkBeta
 
#define EC_consecutive_steps
Maximum number of consecutive internal timesteps with error reached. 
 
void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
An implementation of the RHS of the van der Pol equation. 
 
void eval_jacob(const double t, const double pres, const double *cy, double *jac)
Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ...
 
static const double rkELO
 
static const double rkTinv[3][3]
 
static double RK_ErrorEstimate(const double H, const double t, const double pr, const double *__restrict__ Y, const double *__restrict__ F0, const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, const double *__restrict__ scale, double *__restrict__ E1, int *__restrict__ ipiv1, const bool FirstStep, const bool Reject)
Computes and returns the error estimate for this step. 
 
static const double rkT[3][3]
 
static const double rkTinvAinv[3][3]
 
Header definitions for CUDA LU factorization routines. 
 
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking. 
 
Contains header definitions for the CUDA RHS function for the van der Pol example. 
 
static const double rkGamma
 
static const double rkC[3]
 
static const double rkAinvT[3][3]
 
Header definition of CUDA Finite Difference Jacobian. 
 
Headers for CUDA LU decomposition implementation. 
 
#define EC_success
Successful time step. 
 
simple convenience file to include the correct solver properties file 
 
static void RK_PrepareRHS(const double t, const double pr, const double H, const double *__restrict__ Y, const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, double *__restrict__ R1, double *__restrict__ R2, double *__restrict__ R3)
Prepare the right-hand side for Newton iterations: . 
 
__device__ void integrate(const double, const double, const double, double *const __restrict__, mechanism_memory const *const __restrict__, solver_memory const *const __restrict__)
 
__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
Get scaling for weighted norm. 
 
static double RK_ErrorNorm(const double *__restrict__ scale, double *__restrict__ DY)
Computes the scaled error norm from the given scale and DY vectors. 
 
__device__ void getLU(const int n, double *__restrict__ A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x n) matrix using partial pivoting with row interchanges...
 
static void WADD(const double *__restrict__ X, const double *__restrict__ Y, double *__restrict__ Z)
Performs  with unrolled (or at least bounds known at compile time) loops. 
 
static const double rkAlpha
 
static const double rkB[3]
 
__device__ void getComplexLU(const int n, cuDoubleComplex *__restrict__ A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x n) matrix using partial pivoting with row interchanges...
 
#define EC_newton_max_iterations_exceeded
Maximum allowed Newton Iteration steps exceeded. 
 
A file generated by Scons that specifies various options to the solvers. 
 
static void RK_Interpolate(const double H, const double Hold, double *__restrict__ Z1, double *__restrict__ Z2, double *__restrict__ Z3, const double *__restrict__ CONT)
Apply quadaratic interpolate to get initial values. 
 
#define EC_max_steps_exceeded
Maximum number of internal timesteps exceeded. 
 
static void RK_Decomp(const double H, double *__restrict__ E1, double complex *__restrict__ E2, const double *__restrict__ Jac, int *__restrict__ ipiv1, int *__restrict__ ipiv2, int *__restrict__ info)
Compute E1 & E2 matricies and their LU Decomposition. 
 
static void RK_Solve(const double H, double *__restrict__ E1, double complex *__restrict__ E2, double *__restrict__ R1, double *__restrict__ R2, double *__restrict__ R3, int *__restrict__ ipiv1, int *__restrict__ ipiv2)
Solves for the RHS values in the Newton iteration. 
 
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
 
static const double rkE[4]
 
static const double rkA[3][3]
 
#define EC_h_plus_t_equals_h
Timescale reduced such that t + h == t in floating point math. 
 
static void RK_Make_Interpolate(const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, double *__restrict__ CONT)
Compute Quadaratic interpolate. 
 
Contains a header definition for the CUDA van der Pol Jacobian evaluation.