accelerInt  v0.1
radau2a.cu
#include <cuComplex.h>

//various mechanism/solver defns
//these should be included first
#include "header.cuh"
#include "solver_options.cuh"
#include "solver_props.cuh"

//math operations
#include "inverse.cuh"
#include "complexInverse.cuh"

//rate/jacobian subroutines
#ifndef FINITE_DIFFERENCE
#include "jacob.cuh"
#else
#include "fd_jacob.cuh"
#endif
#include "dydt.cuh"
#include "gpu_macros.cuh"

#ifdef GENERATE_DOCS
namespace radau2acu {
#endif

//#define WARP_VOTING
#ifdef WARP_VOTING
    #define ANY(X) (__any((X)))
    #define ALL(X) (__all((X)))
#else
    #define ANY(X) ((X))
    #define ALL(X) ((X))
#endif
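/* With WARP_VOTING enabled, ANY/ALL map to the CUDA warp-vote intrinsics so that all
 * threads of a warp take branch decisions together (step acceptance, retries) instead of
 * diverging; without it they reduce to the plain per-thread condition.  A minimal sketch
 * of the intended use (illustrative only, not part of this solver):
 *
 *     bool converged = (NewtonRate * NewtonIncrement <= NewtonTol); // per-thread test
 *     if (ALL(converged)) {
 *         // every lane of the warp has converged -> leave the Newton loop together
 *     }
 *     if (ANY(info != 0)) {
 *         // at least one lane had a singular LU -> the whole warp halves H and retries
 *     }
 *
 * Note that __any/__all are the pre-CUDA-9 vote intrinsics; on newer toolkits the
 * __any_sync/__all_sync variants would be used instead.
 */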

#define Max_no_steps (200000)
#define NewtonMaxit (8)
#define StartNewton (true)
#define Gustafsson
#define Roundoff (EPS)
#define FacMin (0.2)
#define FacMax (8)
#define FacSafe (0.9)
#define FacRej (0.1)
#define ThetaMin (0.001)
#define NewtonTol (0.03)
#define Qmin (1.0)
#define Qmax (1.2)
//#define UNROLL (8)
#define Max_consecutive_errs (5)
//#define UNROLL (8)
#ifdef DIVERGENCE_TEST
    extern __device__ int integrator_steps[DIVERGENCE_TEST];
#endif
//#define SDIRK_ERROR

// Get the scaling vector for the weighted error norm from the old and new state
__device__
void scale (double const * const __restrict__ y0,
            double const * const __restrict__ y,
            double * const __restrict__ sc) {
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        sc[INDEX(i)] = 1.0 / (ATOL + fmax(fabs(y0[INDEX(i)]), fabs(y[INDEX(i)])) * RTOL);
    }
}

// Get the scaling vector for the weighted norm from the initial state only (first step)
__device__
void scale_init (double const * const __restrict__ y0,
                 double * const __restrict__ sc) {
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        sc[INDEX(i)] = 1.0 / (ATOL + fabs(y0[INDEX(i)]) * RTOL);
    }
}

// Copy one NSP-sized, INDEX-strided vector to another
__device__
void safe_memcpy(double * const __restrict__ dest,
                 double const * const __restrict__ source)
{
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        dest[INDEX(i)] = source[INDEX(i)];
    }
}

// Set three NSP-sized, INDEX-strided vectors to a constant value
__device__
void safe_memset3(double * const __restrict__ dest1,
                  double * const __restrict__ dest2,
                  double * const __restrict__ dest3, const double val)
{
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        dest1[INDEX(i)] = val;
        dest2[INDEX(i)] = val;
        dest3[INDEX(i)] = val;
    }
}

// Set an NSP-sized, INDEX-strided vector to a constant value
__device__
void safe_memset(double * const __restrict__ dest1, const double val)
{
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        dest1[INDEX(i)] = val;
    }
}

// Set an NSP x NSP (Jacobian-sized), INDEX-strided matrix to a constant value
__device__
void safe_memset_jac(double * const __restrict__ dest1, const double val)
{
    #pragma unroll 8
    for (int i = 0; i < NSP * NSP; i++)
    {
        dest1[INDEX(i)] = val;
    }
}

__constant__ double rkA[3][3] = { {
     1.968154772236604258683861429918299e-1,
    -6.55354258501983881085227825696087e-2,
     2.377097434822015242040823210718965e-2
    }, {
     3.944243147390872769974116714584975e-1,
     2.920734116652284630205027458970589e-1,
    -4.154875212599793019818600988496743e-2
    }, {
     3.764030627004672750500754423692808e-1,
     5.124858261884216138388134465196080e-1,
     1.111111111111111111111111111111111e-1
    }
};

__constant__ double rkB[3] = {
    3.764030627004672750500754423692808e-1,
    5.124858261884216138388134465196080e-1,
    1.111111111111111111111111111111111e-1
};

__constant__ double rkC[3] = {
    1.550510257216821901802715925294109e-1,
    6.449489742783178098197284074705891e-1,
    1.0
};
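/* These are the Butcher tableau entries of the 3-stage, 5th-order Radau IIA method:
 * rkA holds the stage coefficients, rkB the quadrature weights, and rkC the abscissae
 * c = ((4 - sqrt(6))/10, (4 + sqrt(6))/10, 1).  Because the method is stiffly accurate,
 * rkB equals the last row of rkA, which is why the step update later simply adds the
 * third stage increment Z3 to y.  A quick consistency check (illustrative only):
 *
 *     assert(fabs(rkB[0] + rkB[1] + rkB[2] - 1.0) < 1e-15);  // weights sum to one
 *     assert(rkB[0] == rkA[2][0] && rkB[1] == rkA[2][1] && rkB[2] == rkA[2][2]);
 */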

//Local order of error estimator
/*
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
!~~~> Diagonalize the RK matrix:
! rkTinv * inv(rkA) * rkT =
!       | rkGamma    0         0       |
!       |   0      rkAlpha  -rkBeta    |
!       |   0      rkBeta    rkAlpha   |
!~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
*/

__constant__ double rkGamma = 3.637834252744495732208418513577775e0;
__constant__ double rkAlpha = 2.681082873627752133895790743211112e0;
__constant__ double rkBeta  = 3.050430199247410569426377624787569e0;

__constant__ double rkT[3][3] = {
    { 9.443876248897524148749007950641664e-2,
     -1.412552950209542084279903838077973e-1,
     -3.00291941051474244918611170890539e-2},
    { 2.502131229653333113765090675125018e-1,
      2.041293522937999319959908102983381e-1,
      3.829421127572619377954382335998733e-1},
    { 1.0e0,
      1.0e0,
      0.0e0}
};

__constant__ double rkTinv[3][3] = {
    { 4.178718591551904727346462658512057e0,
      3.27682820761062387082533272429617e-1,
      5.233764454994495480399309159089876e-1},
    {-4.178718591551904727346462658512057e0,
     -3.27682820761062387082533272429617e-1,
      4.766235545005504519600690840910124e-1},
    {-5.02872634945786875951247343139544e-1,
      2.571926949855605429186785353601676e0,
     -5.960392048282249249688219110993024e-1}
};

__constant__ double rkTinvAinv[3][3] = {
    { 1.520148562492775501049204957366528e+1,
      1.192055789400527921212348994770778e0,
      1.903956760517560343018332287285119e0},
    {-9.669512977505946748632625374449567e0,
     -8.724028436822336183071773193986487e0,
      3.096043239482439656981667712714881e0},
    {-1.409513259499574544876303981551774e+1,
      5.895975725255405108079130152868952e0,
     -1.441236197545344702389881889085515e-1}
};

__constant__ double rkAinvT[3][3] = {
    {0.3435525649691961614912493915818282e0,
    -0.4703191128473198422370558694426832e0,
     0.3503786597113668965366406634269080e0},
    {0.9102338692094599309122768354288852e0,
     1.715425895757991796035292755937326e0,
     0.4040171993145015239277111187301784e0},
    {3.637834252744495732208418513577775e0,
     2.681082873627752133895790743211112e0,
    -3.050430199247410569426377624787569e0}
};
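/* These transformation matrices decouple the 3*NSP x 3*NSP simplified-Newton system into
 * one real and one complex NSP x NSP system: T^(-1) * inv(A) * T has the real eigenvalue
 * rkGamma in the first slot and the complex pair rkAlpha +/- i*rkBeta in the trailing 2x2
 * block.  rkTinvAinv and rkAinvT are the precomputed products T^(-1)*inv(A) and inv(A)*T,
 * so inv(A) is never formed at run time.  The relation could be verified numerically with
 * host copies of these tables (a sketch, illustrative only):
 *
 *     double C[3][3] = {{0}};
 *     for (int i = 0; i < 3; ++i)
 *         for (int j = 0; j < 3; ++j)
 *             for (int k = 0; k < 3; ++k)
 *                 C[i][j] += rkTinvAinv[i][k] * rkT[k][j];
 *     // expect C ~= | rkGamma   0         0       |
 *     //             | 0         rkAlpha  -rkBeta  |
 *     //             | 0         rkBeta    rkAlpha |
 */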

// Classical error estimator:
// H* Sum (B_j-Bhat_j)*f(Z_j) = H*E(0)*f(0) + Sum E_j*Z_j
__constant__ double rkE[4] = {
    0.05,
    -10.04880939982741556246032950764708*0.05,
    1.382142733160748895793662840980412*0.05,
    -0.3333333333333333333333333333333333*0.05
};
/*
// H* Sum Bgam_j*f(Z_j) = H*Bgam(0)*f(0) + Sum Theta_j*Z_j
const static double rkTheta[3] = {
    -1.520677486405081647234271944611547 - 10.04880939982741556246032950764708*0.05,
    2.070455145596436382729929151810376 + 1.382142733160748895793662840980413*0.05,
    -0.3333333333333333333333333333333333*0.05 - 0.3744441479783868387391430179970741
};*/

__constant__ double rkELO = 4;

/*
 * calculate E1 & E2 matrices and their LU decomposition
 */
__device__ void RK_Decomp(double H, const double* const __restrict__ Jac,
                          const solver_memory* const __restrict__ solver,
                          int* __restrict__ info) {
    double* const __restrict__ E1 = solver->E1;
    cuDoubleComplex* const __restrict__ E2 = solver->E2;
    int* const __restrict__ ipiv1 = solver->ipiv1;
    int* const __restrict__ ipiv2 = solver->ipiv2;
    cuDoubleComplex temp = make_cuDoubleComplex(rkAlpha/H, rkBeta/H);
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        #pragma unroll 8
        for (int j = 0; j < NSP; j++)
        {
            E1[INDEX(i + j * NSP)] = -Jac[INDEX(i + j * NSP)];
            E2[INDEX(i + j * NSP)] = make_cuDoubleComplex(-Jac[INDEX(i + j * NSP)], 0);
        }
        E1[INDEX(i + i * NSP)] += rkGamma / H;
        E2[INDEX(i + i * NSP)] = cuCadd(E2[INDEX(i + i * NSP)], temp);
    }
    getLU(NSP, E1, ipiv1, info);
    if (*info != 0) {
        return;
    }
    getComplexLU(NSP, E2, ipiv2, info);
}
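/* RK_Decomp builds the two iteration matrices used by the simplified Newton solve:
 *     E1 = rkGamma/H * I - J                 (real,    factorized in place by getLU)
 *     E2 = (rkAlpha + i*rkBeta)/H * I - J    (complex, factorized by getComplexLU)
 * A non-zero *info signals a numerically singular matrix; the caller reacts by halving
 * the step size and retrying.  The call pattern inside integrate() is, condensed:
 *
 *     int info = 0;
 *     RK_Decomp(H, A, solver, &info);
 *     if (info != 0) { H *= 0.5; Reject = true; SkipLU = false; continue; }
 */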

__device__ void RK_Make_Interpolate(const double* __restrict__ Z1, const double* __restrict__ Z2,
                                    const double* __restrict__ Z3, double* __restrict__ CONT) {
    double den = (rkC[2] - rkC[1]) * (rkC[1] - rkC[0]) * (rkC[0] - rkC[2]);
    #pragma unroll 8
    for (int i = 0; i < NSP; i++) {
        CONT[INDEX(i)] = ((-rkC[2] * rkC[2] * rkC[1] * Z1[INDEX(i)] + Z3[INDEX(i)] * rkC[1] * rkC[0] * rkC[0]
                          + rkC[1] * rkC[1] * rkC[2] * Z1[INDEX(i)] - rkC[1] * rkC[1] * rkC[0] * Z3[INDEX(i)]
                          + rkC[2] * rkC[2] * rkC[0] * Z2[INDEX(i)] - Z2[INDEX(i)] * rkC[2] * rkC[0] * rkC[0])
                          / den) - Z3[INDEX(i)];
        CONT[INDEX(NSP + i)] = -( rkC[0] * rkC[0] * (Z3[INDEX(i)] - Z2[INDEX(i)]) + rkC[1] * rkC[1] * (Z1[INDEX(i)] - Z3[INDEX(i)])
                                + rkC[2] * rkC[2] * (Z2[INDEX(i)] - Z1[INDEX(i)]) ) / den;
        CONT[INDEX(NSP + NSP + i)] = ( rkC[0] * (Z3[INDEX(i)] - Z2[INDEX(i)]) + rkC[1] * (Z1[INDEX(i)] - Z3[INDEX(i)])
                                     + rkC[2] * (Z2[INDEX(i)] - Z1[INDEX(i)]) ) / den;
    }
}
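/* RK_Make_Interpolate fits, per species, a quadratic q(x) = CONT[0] + x*(CONT[NSP] + x*CONT[2*NSP])
 * through the just-accepted stage increments, constructed so that q(rkC[i]) = Z_i - Z_3 and
 * hence q(1) = 0 at the end of the step.  RK_Interpolate below evaluates q at
 * x_i = 1 + rkC[i]*Hnew/Hold to extrapolate starting guesses for the next step's stages.
 * A scalar sketch of that evaluation (one species, no INDEX striding, illustrative only):
 *
 *     double r  = Hnew / Hold;
 *     double x1 = 1.0 + rkC[0] * r;
 *     double z1_guess = c0 + x1 * (c1 + x1 * c2);  // c0, c1, c2 = the three CONT coefficients
 */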

__device__ void RK_Interpolate(double H, double Hold, double* __restrict__ Z1,
                               double* __restrict__ Z2, double* __restrict__ Z3, const double* __restrict__ CONT) {
    double r = H / Hold;
    register double x1 = 1.0 + rkC[0] * r;
    register double x2 = 1.0 + rkC[1] * r;
    register double x3 = 1.0 + rkC[2] * r;
    #pragma unroll 8
    for (int i = 0; i < NSP; i++) {
        // evaluate the quadratic interpolant at 1 + c_i * r for each stage
        Z1[INDEX(i)] = CONT[INDEX(i)] + x1 * (CONT[INDEX(NSP + i)] + x1 * CONT[INDEX(NSP + NSP + i)]);
        Z2[INDEX(i)] = CONT[INDEX(i)] + x2 * (CONT[INDEX(NSP + i)] + x2 * CONT[INDEX(NSP + NSP + i)]);
        Z3[INDEX(i)] = CONT[INDEX(i)] + x3 * (CONT[INDEX(NSP + i)] + x3 * CONT[INDEX(NSP + NSP + i)]);
    }
}


__device__ void WADD(const double* __restrict__ X, const double* __restrict__ Y, double* __restrict__ Z) {
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        Z[INDEX(i)] = X[INDEX(i)] + Y[INDEX(i)];
    }
}

__device__ void DAXPY3(double DA1, double DA2, double DA3,
                       const double* __restrict__ DX, double* __restrict__ DY1,
                       double* __restrict__ DY2, double* __restrict__ DY3) {
    #pragma unroll 8
    for (int i = 0; i < NSP; i++) {
        DY1[INDEX(i)] += DA1 * DX[INDEX(i)];
        DY2[INDEX(i)] += DA2 * DX[INDEX(i)];
        DY3[INDEX(i)] += DA3 * DX[INDEX(i)];
    }
}

/*
 * Prepare the right-hand side for Newton iterations
 *   R = Z - hA * F
 */
__device__ void RK_PrepareRHS(double t, double pr, double H,
                              double const * const __restrict__ Y,
                              const solver_memory* __restrict__ solver,
                              const mechanism_memory* __restrict__ mech,
                              double* __restrict__ TMP,
                              double* __restrict__ F) {
    double const * const __restrict__ Z1 = solver->Z1;
    double const * const __restrict__ Z2 = solver->Z2;
    double const * const __restrict__ Z3 = solver->Z3;
    double * const __restrict__ R1 = solver->DZ1;
    double * const __restrict__ R2 = solver->DZ2;
    double * const __restrict__ R3 = solver->DZ3;

    #pragma unroll
    for (int i = 0; i < NSP; i++) {
        R1[INDEX(i)] = Z1[INDEX(i)];
        R2[INDEX(i)] = Z2[INDEX(i)];
        R3[INDEX(i)] = Z3[INDEX(i)];
    }

    // TMP = Y + Z1
    WADD(Y, Z1, TMP);
    dydt(t + rkC[0] * H, pr, TMP, F, mech);
    // R[:] -= H * rkA[:][0] * F[:]
    DAXPY3(-H * rkA[0][0], -H * rkA[1][0], -H * rkA[2][0], F, R1, R2, R3);

    // TMP = Y + Z2
    WADD(Y, Z2, TMP);
    dydt(t + rkC[1] * H, pr, TMP, F, mech);
    // R[:] -= H * rkA[:][1] * F[:]
    DAXPY3(-H * rkA[0][1], -H * rkA[1][1], -H * rkA[2][1], F, R1, R2, R3);

    // TMP = Y + Z3
    WADD(Y, Z3, TMP);
    dydt(t + rkC[2] * H, pr, TMP, F, mech);
    // R[:] -= H * rkA[:][2] * F[:]
    DAXPY3(-H * rkA[0][2], -H * rkA[1][2], -H * rkA[2][2], F, R1, R2, R3);
}
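/* The three DAXPY3 calls above accumulate, for each stage i = 1..3,
 *     R_i = Z_i - H * sum_j rkA[i][j] * f(t + rkC[j]*H, y + Z_j)
 * which is the residual of the collocation conditions; the Newton corrections DZ_i solved
 * for in RK_Solve drive these residuals to zero.  Each DAXPY3 call spreads one evaluation
 * f(y + Z_j) across all three residuals with weights -H*rkA[0][j], -H*rkA[1][j],
 * -H*rkA[2][j], so dydt is called only three times per Newton iteration.
 */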

__device__ void dlaswp(double * const __restrict__ A,
                       int const * const __restrict__ ipiv) {
    #pragma unroll 8
    for (int i = 0; i < NSP; i++) {
        int ip = ipiv[INDEX(i)];
        if (ip != i) {
            double temp = A[INDEX(i)];
            A[INDEX(i)] = A[INDEX(ip)];
            A[INDEX(ip)] = temp;
        }
    }
}

//diag == 'n' -> nounit = true
//upper == 'u' -> upper = true
__device__ void dtrsm(bool upper, bool nounit,
                      double const * const __restrict__ A,
                      double * const __restrict__ b) {
    if (upper) {
        #pragma unroll 8
        for (int k = NSP - 1; k >= 0; --k)
        {
            if (nounit) {
                b[INDEX(k)] /= A[INDEX(k + k * NSP)];
            }
            #pragma unroll 8
            for (int i = 0; i < k; i++)
            {
                b[INDEX(i)] -= b[INDEX(k)] * A[INDEX(i + k * NSP)];
            }
        }
    }
    else {
        #pragma unroll 8
        for (int k = 0; k < NSP; k++) {
            if (fabs(b[INDEX(k)]) > 0) {
                if (nounit) {
                    b[INDEX(k)] /= A[INDEX(k + k * NSP)];
                }
                #pragma unroll 8
                for (int i = k + 1; i < NSP; i++)
                {
                    b[INDEX(i)] -= b[INDEX(k)] * A[INDEX(i + k * NSP)];
                }
            }
        }
    }
}

__device__ void dgetrs(double * const __restrict__ A,
                       double * const __restrict__ B,
                       int const * const __restrict__ ipiv) {
    dlaswp(B, ipiv);
    dtrsm(false, false, A, B);
    dtrsm(true, true, A, B);
}
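/* dgetrs mirrors LAPACK's DGETRS for a single right-hand side stored in the solver's
 * strided INDEX layout: apply the row interchanges recorded in ipiv, forward-substitute
 * with the implicit unit lower triangle (dtrsm(false, false, ...)), then back-substitute
 * with the non-unit upper triangle (dtrsm(true, true, ...)).  Typical pairing with the
 * factorization, as done in RK_Decomp/RK_Solve (a condensed sketch):
 *
 *     int info = 0;
 *     getLU(NSP, E1, ipiv1, &info);   // in-place LU of E1 with partial pivoting
 *     if (info == 0)
 *         dgetrs(E1, R1, ipiv1);      // R1 is overwritten with E1^{-1} * R1
 */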

__device__ void zlaswp(cuDoubleComplex * const __restrict__ A, int const * const __restrict__ ipiv) {
    #pragma unroll 8
    for (int i = 0; i < NSP; i++) {
        int ip = ipiv[INDEX(i)];
        if (ip != i) {
            cuDoubleComplex temp = A[INDEX(i)];
            A[INDEX(i)] = A[INDEX(ip)];
            A[INDEX(ip)] = temp;
        }
    }
}

//diag == 'n' -> nounit = true
//upper == 'u' -> upper = true
__device__ void ztrsm(bool upper, bool nounit,
                      cuDoubleComplex const * const __restrict__ A,
                      cuDoubleComplex * const __restrict__ b) {
    if (upper) {
        #pragma unroll 8
        for (int k = NSP - 1; k >= 0; --k)
        {
            if (nounit) {
                b[INDEX(k)] = cuCdiv(b[INDEX(k)], A[INDEX(k + k * NSP)]);
            }
            #pragma unroll 8
            for (int i = 0; i < k; i++)
            {
                b[INDEX(i)] = cuCsub(b[INDEX(i)], cuCmul(b[INDEX(k)], A[INDEX(i + k * NSP)]));
            }
        }
    }
    else {
        #pragma unroll 8
        for (int k = 0; k < NSP; k++) {
            if (cuCabs(b[INDEX(k)]) > 0) {
                if (nounit) {
                    b[INDEX(k)] = cuCdiv(b[INDEX(k)], A[INDEX(k + k * NSP)]);
                }
                #pragma unroll 8
                for (int i = k + 1; i < NSP; i++)
                {
                    b[INDEX(i)] = cuCsub(b[INDEX(i)], cuCmul(b[INDEX(k)], A[INDEX(i + k * NSP)]));
                }
            }
        }
    }
}

__device__ void zgetrs(cuDoubleComplex * const __restrict__ A,
                       cuDoubleComplex * const __restrict__ B,
                       int const * const __restrict__ ipiv) {
    zlaswp(B, ipiv);
    ztrsm(false, false, A, B);
    ztrsm(true, true, A, B);
}

__device__ void RK_Solve(const double H,
                         solver_memory const * const __restrict__ solver,
                         cuDoubleComplex * const __restrict__ temp) {

    double* const __restrict__ E1 = solver->E1;
    cuDoubleComplex * const __restrict__ E2 = solver->E2;
    double * const __restrict__ R1 = solver->DZ1;
    double * const __restrict__ R2 = solver->DZ2;
    double * const __restrict__ R3 = solver->DZ3;
    int* const __restrict__ ipiv1 = solver->ipiv1;
    int* const __restrict__ ipiv2 = solver->ipiv2;

    // Z = (1/h) T^(-1) A^(-1) * Z
    #pragma unroll 8
    for (int i = 0; i < NSP; i++)
    {
        double x1 = R1[INDEX(i)] / H;
        double x2 = R2[INDEX(i)] / H;
        double x3 = R3[INDEX(i)] / H;
        R1[INDEX(i)] = rkTinvAinv[0][0] * x1 + rkTinvAinv[0][1] * x2 + rkTinvAinv[0][2] * x3;
        R2[INDEX(i)] = rkTinvAinv[1][0] * x1 + rkTinvAinv[1][1] * x2 + rkTinvAinv[1][2] * x3;
        R3[INDEX(i)] = rkTinvAinv[2][0] * x1 + rkTinvAinv[2][1] * x2 + rkTinvAinv[2][2] * x3;
    }
    dgetrs(E1, R1, ipiv1);
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i)
    {
        temp[INDEX(i)] = make_cuDoubleComplex(R2[INDEX(i)], R3[INDEX(i)]);
    }
    zgetrs(E2, temp, ipiv2);
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i)
    {
        R2[INDEX(i)] = cuCreal(temp[INDEX(i)]);
        R3[INDEX(i)] = cuCimag(temp[INDEX(i)]);
    }

    // Z = T * Z
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        double x1 = R1[INDEX(i)];
        double x2 = R2[INDEX(i)];
        double x3 = R3[INDEX(i)];
        R1[INDEX(i)] = rkT[0][0] * x1 + rkT[0][1] * x2 + rkT[0][2] * x3;
        R2[INDEX(i)] = rkT[1][0] * x1 + rkT[1][1] * x2 + rkT[1][2] * x3;
        R3[INDEX(i)] = rkT[2][0] * x1 + rkT[2][1] * x2 + rkT[2][2] * x3;
    }
}
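/* RK_Solve applies the similarity transform described above to the stacked residuals:
 *     (DZ1, DZ2, DZ3) <- T * [ E1^{-1}, E2^{-1} ] * (1/H) * T^{-1} A^{-1} * (R1, R2, R3)
 * i.e. one real NSP x NSP solve with the LU factors of E1 plus one complex solve with the
 * factors of E2 replace a single dense solve of size 3*NSP.  The pair (R2, R3) is packed
 * into the complex work array as R2 + i*R3 before zgetrs and unpacked again afterwards.
 */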

__device__ double RK_ErrorNorm(double const * const __restrict__ scale,
                               double const * const __restrict__ DY) {
    double sum = 0;
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        sum += (scale[INDEX(i)] * scale[INDEX(i)] * DY[INDEX(i)] * DY[INDEX(i)]);
    }
    return fmax(sqrt(sum / ((double)NSP)), 1e-10);
}
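/* RK_ErrorNorm is the weighted RMS norm used throughout the solver:
 *     err = max( sqrt( (1/NSP) * sum_i (sc_i * DY_i)^2 ), 1e-10 )
 * with sc_i = 1 / (ATOL + max(|y0_i|, |y_i|) * RTOL) from scale()/scale_init(), so a value
 * of 1.0 means the increment sits exactly at the requested tolerance.  The 1e-10 floor
 * keeps later ratios of successive norms (e.g. Theta in the Newton loop) well defined.
 */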

__device__ double RK_ErrorEstimate(const double H, const double t,
                                   const double pr,
                                   double const * const __restrict__ Y,
                                   solver_memory const * const __restrict__ solver,
                                   mechanism_memory const * const __restrict__ mech,
                                   const bool FirstStep, const bool Reject) {

    double HrkE1 = rkE[1] / H;
    double HrkE2 = rkE[2] / H;
    double HrkE3 = rkE[3] / H;

    double * const __restrict__ E1 = mech->jac;
    const double * const __restrict__ F0 = mech->dy;
    double * const __restrict__ F1 = solver->work1;
    double * const __restrict__ F2 = solver->work2;
    double * const __restrict__ TMP = solver->work3;
    double const * const __restrict__ Z1 = solver->Z1;
    double const * const __restrict__ Z2 = solver->Z2;
    double const * const __restrict__ Z3 = solver->Z3;
    int const * __restrict__ ipiv1 = solver->ipiv1;
    double const * __restrict__ scale = solver->scale;

    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        F2[INDEX(i)] = HrkE1 * Z1[INDEX(i)] + HrkE2 * Z2[INDEX(i)] + HrkE3 * Z3[INDEX(i)];
    }
    #pragma unroll 8
    for (int i = 0; i < NSP; ++i) {
        TMP[INDEX(i)] = rkE[0] * F0[INDEX(i)] + F2[INDEX(i)];
    }
    dgetrs(E1, TMP, ipiv1);
    double Err = RK_ErrorNorm(scale, TMP);
    if (Err >= 1.0 && (FirstStep || Reject)) {
        #pragma unroll 8
        for (int i = 0; i < NSP; i++) {
            TMP[INDEX(i)] += Y[INDEX(i)];
        }
        dydt(t, pr, TMP, F1, mech);
        #pragma unroll 8
        for (int i = 0; i < NSP; i++) {
            TMP[INDEX(i)] = F1[INDEX(i)] + F2[INDEX(i)];
        }
        dgetrs(E1, TMP, ipiv1);
        Err = RK_ErrorNorm(scale, TMP);
    }
    return Err;
}
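/* Classical RADAU5-style error estimate: using the LU factors of the real iteration matrix
 * E1 = rkGamma/H * I - J (see RK_Decomp),
 *     est = E1^{-1} * ( rkE[0]*f(y) + (rkE[1]*Z1 + rkE[2]*Z2 + rkE[3]*Z3) / H )
 * and Err = ||est|| in the weighted norm above.  If the raw estimate is already rejecting
 * (Err >= 1) on the very first step or right after a rejection, the estimate is filtered
 * once more by re-evaluating f at y + est and solving with the same factors again, which
 * makes the estimate more reliable in those situations.
 */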

__device__ void integrate (const double t_start,
                           const double t_end,
                           const double var,
                           double * const __restrict__ y,
                           mechanism_memory const * const __restrict__ mech,
                           solver_memory const * const __restrict__ solver) {
    double Hmin = 0;
    double Hold = 0;
#ifdef Gustafsson
    double Hacc = 0;
    double ErrOld = 0;
#endif
#ifdef CONST_TIME_STEP
    double H = t_end - t_start;
#else
    double H = fmin(5e-7, t_end - t_start);
#endif
    double Hnew;
    double t = t_start;
    bool Reject = false;
    bool FirstStep = true;
    bool SkipJac = false;
    bool SkipLU = false;

    double * const __restrict__ A = mech->jac;
    double * const __restrict__ sc = solver->scale;
    double * const __restrict__ y0 = solver->y0;
    double * const __restrict__ F0 = mech->dy;
    double * const __restrict__ work1 = solver->work1;
    double * const __restrict__ work2 = solver->work2;
    cuDoubleComplex * const __restrict__ work4 = solver->work4;
    double * const __restrict__ Z1 = solver->Z1;
    double * const __restrict__ Z2 = solver->Z2;
    double * const __restrict__ Z3 = solver->Z3;
    double * const __restrict__ DZ1 = solver->DZ1;
    double * const __restrict__ DZ2 = solver->DZ2;
    double * const __restrict__ DZ3 = solver->DZ3;
    double * const __restrict__ CONT = solver->CONT;
    int * const __restrict__ result = solver->result;

    scale_init(y, sc);
    safe_memcpy(y0, y);
#ifndef FORCE_ZERO
    safe_memset(F0, 0.0);
#endif
    int info = 0;
    int Nconsecutive = 0;
    int Nsteps = 0;
    double NewtonRate = pow(2.0, 1.25);
    while (t + Roundoff < t_end) {
        #ifdef DIVERGENCE_TEST
            integrator_steps[T_ID] += 1;
        #endif
        if (!Reject) {
            dydt (t, var, y, F0, mech);
        }
        if (!SkipLU) {
            //need to update Jac/LU
            if (!SkipJac) {
#ifndef FINITE_DIFFERENCE
                eval_jacob (t, var, y, A, mech);
#else
                eval_jacob (t, var, y, A, mech, work1, work2);
#endif
            }
            RK_Decomp(H, A, solver, &info);
            if (info != 0) {
                Nconsecutive += 1;
                if (Nconsecutive >= 5)
                {
                    result[T_ID] = EC_consecutive_steps;
                    return;
                }
                H *= 0.5;
                Reject = true;
                SkipJac = true;
                SkipLU = false;
                continue;
            }
            else
            {
                Nconsecutive = 0;
            }
        }
        Nsteps += 1;
        if (Nsteps >= Max_no_steps)
        {
            result[T_ID] = EC_max_steps_exceeded;
            return;
        }
        if (0.1 * fabs(H) <= fabs(t) * Roundoff)
        {
            result[T_ID] = EC_h_plus_t_equals_h;
            return;
        }
        if (FirstStep || !StartNewton) {
            safe_memset3(Z1, Z2, Z3, 0.0);
        } else {
            RK_Interpolate(H, Hold, Z1, Z2, Z3, CONT);
        }
        bool NewtonDone = false;
        double NewtonIncrementOld = 0;
        double Fac = 0.5; //Step reduction if too many iterations
        int NewtonIter = 0;
        double Theta = 0;

        //reuse previous NewtonRate
        NewtonRate = pow(fmax(NewtonRate, EPS), 0.8);

        for (; NewtonIter < NewtonMaxit; NewtonIter++) {
            RK_PrepareRHS(t, var, H, y, solver, mech, work1, work2);
            RK_Solve(H, solver, work4);
            double d1 = RK_ErrorNorm(sc, DZ1);
            double d2 = RK_ErrorNorm(sc, DZ2);
            double d3 = RK_ErrorNorm(sc, DZ3);
            double NewtonIncrement = sqrt((d1 * d1 + d2 * d2 + d3 * d3) / 3.0);

            Theta = ThetaMin;
            if (NewtonIter > 0)
            {
                Theta = NewtonIncrement / NewtonIncrementOld;
                if (Theta >= 0.99)
                    break;
                else
                    NewtonRate = Theta / (1.0 - Theta);
                //Predict error at the end of the Newton process
                double NewtonPredictedErr = (NewtonIncrement * pow(Theta, (NewtonMaxit - NewtonIter - 1))) / (1.0 - Theta);
                if (NewtonPredictedErr >= NewtonTol) {
                    //Non-convergence of Newton: predicted error too large
                    double Qnewton = fmin(10.0, NewtonPredictedErr / NewtonTol);
                    Fac = 0.8 * pow(Qnewton, -1.0 / ((double)(NewtonMaxit - NewtonIter)));
                    break;
                }
            }

            NewtonIncrementOld = fmax(NewtonIncrement, Roundoff);
            // Update solution
            #pragma unroll 8
            for (int i = 0; i < NSP; i++)
            {
                Z1[INDEX(i)] -= DZ1[INDEX(i)];
                Z2[INDEX(i)] -= DZ2[INDEX(i)];
                Z3[INDEX(i)] -= DZ3[INDEX(i)];
            }

            NewtonDone = (NewtonRate * NewtonIncrement <= NewtonTol);
            if (NewtonDone) break;
            if (NewtonIter == NewtonMaxit - 1)
            {
                result[T_ID] = EC_newton_max_iterations_exceeded;
                return;
            }
        }
#ifndef CONST_TIME_STEP
        if (!NewtonDone) {
            H = Fac * H;
            Reject = true;
            SkipJac = true;
            SkipLU = false;
            continue;
        }

        double Err = RK_ErrorEstimate(H, t, var, y,
                                      solver, mech, FirstStep, Reject);

        Fac = pow(Err, (-1.0 / rkELO)) * (1.0 + 2 * NewtonMaxit) / (NewtonIter + 1.0 + 2 * NewtonMaxit);
        Fac = fmin(FacMax, fmax(FacMin, Fac));
        Hnew = Fac * H;

        if (Err < 1.0) {
#ifdef Gustafsson
            if (!FirstStep) {
                double FacGus = FacSafe * (H / Hacc) * pow(Err * Err / ErrOld, -0.25);
                FacGus = fmin(FacMax, fmax(FacMin, FacGus));
                Fac = fmin(Fac, FacGus);
                Hnew = Fac * H;
            }
            Hacc = H;
            ErrOld = fmax(1e-2, Err);
#endif
            FirstStep = false;
            Hold = H;
            t += H;
            #pragma unroll 8
            for (int i = 0; i < NSP; i++) {
                y[INDEX(i)] += Z3[INDEX(i)];
            }
            // Construct the solution quadratic interpolant Q(c_i) = Z_i, i=1:3
            if (StartNewton) {
                RK_Make_Interpolate(Z1, Z2, Z3, CONT);
            }
            scale(y, y0, sc);
            safe_memcpy(y0, y);
            Hnew = fmin(fmax(Hnew, Hmin), t_end - t);
            if (Reject) {
                Hnew = fmin(Hnew, H);
            }
            Reject = false;
            if (t + Hnew / Qmin - t_end >= 0.0) {
                H = t_end - t;
            } else {
                double Hratio = Hnew / H;
                // Reuse the LU decomposition
                SkipLU = (Theta <= ThetaMin) && (Hratio >= Qmin) && (Hratio <= Qmax);
                if (!SkipLU) H = Hnew;
            }
            // If convergence is fast enough, do not update Jacobian
            SkipJac = NewtonIter == 1 || NewtonRate <= ThetaMin;
        }
        else {
            if (FirstStep || Reject) {
                H = FacRej * H;
            } else {
                H = Hnew;
            }
            Reject = true;
            SkipJac = true;
            SkipLU = false;
        }
#else
        //constant time stepping
        //update y & t
        t += H;
        #pragma unroll 8
        for (int i = 0; i < NSP; i++) {
            y[INDEX(i)] += Z3[INDEX(i)];
        }
#endif
    }
    result[T_ID] = EC_success;
}
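/* A minimal sketch of how integrate() is driven, one IVP per CUDA thread.  The generic
 * driver lives elsewhere in the solver (see solver_main.cu); the kernel and array names
 * below are illustrative assumptions, not part of this file:
 *
 *     __global__ void example_driver(const double t, const double t_end,
 *                                    const double* __restrict__ param,
 *                                    double* __restrict__ y_global,
 *                                    const mechanism_memory* mech,
 *                                    const solver_memory* solver,
 *                                    const int num_odes) {
 *         if (T_ID < num_odes) {
 *             // y_global is assumed to already be laid out in the strided INDEX() format,
 *             // so the global pointer is passed directly and INDEX() adds the thread offset
 *             integrate(t, t_end, param[T_ID], y_global, mech, solver);
 *         }
 *     }
 */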