20 #include <cuComplex.h> 33 #ifndef FINITE_DIFFERENCE 47 #define ANY(X) (__any((X))) 48 #define ALL(X) (__all((X))) 55 #define Max_no_steps (200000) 56 #define NewtonMaxit (8) 58 #define StartNewton (true) 62 #define Roundoff (EPS) 72 #define ThetaMin (0.001) 74 #define NewtonTol (0.03) 82 #define Max_consecutive_errs (5) 84 #ifdef DIVERGENCE_TEST 96 void scale (
double const *
const __restrict__ y0,
97 double const *
const __restrict__ y,
98 double *
const __restrict__ sc) {
100 for (
int i = 0; i <
NSP; ++i) {
111 void scale_init (
double const *
const __restrict__ y0,
112 double *
const __restrict__ sc) {
114 for (
int i = 0; i <
NSP; ++i) {
125 void safe_memcpy(
double *
const __restrict__ dest,
126 double const *
const __restrict__ source)
129 for (
int i = 0; i <
NSP; i++)
143 void safe_memset3(
double *
const __restrict__ dest1,
144 double *
const __restrict__ dest2,
145 double *
const __restrict__ dest3,
const double val)
148 for (
int i = 0; i <
NSP; i++)
150 dest1[
INDEX(i)] = val;
151 dest2[
INDEX(i)] = val;
152 dest3[
INDEX(i)] = val;
162 void safe_memset(
double *
const __restrict__ dest1,
const double val)
165 for (
int i = 0; i <
NSP; i++)
167 dest1[
INDEX(i)] = val;
177 void safe_memset_jac(
double *
const __restrict__ dest1,
const double val)
180 for (
int i = 0; i <
NSP *
NSP; i++)
182 dest1[
INDEX(i)] = val;
191 __constant__
double rkA[3][3] = { {
192 1.968154772236604258683861429918299e-1,
193 -6.55354258501983881085227825696087e-2,
194 2.377097434822015242040823210718965e-2
196 3.944243147390872769974116714584975e-1,
197 2.920734116652284630205027458970589e-1,
198 -4.154875212599793019818600988496743e-2
200 3.764030627004672750500754423692808e-1,
201 5.124858261884216138388134465196080e-1,
202 1.111111111111111111111111111111111e-1
206 __constant__
double rkB[3] = {
207 3.764030627004672750500754423692808e-1,
208 5.124858261884216138388134465196080e-1,
209 1.111111111111111111111111111111111e-1
212 __constant__
double rkC[3] = {
213 1.550510257216821901802715925294109e-1,
214 6.449489742783178098197284074705891e-1,
229 __constant__
double rkGamma = 3.637834252744495732208418513577775e0;
230 __constant__
double rkAlpha = 2.681082873627752133895790743211112e0;
231 __constant__
double rkBeta = 3.050430199247410569426377624787569e0;
233 __constant__
double rkT[3][3] = {
234 {9.443876248897524148749007950641664e-2,
235 -1.412552950209542084279903838077973e-1,
236 -3.00291941051474244918611170890539e-2},
237 {2.502131229653333113765090675125018e-1,
238 2.041293522937999319959908102983381e-1,
239 3.829421127572619377954382335998733e-1},
245 __constant__
double rkTinv[3][3] = {
246 {4.178718591551904727346462658512057e0,
247 3.27682820761062387082533272429617e-1,
248 5.233764454994495480399309159089876e-1},
249 {-4.178718591551904727346462658512057e0,
250 -3.27682820761062387082533272429617e-1,
251 4.766235545005504519600690840910124e-1},
252 {-5.02872634945786875951247343139544e-1,
253 2.571926949855605429186785353601676e0,
254 -5.960392048282249249688219110993024e-1}
258 {1.520148562492775501049204957366528e+1,
259 1.192055789400527921212348994770778e0,
260 1.903956760517560343018332287285119e0},
261 {-9.669512977505946748632625374449567e0,
262 -8.724028436822336183071773193986487e0,
263 3.096043239482439656981667712714881e0},
264 {-1.409513259499574544876303981551774e+1,
265 5.895975725255405108079130152868952e0,
266 -1.441236197545344702389881889085515e-1}
269 __constant__
double rkAinvT[3][3] = {
270 {0.3435525649691961614912493915818282e0,
271 -0.4703191128473198422370558694426832e0,
272 0.3503786597113668965366406634269080e0},
273 {0.9102338692094599309122768354288852e0,
274 1.715425895757991796035292755937326e0,
275 0.4040171993145015239277111187301784e0},
276 {3.637834252744495732208418513577775e0,
277 2.681082873627752133895790743211112e0,
278 -3.050430199247410569426377624787569e0}
283 __constant__
double rkE[4] = {
285 -10.04880939982741556246032950764708*0.05,
286 1.382142733160748895793662840980412*0.05,
287 -0.3333333333333333333333333333333333*0.05
297 __constant__
double rkELO = 4;
309 __device__
void RK_Decomp(
double H,
const double*
const __restrict__ Jac,
311 int* __restrict__ info) {
312 double*
const __restrict__ E1 = solver->E1;
313 cuDoubleComplex*
const __restrict__ E2 = solver->E2;
314 int*
const __restrict__ ipiv1 = solver->ipiv1;
315 int*
const __restrict__ ipiv2 = solver->ipiv2;
316 cuDoubleComplex temp = make_cuDoubleComplex(
rkAlpha/H,
rkBeta/H);
318 for (
int i = 0; i <
NSP; i++)
321 for(
int j = 0; j <
NSP; j++)
336 __device__
void RK_Make_Interpolate(
const double* __restrict__ Z1,
const double* __restrict__ Z2,
337 const double* __restrict__ Z3,
double* __restrict__ CONT) {
340 for (
int i = 0; i <
NSP; i++) {
344 /den) - Z3[
INDEX(i)];
352 __device__
void RK_Interpolate(
double H,
double Hold,
double* __restrict__ Z1,
353 double* __restrict__ Z2,
double* __restrict__ Z3,
const double* __restrict__ CONT) {
355 register double x1 = 1.0 +
rkC[0] * r;
356 register double x2 = 1.0 +
rkC[1] * r;
357 register double x3 = 1.0 +
rkC[2] * r;
359 for (
int i = 0; i <
NSP; i++) {
367 __device__
void WADD(
const double* __restrict__ X,
const double* __restrict__ Y,
double* __restrict__ Z) {
369 for (
int i = 0; i <
NSP; i++)
375 __device__
void DAXPY3(
double DA1,
double DA2,
double DA3,
376 const double* __restrict__ DX,
double* __restrict__ DY1,
377 double* __restrict__ DY2,
double* __restrict__ DY3) {
379 for (
int i = 0; i <
NSP; i++) {
391 double const *
const __restrict__ Y,
393 const mechanism_memory* __restrict__ mech,
394 double* __restrict__ TMP,
395 double* __restrict__ F) {
396 double const *
const __restrict__ Z1 = solver->Z1;
397 double const *
const __restrict__ Z2 = solver->Z2;
398 double const *
const __restrict__ Z3 = solver->Z3;
399 double *
const __restrict__ R1 = solver->DZ1;
400 double *
const __restrict__ R2 = solver->DZ2;
401 double *
const __restrict__ R3 = solver->DZ3;
404 for (
int i = 0; i <
NSP; i++) {
412 dydt(t +
rkC[0] * H, pr, TMP, F, mech);
418 dydt(t +
rkC[1] * H, pr, TMP, F, mech);
424 dydt(t +
rkC[2] * H, pr, TMP, F, mech);
429 __device__
void dlaswp(
double *
const __restrict__ A,
430 int const *
const __restrict__ ipiv) {
432 for (
int i = 0; i <
NSP; i++) {
433 int ip = ipiv[
INDEX(i)];
435 double temp = A[
INDEX(i)];
444 __device__
void dtrsm(
bool upper,
bool nounit,
445 double const *
const __restrict__ A,
446 double *
const __restrict__ b) {
449 for (
int k =
NSP - 1; k >= 0; --k)
455 for (
int i = 0; i < k; i++)
463 for (
int k = 0; k <
NSP; k++) {
464 if (fabs(b[
INDEX(k)]) > 0) {
469 for (
int i = k + 1; i <
NSP; i++)
478 __device__
void dgetrs(
double *
const __restrict__ A,
479 double *
const __restrict__ B,
480 int const *
const __restrict__ ipiv) {
482 dtrsm(
false,
false, A, B);
483 dtrsm(
true,
true, A, B);
486 __device__
void zlaswp(cuDoubleComplex *
const __restrict__ A,
int const *
const __restrict__ ipiv) {
488 for (
int i = 0; i <
NSP; i++) {
489 int ip = ipiv[
INDEX(i)];
491 cuDoubleComplex temp = A[
INDEX(i)];
500 __device__
void ztrsm(
bool upper,
bool nounit,
501 cuDoubleComplex
const *
const __restrict__ A,
502 cuDoubleComplex *
const __restrict__ b) {
505 for (
int k =
NSP - 1; k >= 0; --k)
511 for (
int i = 0; i < k; i++)
519 for (
int k = 0; k <
NSP; k++) {
520 if (cuCabs(b[
INDEX(k)]) > 0) {
525 for (
int i = k + 1; i <
NSP; i++)
534 __device__
void zgetrs(cuDoubleComplex *
const __restrict__ A,
535 cuDoubleComplex *
const __restrict__ B,
536 int const *
const __restrict__ ipiv) {
538 ztrsm(
false,
false, A, B);
539 ztrsm(
true,
true, A, B);
542 __device__
void RK_Solve(
const double H,
544 cuDoubleComplex *
const __restrict__ temp) {
546 double*
const __restrict__ E1 = solver->E1;
547 cuDoubleComplex *
const __restrict__ E2 = solver->E2;
548 double *
const __restrict__ R1 = solver->DZ1;
549 double *
const __restrict__ R2 = solver->DZ2;
550 double *
const __restrict__ R3 = solver->DZ3;
551 int*
const __restrict__ ipiv1 = solver->ipiv1;
552 int*
const __restrict__ ipiv2 = solver->ipiv2;
556 for(
int i = 0; i <
NSP; i++)
558 double x1 = R1[
INDEX(i)] / H;
559 double x2 = R2[
INDEX(i)] / H;
560 double x3 = R3[
INDEX(i)] / H;
565 dgetrs(E1, R1, ipiv1);
567 for (
int i = 0; i <
NSP; ++i)
571 zgetrs(E2, temp, ipiv2);
573 for (
int i = 0; i <
NSP; ++i)
581 for (
int i = 0; i <
NSP; ++i) {
582 double x1 = R1[
INDEX(i)];
583 double x2 = R2[
INDEX(i)];
584 double x3 = R3[
INDEX(i)];
592 double const *
const __restrict__ DY) {
595 for (
int i = 0; i <
NSP; ++i){
598 return fmax(sqrt(sum / ((
double)
NSP)), 1e-10);
603 double const *
const __restrict__ Y,
605 mechanism_memory
const *
const __restrict__ mech,
606 const bool FirstStep,
const bool Reject) {
608 double HrkE1 =
rkE[1]/H;
609 double HrkE2 =
rkE[2]/H;
610 double HrkE3 =
rkE[3]/H;
612 double *
const __restrict__ E1 = mech->jac;
613 const double *
const __restrict__ F0 = mech->dy;
614 double *
const __restrict__ F1 = solver->work1;
615 double *
const __restrict__ F2 = solver->work2;
616 double *
const __restrict__ TMP = solver->work3;
617 double const *
const __restrict__ Z1 = solver->Z1;
618 double const *
const __restrict__ Z2 = solver->Z2;
619 double const *
const __restrict__ Z3 = solver->Z3;
620 int const * __restrict__ ipiv1 = solver->ipiv1;
621 double const * __restrict__
scale = solver->scale;
624 for (
int i = 0; i <
NSP; ++i) {
628 for (
int i = 0; i <
NSP; ++i) {
631 dgetrs(E1, TMP, ipiv1);
633 if (Err >= 1.0 && (FirstStep || Reject)) {
635 for (
int i = 0; i <
NSP; i++) {
638 dydt(t, pr, TMP, F1, mech);
640 for (
int i = 0; i <
NSP; i++) {
643 dgetrs(E1, TMP, ipiv1);
653 __device__
void integrate (
const double t_start,
656 double *
const __restrict__ y,
657 mechanism_memory
const *
const __restrict__ mech,
665 #ifdef CONST_TIME_STEP 666 double H = t_end - t_start;
668 double H = fmin(5e-7, t_end - t_start);
673 bool FirstStep =
true;
674 bool SkipJac =
false;
677 double *
const __restrict__ A = mech->jac;
678 double *
const __restrict__ sc = solver->scale;
679 double *
const __restrict__ y0 = solver->y0;
680 double *
const __restrict__ F0 = mech->dy;
681 double *
const __restrict__ work1 = solver->work1;
682 double *
const __restrict__ work2 = solver->work2;
683 cuDoubleComplex *
const __restrict__ work4 = solver->work4;
684 double *
const __restrict__ Z1 = solver->Z1;
685 double *
const __restrict__ Z2 = solver->Z2;
686 double *
const __restrict__ Z3 = solver->Z3;
687 double *
const __restrict__ DZ1 = solver->DZ1;
688 double *
const __restrict__ DZ2 = solver->DZ2;
689 double *
const __restrict__ DZ3 = solver->DZ3;
690 double *
const __restrict__ CONT = solver->CONT;
691 int *
const __restrict__ result = solver->result;
696 safe_memset(F0, 0.0);
699 int Nconsecutive = 0;
701 double NewtonRate = pow(2.0, 1.25);
703 #ifdef DIVERGENCE_TEST 707 dydt (t, var, y, F0, mech);
712 #ifndef FINITE_DIFFERENCE 715 eval_jacob (t, var, y, A, mech, work1, work2);
721 if (Nconsecutive >= 5)
743 if (0.1 * fabs(H) <= fabs(t) *
Roundoff)
749 safe_memset3(Z1, Z2, Z3, 0.0);
753 bool NewtonDone =
false;
754 double NewtonIncrementOld = 0;
760 NewtonRate = pow(fmax(NewtonRate,
EPS), 0.8);
768 double NewtonIncrement = sqrt((d1 * d1 + d2 * d2 + d3 * d3) / 3.0);
773 Theta = NewtonIncrement / NewtonIncrementOld;
777 NewtonRate = Theta / (1.0 - Theta);
779 double NewtonPredictedErr = (NewtonIncrement * pow(Theta, (
NewtonMaxit - NewtonIter - 1))) / (1.0 - Theta);
782 double Qnewton = fmin(10.0, NewtonPredictedErr /
NewtonTol);
783 Fac = 0.8 * pow(Qnewton, -1.0/((
double)(
NewtonMaxit-NewtonIter)));
788 NewtonIncrementOld = fmax(NewtonIncrement,
Roundoff);
791 for (
int i = 0; i <
NSP; i++)
798 NewtonDone = (NewtonRate * NewtonIncrement <=
NewtonTol);
799 if (NewtonDone)
break;
806 #ifndef CONST_TIME_STEP 816 solver, mech, FirstStep, Reject);
826 double FacGus =
FacSafe * (H / Hacc) * pow(Err * Err / ErrOld, -0.25);
828 Fac = fmin(Fac, FacGus);
832 ErrOld = fmax(1e-2, Err);
838 for (
int i = 0; i <
NSP; i++) {
847 Hnew = fmin(fmax(Hnew, Hmin), t_end - t);
849 Hnew = fmin(Hnew, H);
852 if (t + Hnew /
Qmin - t_end >= 0.0) {
855 double Hratio = Hnew / H;
857 SkipLU = (Theta <= ThetaMin) && (Hratio>=
Qmin) && (Hratio<=
Qmax);
858 if (!SkipLU) H = Hnew;
861 SkipJac = NewtonIter == 1 || NewtonRate <=
ThetaMin;
864 if (FirstStep || Reject) {
878 for (
int i = 0; i <
NSP; i++) {
static void DAXPY3(const double DA1, const double DA2, const double DA3, const double *__restrict__ DX, double *__restrict__ DY1, double *__restrict__ DY2, double *__restrict__ DY3)
Sepcialization of DAXPY with unrolled (or at least bounds known at compile time) loops.
#define T_ID
The global CUDA thread index.
__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
Get scaling for weighted norm for the initial timestep (used in krylov process)
Defines some simple macros to simplify GPU indexing.
static const double rkBeta
#define EC_consecutive_steps
Maximum number of consecutive internal timesteps with error reached.
void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
An implementation of the RHS of the van der Pol equation.
void eval_jacob(const double t, const double pres, const double *cy, double *jac)
Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ...
static const double rkELO
static const double rkTinv[3][3]
static double RK_ErrorEstimate(const double H, const double t, const double pr, const double *__restrict__ Y, const double *__restrict__ F0, const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, const double *__restrict__ scale, double *__restrict__ E1, int *__restrict__ ipiv1, const bool FirstStep, const bool Reject)
Computes and returns the error estimate for this step.
static const double rkT[3][3]
static const double rkTinvAinv[3][3]
Header definitions for CUDA LU factorization routines.
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking.
Contains header definitions for the CUDA RHS function for the van der Pol example.
static const double rkGamma
static const double rkC[3]
static const double rkAinvT[3][3]
Header definition of CUDA Finite Difference Jacobian.
Headers for CUDA LU decomposition implementation.
#define EC_success
Successful time step.
simple convenience file to include the correct solver properties file
static void RK_PrepareRHS(const double t, const double pr, const double H, const double *__restrict__ Y, const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, double *__restrict__ R1, double *__restrict__ R2, double *__restrict__ R3)
Prepare the right-hand side for Newton iterations: .
__device__ void integrate(const double, const double, const double, double *const __restrict__, mechanism_memory const *const __restrict__, solver_memory const *const __restrict__)
__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
Get scaling for weighted norm.
static double RK_ErrorNorm(const double *__restrict__ scale, double *__restrict__ DY)
Computes the scaled error norm from the given scale and DY vectors.
__device__ void getLU(const int n, double *__restrict__ A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x n) matrix using partial pivoting with row interchanges...
static void WADD(const double *__restrict__ X, const double *__restrict__ Y, double *__restrict__ Z)
Performs with unrolled (or at least bounds known at compile time) loops.
static const double rkAlpha
static const double rkB[3]
__device__ void getComplexLU(const int n, cuDoubleComplex *__restrict__ A, int *__restrict__ indPivot, int *__restrict__ info)
Computes the LU factorization of a (n x n) matrix using partial pivoting with row interchanges...
#define EC_newton_max_iterations_exceeded
Maximum allowed Newton Iteration steps exceeded.
A file generated by Scons that specifies various options to the solvers.
static void RK_Interpolate(const double H, const double Hold, double *__restrict__ Z1, double *__restrict__ Z2, double *__restrict__ Z3, const double *__restrict__ CONT)
Apply quadaratic interpolate to get initial values.
#define EC_max_steps_exceeded
Maximum number of internal timesteps exceeded.
static void RK_Decomp(const double H, double *__restrict__ E1, double complex *__restrict__ E2, const double *__restrict__ Jac, int *__restrict__ ipiv1, int *__restrict__ ipiv2, int *__restrict__ info)
Compute E1 & E2 matricies and their LU Decomposition.
static void RK_Solve(const double H, double *__restrict__ E1, double complex *__restrict__ E2, double *__restrict__ R1, double *__restrict__ R2, double *__restrict__ R3, int *__restrict__ ipiv1, int *__restrict__ ipiv2)
Solves for the RHS values in the Newton iteration.
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
static const double rkE[4]
static const double rkA[3][3]
#define EC_h_plus_t_equals_h
Timescale reduced such that t + h == t in floating point math.
static void RK_Make_Interpolate(const double *__restrict__ Z1, const double *__restrict__ Z2, const double *__restrict__ Z3, double *__restrict__ CONT)
Compute Quadaratic interpolate.
Contains a header definition for the CUDA van der Pol Jacobian evaluation.