accelerInt/exp4_8cu_source.html

 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <stdbool.h>
 #include <cuComplex.h>

 //various mechanism/solver defns
 //these should be included first
 #include "header.cuh"
 #include "solver_options.cuh"
 #include "solver_props.cuh"

 #include "dydt.cuh"
 #ifndef FINITE_DIFFERENCE
     #include "jacob.cuh"
 #else
     #include "fd_jacob.cuh"
 #endif
 #include "arnoldi.cuh"
 #include "exponential_linear_algebra.cuh"
 #include "solver_init.cuh"
 #include "gpu_macros.cuh"

 #ifdef GENERATE_DOCS
 namespace exp4cu {
 #endif

 // Optional diagnostic storage. These are declarations only (extern); the
 // definitions live in another translation unit.
 // NOTE(review): none of the LOG_KRYLOV_AND_STEPSIZES arrays is written by the
 // integrate() routine visible in this file -- confirm whether logging is
 // still expected from the GPU solver.
 #ifdef LOG_KRYLOV_AND_STEPSIZES
     // presumably one entry per internal step (arrays are sized MAX_STEPS):
     // error estimate, Krylov subspace sizes of the three Arnoldi passes,
     // time, step size and accept/reject flag -- TODO confirm against the definitions
     extern __device__ double err_log[MAX_STEPS];
     extern __device__ int m_log[MAX_STEPS];
     extern __device__ int m1_log[MAX_STEPS];
     extern __device__ int m2_log[MAX_STEPS];
     extern __device__ double t_log[MAX_STEPS];
     extern __device__ double h_log[MAX_STEPS];
     extern __device__ bool reject_log[MAX_STEPS];
     extern __device__ int num_integrator_steps;
 #endif
 #ifdef DIVERGENCE_TEST
     // Per-thread counter, indexed by T_ID and incremented once per pass of the
     // integrate() step loop; DIVERGENCE_TEST doubles as the array length.
     extern __device__ int integrator_steps[DIVERGENCE_TEST];
 #endif

 /**
  * \brief Advance the state vector y from t_start to t_end with the EXP4
  *        exponential integrator, using Krylov (Arnoldi) projections.
  *
  * Each pass of the main loop attempts one internal step of size h:
  *   1. evaluate the RHS (fy) and Jacobian (A) at the current state, unless
  *      the previous attempt was rejected (y is unchanged in that case),
  *   2. run three Arnoldi projections (for fy, and for the two correction
  *      vectors stored in k4 and k7) and assemble the stages k1..k7,
  *   3. form the candidate solution y1 and two embedded error estimates,
  *   4. accept or reject the step and choose the next h (adaptive build), or
  *      unconditionally accept it (CONST_TIME_STEP build).
  *
  * If an Arnoldi call returns m < 0 or m + P >= STRIDE, the step size is
  * divided by 5 and the step is retried.
  *
  * All vectors are accessed through INDEX(i), i.e. each thread works on its
  * own strided slice of the shared work arrays.
  *
  * \param[in]     t_start  Start time of the integration interval.
  * \param[in]     t_end    End time of the integration interval.
  * \param[in]     pr       Parameter forwarded unchanged to dydt / eval_jacob.
  * \param[in,out] y        State vector; overwritten with each accepted step.
  * \param[in]     mech     Mechanism memory providing the dy and jac buffers.
  * \param[in]     solver   Solver memory providing work arrays, Krylov
  *                         storage (Hm, Vm, phiHm), stages k1..k7 and the
  *                         per-thread result array.
  *
  * On return solver->result[T_ID] holds one of EC_success,
  * EC_consecutive_steps, EC_max_steps_exceeded or EC_h_plus_t_equals_h.
  */
 __device__
 void integrate (const double t_start, const double t_end, const double pr,
                 double* __restrict__ y, const mechanism_memory* __restrict__ mech,
                 const solver_memory* __restrict__ solver) {

     //initial step size
 #ifdef CONST_TIME_STEP
     double h = t_end - t_start;
 #else
     double h = fmin(1.0e-8, t_end - t_start);
 #endif
     double h_new;

     //controller history (error and step size of the last accepted step)
     double err_old = 1.0;
     double h_old = h;
     //norm returned by arnoldi through its beta argument
     double beta = 0;
     double err = 0.0;

     //true while the previous attempt at the current (t, y) was rejected
     bool reject = false;
     //Arnoldi failures since the last completed step
     int failures = 0;
     //total number of loop passes, accepted or not
     int steps = 0;

     double t = t_start;

     //arrays
     double * const __restrict__ sc = solver->sc;
     double * const __restrict__ work1 = solver->work1;
     double * const __restrict__ work2 = solver->work2;
     double * const __restrict__ y1 = solver->work3;
     cuDoubleComplex * const __restrict__ work4 = solver->work4;
     double * const __restrict__ fy = mech->dy;
     double * const __restrict__ A = mech->jac;
     double * const __restrict__ Hm = solver->Hm;
     double * const __restrict__ Vm = solver->Vm;
     double * const __restrict__ phiHm = solver->phiHm;
     double * const __restrict__ k1 = solver->k1;
     double * const __restrict__ k2 = solver->k2;
     double * const __restrict__ k3 = solver->k3;
     double * const __restrict__ k4 = solver->k4;
     double * const __restrict__ k5 = solver->k5;
     double * const __restrict__ k6 = solver->k6;
     double * const __restrict__ k7 = solver->k7;
     int * const __restrict__ result = solver->result;

     // get scaling for weighted norm
     scale_init(y, sc);

     //main time-stepping loop
     while (t < t_end) {

         //error checking
         if (failures >= MAX_CONSECUTIVE_ERRORS)
         {
             result[T_ID] = EC_consecutive_steps;
             return;
         }
         if (steps++ >= MAX_STEPS)
         {
             result[T_ID] = EC_max_steps_exceeded;
             return;
         }
         //h has become too small to advance t in floating point
         if (t + h <= t)
         {
             result[T_ID] = EC_h_plus_t_equals_h;
             return;
         }

         //after a rejected attempt y is unchanged, so fy and A are still valid
         if (!reject) {
             dydt (t, pr, y, fy, mech);
         #ifdef FINITE_DIFFERENCE
             eval_jacob (t, pr, y, A, mech, work1, work2);
         #else
             eval_jacob (t, pr, y, A, mech);
         #endif
         }

         #ifdef DIVERGENCE_TEST
         integrator_steps[T_ID]++;
         #endif
         //first Krylov projection, built from the RHS fy
         int m = arnoldi(1.0 / 3.0, P, h, A, solver, fy, &beta, work1, work4);
         if (m + P >= STRIDE || m < 0)
         {
             //need to reduce h and try again
             h /= 5.0;
             failures++;
             reject = true;
             continue;
         }

         // k1
         //k1 is partially in the first column of phiHm
         //k1 = beta * Vm * phiHm(:, 1)
         matvec_n_by_m_scale(m, beta, Vm, phiHm, k1);

         // k2
         //computing phi(2h * A)
         matvec_m_by_m (m, phiHm, phiHm, work1);
         //note: work2 will contain hm * phi * phi * e1 for later use
         matvec_m_by_m (m, Hm, work1, work2);
         matvec_n_by_m_scale_add(m, beta * (h / 6.0), Vm, work2, k2, k1);

         // k3
         //use the stored hm * phi * phi * e1 to get phi(3h * A)
         matvec_m_by_m (m, phiHm, work2, work1);
         matvec_m_by_m (m, Hm, work1, work2);
         matvec_n_by_m_scale_add_subtract(m, beta * (h * h / 27.0), Vm, work2, k3, k2, k1);

         // d4
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             // f4
             work2[INDEX(i)] = h * ((-7.0 / 300.0) * k1[INDEX(i)] + (97.0 / 150.0) * k2[INDEX(i)] - (37.0 / 300.0) * k3[INDEX(i)]);

             //intermediate state y + f4 (k4 is used as scratch here)
             k4[INDEX(i)] = y[INDEX(i)] + work2[INDEX(i)];
         }

         //work1 = f(y + f4); k4 = A * f4
         dydt (t, pr, k4, work1, mech);
         sparse_multiplier (A, work2, k4);

         //d4 = f(y + f4) - f(y) - A * f4
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             k4[INDEX(i)] = work1[INDEX(i)] - fy[INDEX(i)] - k4[INDEX(i)];
         }

         //do arnoldi (second projection, built from d4)
         int m1 = arnoldi(1.0 / 3.0, P, h, A, solver, k4, &beta, work1, work4);
         if (m1 + P >= STRIDE || m1 < 0)
         {
             //need to reduce h and try again
             h /= 5.0;
             failures++;
             reject = true;
             continue;
         }
         //k4 = beta * Vm * phiHm(:, 1), same form as k1
         matvec_n_by_m_scale(m1, beta, Vm, phiHm, k4);

         // k5
         //computing phi(2h * A)
         matvec_m_by_m (m1, phiHm, phiHm, work1);
         //note: work2 will contain hm * phi * phi * e1 for later use
         matvec_m_by_m (m1, Hm, work1, work2);
         matvec_n_by_m_scale_add(m1, beta * (h / 6.0), Vm, work2, k5, k4);

         // k6
         //use the stored hm * phi * phi * e1 to get phi(3h * A)
         matvec_m_by_m (m1, phiHm, work2, work1);
         matvec_m_by_m (m1, Hm, work1, work2);
         matvec_n_by_m_scale_add_subtract(m1, beta * (h * h / 27.0), Vm, work2, k6, k5, k4);

         // k7
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             // f7
             work2[INDEX(i)] = h * ((59.0 / 300.0) * k1[INDEX(i)] - (7.0 / 75.0) * k2[INDEX(i)] + (269.0 / 300.0) * k3[INDEX(i)] + (2.0 / 3.0) * (k4[INDEX(i)] + k5[INDEX(i)] + k6[INDEX(i)]));

             //intermediate state y + f7 (k7 is used as scratch here)
             k7[INDEX(i)] = y[INDEX(i)] + work2[INDEX(i)];
         }

         //work1 = f(y + f7); k7 = A * f7
         dydt (t, pr, k7, work1, mech);
         sparse_multiplier (A, work2, k7);

         //d7 = f(y + f7) - f(y) - A * f7
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             k7[INDEX(i)] = work1[INDEX(i)] - fy[INDEX(i)] - k7[INDEX(i)];
         }

         //third projection, built from d7
         int m2 = arnoldi(1.0 / 3.0, P, h, A, solver, k7, &beta, work1, work4);
         if (m2 + P >= STRIDE || m2 < 0)
         {
             //need to reduce h and try again
             h /= 5.0;
             failures++;
             reject = true;
             continue;
         }
         //k7 is partially in the m'th column of phiHm
         matvec_n_by_m_scale(m2, beta / (h / 3.0), Vm, &phiHm[GRID_DIM * m2 * STRIDE], k7);

         // y_n+1
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             y1[INDEX(i)] = y[INDEX(i)] + h * (k3[INDEX(i)] + k4[INDEX(i)] - (4.0 / 3.0) * k5[INDEX(i)] + k6[INDEX(i)] + (1.0 / 6.0) * k7[INDEX(i)]);
         }

         //work2 = weighted-norm scaling computed from y and y1
         scale (y, y1, work2);

         // calculate errors

         // error of embedded order 3 method
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             work1[INDEX(i)] = k3[INDEX(i)] - (2.0 / 3.0) * k5[INDEX(i)] + 0.5 * (k6[INDEX(i)] + k7[INDEX(i)] - k4[INDEX(i)]) - (y1[INDEX(i)] - y[INDEX(i)]) / h;
         }
         err = h * sc_norm(work1, work2);

         // error of embedded W method
         #pragma unroll
         for (int i = 0; i < NSP; ++i) {
             work1[INDEX(i)] = -k1[INDEX(i)] + 2.0 * k2[INDEX(i)] - k4[INDEX(i)] + k7[INDEX(i)] - (y1[INDEX(i)] - y[INDEX(i)]) / h;
         }
         //double err_W = h * sc_norm(temp, sc);
         //use the smaller of the two estimates, bounded below by EPS
         err = fmax(EPS, fmin(err, h * sc_norm(work1, work2)));

         // classical step size calculation
         h_new = pow(err, -1.0 / ORD);

 #ifndef CONST_TIME_STEP
         //all three Arnoldi passes succeeded, so the failure streak is over
         failures = 0;
         if (err <= 1.0) {
             // update y, t and scale
             #pragma unroll
             for (int i = 0; i < NSP; ++i)
             {
                 y[INDEX(i)] = y1[INDEX(i)];
                 sc[INDEX(i)] = work2[INDEX(i)];
             }
             t += h;

             // minimum of classical and Gustafsson step size prediction
             h_new = fmin(h_new, (h / h_old) * pow((err_old / (err * err)), (1.0 / ORD)));

             // limit to 0.2 <= (h_new / h) <= 8.0
             h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);


             // store time step and error
             err_old = fmax(1.0e-2, err);
             h_old = h;

             // check if last step rejected
             if (reject) {
                 reject = false;
                 //do not grow the step right after a rejection
                 h_new = fmin(h, h_new);
             }
             //never step past t_end
             h = fmin(h_new, t_end - t);

         } else {

             // limit to 0.2 <= (h_new / h) <= 8.0
             h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
             h_new = fmin(h_new, t_end - t);

             reject = true;
             h = fmin(h, h_new);
         }
 #else
         //constant time stepping
         //update y & t
         #pragma unroll
         for (int i = 0; i < NSP; ++i)
         {
             y[INDEX(i)] = y1[INDEX(i)];
         }
         t += h;

         //the step was taken, so y has changed: clear the failure state so
         //that fy and A are re-evaluated at the new state on the next pass
         //(otherwise a single earlier Arnoldi failure would leave reject set
         //for the rest of the integration and the stale Jacobian/RHS reused)
         failures = 0;
         reject = false;
         //h may have been reduced after an Arnoldi failure, and t += h can
         //fall short of t_end by round-off; never step past t_end
         h = fmin(h, t_end - t);
 #endif

     } // end while

     result[T_ID] = EC_success;
 }

 #ifdef GENERATE_DOCS
 }
 #endif
T_ID
#define T_ID
The global CUDA thread index.
Definition: gpu_macros.cuh:22

scale_init
__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
Get scaling for weighted norm for the initial timestep (used in krylov process)
Definition: exponential_linear_algebra.cu:166

gpu_macros.cuh
Defines some simple macros to simplify GPU indexing.

P
#define P
max order of the phi functions (for error estimation)
Definition: exp4_props.cuh:22

EC_consecutive_steps
#define EC_consecutive_steps
Maximum number of consecutive internal timesteps with error reached.
Definition: radau2a_props.cuh:75

van_der_pol::dydt
void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
An implementation of the RHS of the van der Pol equation.
Definition: dydt.c:22

eval_jacob
void eval_jacob(const double t, const double pres, const double *cy, double *jac)
Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ...
Definition: fd_jacob.c:24

DIVERGENCE_TEST
#define DIVERGENCE_TEST
Definition: solver_options.cuh:79

integrator_steps
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking.
Definition: solver_main.cu:44

NSP
#define NSP
The IVP system size.
Definition: header.cuh:20

GRID_DIM
#define GRID_DIM
The total number of threads in the Grid, provides an offset between vector entries.
Definition: gpu_macros.cuh:20

solver_init.cuh
Header definitions for solver initialization routines.

STRIDE
#define STRIDE
the matrix dimensions
Definition: radau2a_props.cuh:20

arnoldi.cuh
Implementation of the GPU arnoldi iteration methods.

MAX_STEPS
#define MAX_STEPS
Maximum allowed internal timesteps per integration step.
Definition: exp4_props.cuh:30

dydt.cuh
Contains header definitions for the CUDA RHS function for the van der Pol example.

exp4cu::solver_memory
Structure containing memory needed for EXP4 algorithm.
Definition: exp4_props.cuh:37

fd_jacob.cuh
Header definition of CUDA Finite Difference Jacobian.

EC_success
#define EC_success
Successful time step.
Definition: radau2a_props.cuh:73

matvec_n_by_m_scale_add
__device__ void matvec_n_by_m_scale_add(const int m, const double scale, const double *__restrict__ A, const double *__restrict__ V, double *__restrict__ Av, const double *__restrict__ add)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
Definition: exponential_linear_algebra.cu:114

van_der_pol::sparse_multiplier
void sparse_multiplier(const double *A, const double *Vm, double *w)
Implements Jacobian \ vector multiplication in sparse (or unrolled) form.
Definition: sparse_multiplier.c:21

exponential_linear_algebra.cuh
Definitions of various linear algebra functions needed in the exponential integrators.

solver_props.cuh
simple convenience file to include the correct solver properties file

exp4cu
Definition: exp4.cu:41

matvec_n_by_m_scale_add_subtract
__device__ void matvec_n_by_m_scale_add_subtract(const int m, const double scale, const double *__restrict__ A, const double *V, double *__restrict__ Av, const double *__restrict__ add, const double *__restrict__ sub)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
Definition: exponential_linear_algebra.cu:134

scale
__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
Get scaling for weighted norm.
Definition: exponential_linear_algebra.cu:156

MAX_CONSECUTIVE_ERRORS
#define MAX_CONSECUTIVE_ERRORS
Number of consecutive errors on internal integration steps allowed before exit.
Definition: exp4_props.cuh:32

matvec_m_by_m
__device__ void matvec_m_by_m(const int m, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
Matrix-vector multiplication of a matrix sized MxM and a vector Mx1.
Definition: exponential_linear_algebra.cu:15

header.cuh
An example header file that defines system size, memory functions and other required methods for inte...

arnoldi
__device__ int arnoldi(const double scale, const int p, const double h, const double *__restrict__ A, const solver_memory *__restrict__ solver, const double *__restrict__ v, double *__restrict__ beta, double *__restrict__ work, cuDoubleComplex *__restrict__ work2)
Runs the arnoldi iteration to calculate the Krylov projection.
Definition: arnoldi.cuh:51

exp4cu::integrate
__device__ void integrate(const double t_start, const double t_end, const double pr, double *__restrict__ y, const mechanism_memory *__restrict__ mech, const solver_memory *__restrict__ solver)
4th-order exponential integrator function w/ adaptive Krylov subspace approximation ...
Definition: exp4.cu:71

ORD
#define ORD
order of embedded methods
Definition: exp4_props.cuh:24

solver_options.cuh
A file generated by Scons that specifies various options to the solvers.

EC_max_steps_exceeded
#define EC_max_steps_exceeded
Maximum number of internal timesteps exceeded.
Definition: radau2a_props.cuh:77

sc_norm
__device__ double sc_norm(const double *__restrict__ nums, const double *__restrict__ sc)
Perform weighted norm.
Definition: exponential_linear_algebra.cu:176

matvec_n_by_m_scale
__device__ void matvec_n_by_m_scale(const int m, const double scale, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
Definition: exponential_linear_algebra.cu:48

INDEX
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
Definition: gpu_macros.cuh:24

EPS
#define EPS
Definition: solver_interface.cuh:24

EC_h_plus_t_equals_h
#define EC_h_plus_t_equals_h
Timescale reduced such that t + h == t in floating point math.
Definition: radau2a_props.cuh:79

jacob.cuh
Contains a header definition for the CUDA van der Pol Jacobian evaluation.