accelerInt v0.1
exprb43.cu
1 
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <math.h>
21 #include <stdbool.h>
22 #include <cuComplex.h>
23 
24 //various mechanism/solver defns
25 //these should be included first
26 #include "header.cuh"
27 #include "solver_options.cuh"
28 #include "solver_props.cuh"
29 
30 #include "dydt.cuh"
31 #ifndef FINITE_DIFFERENCE
32 #include "jacob.cuh"
33 #else
34 #include "fd_jacob.cuh"
35 #endif
36 #include "exprb43_props.cuh"
37 #include "arnoldi.cuh"
38 #include "exponential_linear_algebra.cuh"
39 #include "solver_init.cuh"
40 #include "gpu_macros.cuh"
41 
42 #ifdef GENERATE_DOCS
43 namespace exprb43cu {
44 #endif
45 
46 #ifdef LOG_KRYLOV_AND_STEPSIZES
47  extern __device__ double err_log[MAX_STEPS];
48  extern __device__ int m_log[MAX_STEPS];
49  extern __device__ int m1_log[MAX_STEPS];
50  extern __device__ int m2_log[MAX_STEPS];
51  extern __device__ double t_log[MAX_STEPS];
52  extern __device__ double h_log[MAX_STEPS];
53  extern __device__ bool reject_log[MAX_STEPS];
54  extern __device__ int num_integrator_steps;
55 #endif
56 #ifdef DIVERGENCE_TEST
57  extern __device__ int integrator_steps[DIVERGENCE_TEST];
58 #endif
59 
61 
73 __device__ void integrate (const double t_start, const double t_end, const double pr,
74  double* __restrict__ y, const mechanism_memory* __restrict__ mech,
75  const solver_memory* __restrict__ solver) {
76 
77  //initial time
78 #ifdef CONST_TIME_STEP
79  double h = t_end - t_start;
80 #else
81  double h = fmin(1.0e-8, t_end - t_start);
82 #endif
83  double h_new;
84 
85  double err_old = 1.0;
86  double h_old = h;
87 
88  bool reject = false;
89 
90  double t = t_start;
91 
92  // get scaling for weighted norm
93  double * const __restrict__ sc = solver->sc;
94  scale_init(y, sc);
95 
96 #ifdef LOG_KRYLOV_AND_STEPSIZES
97  if (T_ID == 0)
98  {
99  num_integrator_steps = 0;
100  }
101 #endif
102 
103  double beta = 0;
104 
105  //arrays
106  double * const __restrict__ work1 = solver->work1;
107  double * const __restrict__ work2 = solver->work2;
108  double * const __restrict__ y1 = solver->work3;
109  cuDoubleComplex * const __restrict__ work4 = solver->work4;
110  double * const __restrict__ fy = mech->dy;
111  double * const __restrict__ A = mech->jac;
112  double * const __restrict__ Vm = solver->Vm;
113  double * const __restrict__ phiHm = solver->phiHm;
114  double * const __restrict__ savedActions = solver->savedActions;
115  double * const __restrict__ gy = solver->gy;
116  int * const __restrict__ result = solver->result;
117 
118  //vectors for scaling operations
119  double * in[5] = {0, 0, 0, savedActions, y};
120  double * out[3] = {0, 0, work1};
121  double scale_vec[3] = {0, 0, 0};
122 
123  double err = 0.0;
124  int failures = 0;
125  int steps = 0;
126  while (t < t_end) {
127 
128  //error checking
129  if (failures >= MAX_CONSECUTIVE_ERRORS)
130  {
131  result[T_ID] = EC_consecutive_steps;
132  return;
133  }
134  if (steps++ >= MAX_STEPS)
135  {
136  result[T_ID] = EC_max_steps_exceeded;
137  return;
138  }
139  if (t + h <= t)
140  {
141  result[T_ID] = EC_h_plus_t_equals_h;
142  return;
143  }
144 
145  if (!reject) {
146  dydt (t, pr, y, fy, mech);
147  #ifdef FINITE_DIFFERENCE
148  eval_jacob (t, pr, y, A, mech, work1, work2);
149  #else
150  eval_jacob (t, pr, y, A, mech);
151  #endif
152  //gy = fy - A * y
153  sparse_multiplier(A, y, gy);
154  #pragma unroll
155  for (int i = 0; i < NSP; ++i) {
156  gy[INDEX(i)] = fy[INDEX(i)] - gy[INDEX(i)];
157  }
158  }
159 
160  #ifdef DIVERGENCE_TEST
161  integrator_steps[T_ID]++;
162  #endif
163  int m = arnoldi(0.5, 1, h, A, solver, fy, &beta, work2, work4);
164  if (m + 1 >= STRIDE || m < 0)
165  {
166  //failure: too many krylov vectors required or singular matrix encountered
167  //need to reduce h and try again
168  h /= 5.0;
169  reject = true;
170  failures++;
171  continue;
172  }
173 
174  // Un2 to be stored in work1
175  //Un2 is partially in the mth column of phiHm
176  //Un2 = y + ** 0.5 * h * phi_1(0.5 * h * A)*fy **
177  //Un2 = y + ** beta * Vm * phiHm(:, m) **
178 
179  //store h * beta * Vm * phi_1(h * Hm) * e1 in savedActions
180  matvec_m_by_m_plusequal(m, phiHm, &phiHm[GRID_DIM * (m * STRIDE)], work1);
181  matvec_n_by_m_scale(m, beta, Vm, work1, savedActions);
182 
183  //store 0.5 * h * beta * Vm * phi_1(0.5 * h * Hm) * fy + y in work1
184  matvec_n_by_m_scale_add(m, beta, Vm, &phiHm[GRID_DIM * (m * STRIDE)], work1, y);
185  //work1 is now equal to Un2
186 
187  //next compute Dn2
188  //Dn2 = (F(Un2) - Jn * Un2) - gy
189 
190  dydt(t, pr, work1, &savedActions[GRID_DIM * NSP], mech);
191  sparse_multiplier(A, work1, work2);
192 
193  #pragma unroll
194  for (int i = 0; i < NSP; ++i) {
195  work1[INDEX(i)] = savedActions[INDEX(NSP + i)] - work2[INDEX(i)] - gy[INDEX(i)];
196  }
197  //work1 is now equal to Dn2
198 
199  //partially compute Un3 as:
200  //Un3 = y + ** h * phi_1(hA) * fy ** + h * phi_1(hA) * Dn2
201  //Un3 = y + ** h * beta * Vm * phiHm(:, m) **
202 
203  //now we need the action of the exponential on Dn2
204  int m1 = arnoldi(1.0, 4, h, A, solver, work1, &beta, work2, work4);
205  if (m1 + 4 >= STRIDE || m1 < 0)
206  {
207  //need to reduce h and try again
208  h /= 5.0;
209  reject = true;
210  failures++;
211  continue;
212  }
213 
214  //save Phi3(h * A) * Dn2 to savedActions[NSP]
215  //save Phi4(h * A) * Dn2 to savedActions[2 * NSP]
216  //add the action of phi_1 on Dn2 to y and h * phi_1(hA) * fy to get Un3
217  in[0] = &phiHm[GRID_DIM * ((m1 + 2) * STRIDE)];
218  in[1] = &phiHm[GRID_DIM * ((m1 + 3) * STRIDE)];
219  in[2] = &phiHm[GRID_DIM * ((m1) * STRIDE)];
220  out[0] = &savedActions[GRID_DIM * NSP];
221  out[1] = &savedActions[GRID_DIM * 2 * NSP];
222  scale_vec[0] = beta / (h * h);
223  scale_vec[1] = beta / (h * h * h);
224  scale_vec[2] = beta;
225  matvec_n_by_m_scale_special(m1, scale_vec, Vm, in, out);
226  //Un3 is now in work1
227 
228  //next compute Dn3
229  //Dn3 = F(Un3) - A * Un3 - gy
230  dydt(t, pr, work1, &savedActions[GRID_DIM * 3 * NSP], mech);
231  sparse_multiplier(A, work1, work2);
232 
233  #pragma unroll
234  for (int i = 0; i < NSP; ++i) {
235  work1[INDEX(i)] = savedActions[INDEX(3 * NSP + i)] - work2[INDEX(i)] - gy[INDEX(i)];
236  }
237  //work1 is now equal to Dn3
238 
239  //finally we need the action of the exponential on Dn3
240  int m2 = arnoldi(1.0, 4, h, A, solver, work1, &beta, work2, work4);
241  if (m2 + 4 >= STRIDE || m2 < 0)
242  {
243  //need to reduce h and try again
244  h /= 5.0;
245  reject = true;
246  failures++;
247  continue;
248  }
249  out[0] = &savedActions[GRID_DIM * 3 * NSP];
250  out[1] = &savedActions[GRID_DIM * 4 * NSP];
251  in[0] = &phiHm[GRID_DIM * (m2 + 2) * STRIDE];
252  in[1] = &phiHm[GRID_DIM * (m2 + 3) * STRIDE];
253  scale_vec[0] = beta / (h * h);
254  scale_vec[1] = beta / (h * h * h);
255  matvec_n_by_m_scale_special2(m2, scale_vec, Vm, in, out);
256 
257  //construct y1 and error vector
258  #pragma unroll
259  for (int i = 0; i < NSP; ++i) {
260  //y1 = y + h * phi1(h * A) * fy + h * sum(bi * Dni)
261  y1[INDEX(i)] = y[INDEX(i)] + savedActions[INDEX(i)] + 16.0 * savedActions[INDEX(NSP + i)] - 48.0 * savedActions[INDEX(2 * NSP + i)] + -2.0 * savedActions[INDEX(3 * NSP + i)] + 12.0 * savedActions[INDEX(4 * NSP + i)];
262  //error vec
263  work1[INDEX(i)] = 48.0 * savedActions[INDEX(2 * NSP + i)] - 12.0 * savedActions[INDEX(4 * NSP + i)];
264  }
265 
266 
267  //scale and find err
268  scale (y, y1, work2);
269  err = fmax(EPS, sc_norm(work1, work2));
270 
271  // classical step size calculation
272  h_new = pow(err, -1.0 / ORD);
273 
274 #ifdef LOG_KRYLOV_AND_STEPSIZES
275  if (T_ID == 0 && num_integrator_steps >= 0) {
276  err_log[num_integrator_steps] = err;
277  m_log[num_integrator_steps] = m;
278  m1_log[num_integrator_steps] = m1;
279  m2_log[num_integrator_steps] = m2;
280  t_log[num_integrator_steps] = t;
281  h_log[num_integrator_steps] = h;
282  reject_log[num_integrator_steps] = err > 1.0;
283  num_integrator_steps++;
284  if (num_integrator_steps >= MAX_STEPS)
285  {
286  printf("Number of steps out of bounds! Overwriting\n");
287  num_integrator_steps = -1;
288  }
289  }
290 #endif
291 
292 #ifndef CONST_TIME_STEP
293  failures = 0;
294  if (err <= 1.0) {
295  // update y, scale vector and t
296  #pragma unroll
297  for (int i = 0; i < NSP; ++i)
298  {
299  sc[INDEX(i)] = work2[INDEX(i)];
300  y[INDEX(i)] = y1[INDEX(i)];
301  }
302  t += h;
303 
304  // minimum of classical and Gustafsson step size prediction
305  h_new = fmin(h_new, (h / h_old) * pow((err_old / (err * err)), (1.0 / ORD)));
306 
307  // limit to 0.2 <= (h_new/h) <= 8.0
308  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
309 
310  // store time step and error
311  err_old = fmax(1.0e-2, err);
312  h_old = h;
313 
314  // check if last step rejected
315  if (reject) {
316  h_new = fmin(h, h_new);
317  reject = false;
318  }
319  h = fmin(h_new, t_end - t);
320 
321  } else {
322  // limit to 0.2 <= (h_new/h) <= 8.0
323  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
324  h_new = fmin(h_new, t_end - t);
325 
326  reject = true;
327  h = fmin(h, h_new);
328  }
329 #else
330  //constant time stepping
331  //update y & t
332  #pragma unroll
333  for (int i = 0; i < NSP; ++i)
334  {
335  y[INDEX(i)] = y1[INDEX(i)];
336  }
337  t += h;
338 #endif
339 
340  } // end while
341 
342  result[T_ID] = EC_success;
343 
344 }
345 
346 #ifdef GENERATE_DOCS
347 }
348 #endif
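For reference, the stage and update formulas that the comments in the loop above describe, collected in one place. This is a sketch reconstructed from those comments and the usual exprb43 formulation (not copied verbatim from the code), with J_n the Jacobian A, F the right-hand side dydt, y_n the current state y, and g_n(u) = F(u) - J_n u (gy above):

\[ U_{n2} = y_n + \tfrac{h}{2}\,\varphi_1\!\left(\tfrac{h}{2} J_n\right) F(y_n), \qquad D_{n2} = \left(F(U_{n2}) - J_n U_{n2}\right) - g_n(y_n) \]
\[ U_{n3} = y_n + h\,\varphi_1(h J_n) F(y_n) + h\,\varphi_1(h J_n) D_{n2}, \qquad D_{n3} = \left(F(U_{n3}) - J_n U_{n3}\right) - g_n(y_n) \]
\[ y_{n+1} = y_n + h\,\varphi_1(h J_n) F(y_n) + h\left(16\varphi_3(h J_n) - 48\varphi_4(h J_n)\right) D_{n2} + h\left(-2\varphi_3(h J_n) + 12\varphi_4(h J_n)\right) D_{n3} \]

Each phi-function action is approximated from the Krylov projection returned by arnoldi, i.e. phi_k(h*J_n)*v ~ beta * Vm * phi_k(h*Hm) * e1 with beta = ||v||, which is what the matvec_n_by_m_scale* calls assemble from Vm and phiHm. The embedded lower-order solution drops the phi_4 terms, so the error vector formed above (48*savedActions[2*NSP + i] - 12*savedActions[4*NSP + i] per component) is, up to sign, that phi_4 contribution; its weighted norm sc_norm yields err.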
Referenced declarations and included headers:

#define T_ID
    The global CUDA thread index. (gpu_macros.cuh:22)

__device__ void integrate(const double t_start, const double t_end, const double pr, double *__restrict__ y, const mechanism_memory *__restrict__ mech, const solver_memory *__restrict__ solver)
    (exprb43.cu:73)

__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
    Get scaling for weighted norm for the initial timestep (used in the Krylov process).

gpu_macros.cuh
    Defines some simple macros to simplify GPU indexing.

#define EC_consecutive_steps
    Maximum number of consecutive internal timesteps with error reached.

void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
    An implementation of the RHS of the van der Pol equation. (dydt.c:22)

void eval_jacob(const double t, const double pres, const double *cy, double *jac)
    Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ... (fd_jacob.c:24)

#define DIVERGENCE_TEST

__device__ int integrator_steps[DIVERGENCE_TEST]
    If DIVERGENCE_TEST is defined, this creates a device array for tracking. (solver_main.cu:44)

#define NSP
    The IVP system size. (header.cuh:20)

#define GRID_DIM
    The total number of threads in the grid; provides an offset between vector entries. (gpu_macros.cuh:20)

solver_init.cuh
    Header definitions for solver initialization routines.

#define STRIDE
    The matrix dimensions.

arnoldi.cuh
    Implementation of the GPU Arnoldi iteration methods.

#define MAX_STEPS
    Maximum allowed internal timesteps per integration step. (exp4_props.cuh:30)

dydt.cuh
    Contains header definitions for the CUDA RHS function for the van der Pol example.

exprb43_props.cuh
    Various macros controlling behaviour of the RB43 algorithm.

__device__ void matvec_n_by_m_scale_special2(const int m, const double *__restrict__ scale, const double *__restrict__ A, double *const __restrict__ *V, double *__restrict__ *Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

fd_jacob.cuh
    Header definition of CUDA Finite Difference Jacobian.

#define EC_success
    Successful time step.

__device__ void matvec_n_by_m_scale_add(const int m, const double scale, const double *__restrict__ A, const double *__restrict__ V, double *__restrict__ Av, const double *__restrict__ add)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

void sparse_multiplier(const double *A, const double *Vm, double *w)
    Implements Jacobian-vector multiplication in sparse (or unrolled) form.

exponential_linear_algebra.cuh
    Definitions of various linear algebra functions needed in the exponential integrators.

solver_props.cuh
    Simple convenience file to include the correct solver properties file.

__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
    Get scaling for weighted norm.

#define MAX_CONSECUTIVE_ERRORS
    Number of consecutive errors on internal integration steps allowed before exit. (exp4_props.cuh:32)

__device__ void matvec_m_by_m_plusequal(const int m, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
    Matrix-vector plus-equals for a matrix of size MxM and a vector of size Mx1; that is, it returns (A + I) * v.

header.cuh
    An example header file that defines system size, memory functions and other required methods for inte...

__device__ int arnoldi(const double scale, const int p, const double h, const double *__restrict__ A, const solver_memory *__restrict__ solver, const double *__restrict__ v, double *__restrict__ beta, double *__restrict__ work, cuDoubleComplex *__restrict__ work2)
    Runs the Arnoldi iteration to calculate the Krylov projection. (arnoldi.cuh:51)

#define ORD
    Order of the embedded methods. (exp4_props.cuh:24)

solver_options.cuh
    A file generated by SCons that specifies various options to the solvers.

__device__ void matvec_n_by_m_scale_special(const int m, const double *__restrict__ scale, const double *__restrict__ A, double *const __restrict__ *V, double *__restrict__ *Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

#define EC_max_steps_exceeded
    Maximum number of internal timesteps exceeded.

__device__ double sc_norm(const double *__restrict__ nums, const double *__restrict__ sc)
    Perform weighted norm.

__device__ void matvec_n_by_m_scale(const int m, const double scale, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

#define INDEX(i)
    Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID... (gpu_macros.cuh:24)

#define EPS

#define EC_h_plus_t_equals_h
    Timescale reduced such that t + h == t in floating point math.

jacob.cuh
    Contains a header definition for the CUDA van der Pol Jacobian evaluation.
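The accept/reject logic at the bottom of the while loop (the non-CONST_TIME_STEP branch) can also be read in isolation. Below is a minimal, self-contained host-side C++ sketch of that step-size controller, mirroring the listing; StepState, update_step_size and kOrd are illustrative stand-ins (the real code uses ORD from the solver properties and keeps these quantities as locals of integrate), not part of accelerInt.

#include <algorithm>
#include <cmath>

// Illustrative stand-in for ORD (order of the embedded error estimator);
// the actual value comes from the solver properties header.
constexpr double kOrd = 3.0;

struct StepState {
    double h;        // current step size
    double h_old;    // previously accepted step size
    double err_old;  // previously accepted (clipped) error norm
    bool reject;     // was the last attempt rejected?
    explicit StepState(double h0) : h(h0), h_old(h0), err_old(1.0), reject(false) {}
};

// One controller update after an attempted step with weighted error norm err.
// On acceptance (err <= 1) it advances t and returns true; otherwise it
// shrinks s.h and returns false so the caller can retry the same step.
bool update_step_size(StepState& s, double err, double& t, double t_end) {
    double h_new = std::pow(err, -1.0 / kOrd);  // classical estimate

    if (err <= 1.0) {
        t += s.h;  // accept (state-vector update omitted in this sketch)
        // Gustafsson predictive estimate; keep the more conservative of the two
        h_new = std::min(h_new, (s.h / s.h_old) *
                                    std::pow(s.err_old / (err * err), 1.0 / kOrd));
        // limit the change to 0.2 <= h_new/h <= 8.0, with a 0.9 safety factor
        h_new = s.h * std::max(std::min(0.9 * h_new, 8.0), 0.2);
        s.err_old = std::max(1.0e-2, err);
        s.h_old = s.h;
        if (s.reject) {  // never grow the step immediately after a rejection
            h_new = std::min(s.h, h_new);
            s.reject = false;
        }
        s.h = std::min(h_new, t_end - t);
        return true;
    }

    // rejected: reduce the step (never increase it) and retry
    h_new = s.h * std::max(std::min(0.9 * h_new, 8.0), 0.2);
    h_new = std::min(h_new, t_end - t);
    s.reject = true;
    s.h = std::min(s.h, h_new);
    return false;
}

A driver would compute err = fmax(EPS, sc_norm(...)) for the attempted step, call update_step_size, and either continue to the next step or retry the same step with the reduced s.h when false is returned.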