accelerInt v0.1
exprb43.cu
1 
18 #include <stdlib.h>
19 #include <stdio.h>
20 #include <math.h>
21 #include <stdbool.h>
22 #include <cuComplex.h>
23 
24 //various mechanism/solver defns
25 //these should be included first
26 #include "header.cuh"
27 #include "solver_options.cuh"
28 #include "solver_props.cuh"
29 
30 #include "dydt.cuh"
31 #ifndef FINITE_DIFFERENCE
32 #include "jacob.cuh"
33 #else
34 #include "fd_jacob.cuh"
35 #endif
36 #include "exprb43_props.cuh"
37 #include "arnoldi.cuh"
38 #include "exponential_linear_algebra.cuh"
39 #include "solver_init.cuh"
40 #include "gpu_macros.cuh"
41 
42 #ifdef GENERATE_DOCS
43 namespace exprb43cu {
44 #endif
45 
46 #ifdef LOG_KRYLOV_AND_STEPSIZES
47  extern __device__ double err_log[MAX_STEPS];
48  extern __device__ int m_log[MAX_STEPS];
49  extern __device__ int m1_log[MAX_STEPS];
50  extern __device__ int m2_log[MAX_STEPS];
51  extern __device__ double t_log[MAX_STEPS];
52  extern __device__ double h_log[MAX_STEPS];
53  extern __device__ bool reject_log[MAX_STEPS];
54  extern __device__ int num_integrator_steps;
55 #endif
56 #ifdef DIVERGENCE_TEST
57  extern __device__ int integrator_steps[DIVERGENCE_TEST];
58 #endif
59 
61 
73 __device__ void integrate (const double t_start, const double t_end, const double pr,
74  double* __restrict__ y, const mechanism_memory* __restrict__ mech,
75  const solver_memory* __restrict__ solver) {
76 
77  //initial time
78 #ifdef CONST_TIME_STEP
79  double h = t_end - t_start;
80 #else
81  double h = fmin(1.0e-8, t_end - t_start);
82 #endif
83  double h_new;
84 
85  double err_old = 1.0;
86  double h_old = h;
87 
88  bool reject = false;
89 
90  double t = t_start;
91 
92  // get scaling for weighted norm
93  double * const __restrict__ sc = solver->sc;
94  scale_init(y, sc);
95 
96 #ifdef LOG_KRYLOV_AND_STEPSIZES
97  if (T_ID == 0)
98  {
99  num_integrator_steps = 0;
100  }
101 #endif
102 
103  double beta = 0;
104 
105  //arrays
106  double * const __restrict__ work1 = solver->work1;
107  double * const __restrict__ work2 = solver->work2;
108  double * const __restrict__ y1 = solver->work3;
109  cuDoubleComplex * const __restrict__ work4 = solver->work4;
110  double * const __restrict__ fy = mech->dy;
111  double * const __restrict__ A = mech->jac;
112  double * const __restrict__ Vm = solver->Vm;
113  double * const __restrict__ phiHm = solver->phiHm;
114  double * const __restrict__ savedActions = solver->savedActions;
115  double * const __restrict__ gy = solver->gy;
116  int * const __restrict__ result = solver->result;
117 
118  //vectors for scaling operations
119  double * in[5] = {0, 0, 0, savedActions, y};
120  double * out[3] = {0, 0, work1};
121  double scale_vec[3] = {0, 0, 0};
122 
123  double err = 0.0;
124  int failures = 0;
125  int steps = 0;
126  while (t < t_end) {
127 
128  //error checking
129  if (failures >= MAX_CONSECUTIVE_ERRORS)
130  {
131  result[T_ID] = EC_consecutive_steps;
132  return;
133  }
134  if (steps++ >= MAX_STEPS)
135  {
136  result[T_ID] = EC_max_steps_exceeded;
137  return;
138  }
139  if (t + h <= t)
140  {
141  result[T_ID] = EC_h_plus_t_equals_h;
142  return;
143  }
144 
145  if (!reject) {
146  dydt (t, pr, y, fy, mech);
147  #ifdef FINITE_DIFFERENCE
148  eval_jacob (t, pr, y, A, mech, work1, work2);
149  #else
150  eval_jacob (t, pr, y, A, mech);
151  #endif
152  //gy = fy - A * y
153  sparse_multiplier(A, y, gy);
154  #pragma unroll
155  for (int i = 0; i < NSP; ++i) {
156  gy[INDEX(i)] = fy[INDEX(i)] - gy[INDEX(i)];
157  }
158  }
159 
160  #ifdef DIVERGENCE_TEST
161  integrator_steps[T_ID]++;
162  #endif
163  int m = arnoldi(0.5, 1, h, A, solver, fy, &beta, work2, work4);
164  if (m + 1 >= STRIDE || m < 0)
165  {
166  //failure: too many krylov vectors required or singular matrix encountered
167  //need to reduce h and try again
168  h /= 5.0;
169  reject = true;
170  failures++;
171  continue;
172  }
173 
174  // Un2 to be stored in work1
175  //Un2 is partially in the mth column of phiHm
176  //Un2 = y + ** 0.5 * h * phi_1(0.5 * h * A)*fy **
177  //Un2 = y + ** beta * Vm * phiHm(:, m) **
178 
179  //store h * beta * Vm * phi_1(h * Hm) * e1 in savedActions
180  matvec_m_by_m_plusequal(m, phiHm, &phiHm[GRID_DIM * (m * STRIDE)], work1);
181  matvec_n_by_m_scale(m, beta, Vm, work1, savedActions);
182 
183  //store 0.5 * h * beta * Vm * phi_1(0.5 * h * Hm) * fy + y in work1
184  matvec_n_by_m_scale_add(m, beta, Vm, &phiHm[GRID_DIM * (m * STRIDE)], work1, y);
185  //work1 is now equal to Un2
186 
187  //next compute Dn2
188  //Dn2 = (F(Un2) - Jn * Un2) - gy
189 
190  dydt(t, pr, work1, &savedActions[GRID_DIM * NSP], mech);
191  sparse_multiplier(A, work1, work2);
192 
193  #pragma unroll
194  for (int i = 0; i < NSP; ++i) {
195  work1[INDEX(i)] = savedActions[INDEX(NSP + i)] - work2[INDEX(i)] - gy[INDEX(i)];
196  }
197  //work1 is now equal to Dn2
198 
199  //partially compute Un3 as:
200  //Un3 = y + ** h * phi_1(hA) * fy ** + h * phi_1(hA) * Dn2
201  //Un3 = y + ** h * beta * Vm * phiHm(:, m) **
202 
203  //now we need the action of the exponential on Dn2
204  int m1 = arnoldi(1.0, 4, h, A, solver, work1, &beta, work2, work4);
205  if (m1 + 4 >= STRIDE || m1 < 0)
206  {
207  //need to reduce h and try again
208  h /= 5.0;
209  reject = true;
210  failures++;
211  continue;
212  }
213 
214  //save Phi3(h * A) * Dn2 to savedActions[NSP]
215  //save Phi4(h * A) * Dn2 to savedActions[2 * NSP]
216  //add the action of phi_1 on Dn2 to y and h * phi_1(hA) * fy to get Un3
217  in[0] = &phiHm[GRID_DIM * ((m1 + 2) * STRIDE)];
218  in[1] = &phiHm[GRID_DIM * ((m1 + 3) * STRIDE)];
219  in[2] = &phiHm[GRID_DIM * ((m1) * STRIDE)];
220  out[0] = &savedActions[GRID_DIM * NSP];
221  out[1] = &savedActions[GRID_DIM * 2 * NSP];
222  scale_vec[0] = beta / (h * h);
223  scale_vec[1] = beta / (h * h * h);
224  scale_vec[2] = beta;
225  matvec_n_by_m_scale_special(m1, scale_vec, Vm, in, out);
226  //Un3 is now in work1
227 
228  //next compute Dn3
229  //Dn3 = F(Un3) - A * Un3 - gy
230  dydt(t, pr, work1, &savedActions[GRID_DIM * 3 * NSP], mech);
231  sparse_multiplier(A, work1, work2);
232 
233  #pragma unroll
234  for (int i = 0; i < NSP; ++i) {
235  work1[INDEX(i)] = savedActions[INDEX(3 * NSP + i)] - work2[INDEX(i)] - gy[INDEX(i)];
236  }
237  //work1 is now equal to Dn3
238 
239  //finally we need the action of the exponential on Dn3
240  int m2 = arnoldi(1.0, 4, h, A, solver, work1, &beta, work2, work4);
241  if (m2 + 4 >= STRIDE || m2 < 0)
242  {
243  //need to reduce h and try again
244  h /= 5.0;
245  reject = true;
246  failures++;
247  continue;
248  }
249  out[0] = &savedActions[GRID_DIM * 3 * NSP];
250  out[1] = &savedActions[GRID_DIM * 4 * NSP];
251  in[0] = &phiHm[GRID_DIM * (m2 + 2) * STRIDE];
252  in[1] = &phiHm[GRID_DIM * (m2 + 3) * STRIDE];
253  scale_vec[0] = beta / (h * h);
254  scale_vec[1] = beta / (h * h * h);
255  matvec_n_by_m_scale_special2(m2, scale_vec, Vm, in, out);
256 
257  //construct y1 and error vector
258  #pragma unroll
259  for (int i = 0; i < NSP; ++i) {
260  //y1 = y + h * phi1(h * A) * fy + h * sum(bi * Dni)
261  y1[INDEX(i)] = y[INDEX(i)] + savedActions[INDEX(i)] + 16.0 * savedActions[INDEX(NSP + i)] - 48.0 * savedActions[INDEX(2 * NSP + i)] + -2.0 * savedActions[INDEX(3 * NSP + i)] + 12.0 * savedActions[INDEX(4 * NSP + i)];
262  //error vec
263  work1[INDEX(i)] = 48.0 * savedActions[INDEX(2 * NSP + i)] - 12.0 * savedActions[INDEX(4 * NSP + i)];
264  }
265 
266 
267  //scale and find err
268  scale (y, y1, work2);
269  err = fmax(EPS, sc_norm(work1, work2));
270 
271  // classical step size calculation
272  h_new = pow(err, -1.0 / ORD);
273 
274 #ifdef LOG_KRYLOV_AND_STEPSIZES
275  if (T_ID == 0 && num_integrator_steps >= 0) {
276  err_log[num_integrator_steps] = err;
277  m_log[num_integrator_steps] = m;
278  m1_log[num_integrator_steps] = m1;
279  m2_log[num_integrator_steps] = m2;
280  t_log[num_integrator_steps] = t;
281  h_log[num_integrator_steps] = h;
282  reject_log[num_integrator_steps] = err > 1.0;
283  num_integrator_steps++;
284  if (num_integrator_steps >= MAX_STEPS)
285  {
286  printf("Number of steps out of bounds! Overwriting\n");
287  num_integrator_steps = -1;
288  }
289  }
290 #endif
291 
292 #ifndef CONST_TIME_STEP
293  failures = 0;
294  if (err <= 1.0) {
295  // update y, scale vector and t
296  #pragma unroll
297  for (int i = 0; i < NSP; ++i)
298  {
299  sc[INDEX(i)] = work2[INDEX(i)];
300  y[INDEX(i)] = y1[INDEX(i)];
301  }
302  t += h;
303 
304  // minimum of classical and Gustafsson step size prediction
305  h_new = fmin(h_new, (h / h_old) * pow((err_old / (err * err)), (1.0 / ORD)));
306 
307  // limit to 0.2 <= (h_new/h) <= 8.0
308  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
309 
310  // store time step and error
311  err_old = fmax(1.0e-2, err);
312  h_old = h;
313 
314  // check if last step rejected
315  if (reject) {
316  h_new = fmin(h, h_new);
317  reject = false;
318  }
319  h = fmin(h_new, t_end - t);
320 
321  } else {
322  // limit to 0.2 <= (h_new/h) <= 8.0
323  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
324  h_new = fmin(h_new, t_end - t);
325 
326  reject = true;
327  h = fmin(h, h_new);
328  }
329 #else
330  //constant time stepping
331  //update y & t
332  #pragma unroll
333  for (int i = 0; i < NSP; ++i)
334  {
335  y[INDEX(i)] = y1[INDEX(i)];
336  }
337  t += h;
338 #endif
339 
340  } // end while
341 
342  result[T_ID] = EC_success;
343 
344 }
345 
346 #ifdef GENERATE_DOCS
347 }
348 #endif
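For reference, the stage and update formulas that the comments in the loop above describe, collected in one place. This is a sketch reconstructed from those comments and the usual exprb43 formulation (not copied verbatim from the code), with J_n the Jacobian A, F the right-hand side dydt, y_n the current state y, and g_n(u) = F(u) - J_n u (gy above):

\[ U_{n2} = y_n + \tfrac{h}{2}\,\varphi_1\!\left(\tfrac{h}{2} J_n\right) F(y_n), \qquad D_{n2} = \left(F(U_{n2}) - J_n U_{n2}\right) - g_n(y_n) \]
\[ U_{n3} = y_n + h\,\varphi_1(h J_n) F(y_n) + h\,\varphi_1(h J_n) D_{n2}, \qquad D_{n3} = \left(F(U_{n3}) - J_n U_{n3}\right) - g_n(y_n) \]
\[ y_{n+1} = y_n + h\,\varphi_1(h J_n) F(y_n) + h\left(16\varphi_3(h J_n) - 48\varphi_4(h J_n)\right) D_{n2} + h\left(-2\varphi_3(h J_n) + 12\varphi_4(h J_n)\right) D_{n3} \]

Each phi-function action is approximated from the Krylov projection returned by arnoldi, i.e. phi_k(h*J_n)*v ~ beta * Vm * phi_k(h*Hm) * e1 with beta = ||v||, which is what the matvec_n_by_m_scale* calls assemble from Vm and phiHm. The embedded lower-order solution drops the phi_4 terms, so the error vector formed above (48*savedActions[2*NSP + i] - 12*savedActions[4*NSP + i] per component) is, up to sign, that phi_4 contribution; its weighted norm sc_norm yields err.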
Referenced declarations and included headers:

#define T_ID
    The global CUDA thread index. (gpu_macros.cuh:22)

__device__ void integrate(const double t_start, const double t_end, const double pr, double *__restrict__ y, const mechanism_memory *__restrict__ mech, const solver_memory *__restrict__ solver)
    (exprb43.cu:73)

__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
    Get scaling for weighted norm for the initial timestep (used in the Krylov process).

gpu_macros.cuh
    Defines some simple macros to simplify GPU indexing.

#define EC_consecutive_steps
    Maximum number of consecutive internal timesteps with error reached.

void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
    An implementation of the RHS of the van der Pol equation. (dydt.c:22)

void eval_jacob(const double t, const double pres, const double *cy, double *jac)
    Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ... (fd_jacob.c:24)

#define DIVERGENCE_TEST

__device__ int integrator_steps[DIVERGENCE_TEST]
    If DIVERGENCE_TEST is defined, this creates a device array for tracking. (solver_main.cu:44)

#define NSP
    The IVP system size. (header.cuh:20)

#define GRID_DIM
    The total number of threads in the grid; provides an offset between vector entries. (gpu_macros.cuh:20)

solver_init.cuh
    Header definitions for solver initialization routines.

#define STRIDE
    The matrix dimensions.

arnoldi.cuh
    Implementation of the GPU Arnoldi iteration methods.

#define MAX_STEPS
    Maximum allowed internal timesteps per integration step. (exp4_props.cuh:30)

dydt.cuh
    Contains header definitions for the CUDA RHS function for the van der Pol example.

exprb43_props.cuh
    Various macros controlling behaviour of the RB43 algorithm.

__device__ void matvec_n_by_m_scale_special2(const int m, const double *__restrict__ scale, const double *__restrict__ A, double *const __restrict__ *V, double *__restrict__ *Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

fd_jacob.cuh
    Header definition of CUDA Finite Difference Jacobian.

#define EC_success
    Successful time step.

__device__ void matvec_n_by_m_scale_add(const int m, const double scale, const double *__restrict__ A, const double *__restrict__ V, double *__restrict__ Av, const double *__restrict__ add)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

void sparse_multiplier(const double *A, const double *Vm, double *w)
    Implements Jacobian-vector multiplication in sparse (or unrolled) form.

exponential_linear_algebra.cuh
    Definitions of various linear algebra functions needed in the exponential integrators.

solver_props.cuh
    Simple convenience file to include the correct solver properties file.

__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
    Get scaling for weighted norm.

#define MAX_CONSECUTIVE_ERRORS
    Number of consecutive errors on internal integration steps allowed before exit. (exp4_props.cuh:32)

__device__ void matvec_m_by_m_plusequal(const int m, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
    Matrix-vector plus-equals for a matrix of size MxM and a vector of size Mx1; that is, it returns (A + I) * v.

header.cuh
    An example header file that defines system size, memory functions and other required methods for inte...

__device__ int arnoldi(const double scale, const int p, const double h, const double *__restrict__ A, const solver_memory *__restrict__ solver, const double *__restrict__ v, double *__restrict__ beta, double *__restrict__ work, cuDoubleComplex *__restrict__ work2)
    Runs the Arnoldi iteration to calculate the Krylov projection. (arnoldi.cuh:51)

#define ORD
    Order of the embedded methods. (exp4_props.cuh:24)

solver_options.cuh
    A file generated by SCons that specifies various options to the solvers.

__device__ void matvec_n_by_m_scale_special(const int m, const double *__restrict__ scale, const double *__restrict__ A, double *const __restrict__ *V, double *__restrict__ *Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

#define EC_max_steps_exceeded
    Maximum number of internal timesteps exceeded.

__device__ double sc_norm(const double *__restrict__ nums, const double *__restrict__ sc)
    Perform weighted norm.

__device__ void matvec_n_by_m_scale(const int m, const double scale, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
    Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...

#define INDEX(i)
    Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID... (gpu_macros.cuh:24)

#define EPS

#define EC_h_plus_t_equals_h
    Timescale reduced such that t + h == t in floating point math.

jacob.cuh
    Contains a header definition for the CUDA van der Pol Jacobian evaluation.
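The accept/reject logic at the bottom of the while loop (the non-CONST_TIME_STEP branch) can also be read in isolation. Below is a minimal, self-contained host-side C++ sketch of that step-size controller, mirroring the listing; StepState, update_step_size and kOrd are illustrative stand-ins (the real code uses ORD from the solver properties and keeps these quantities as locals of integrate), not part of accelerInt.

#include <algorithm>
#include <cmath>

// Illustrative stand-in for ORD (order of the embedded error estimator);
// the actual value comes from the solver properties header.
constexpr double kOrd = 3.0;

struct StepState {
    double h;        // current step size
    double h_old;    // previously accepted step size
    double err_old;  // previously accepted (clipped) error norm
    bool reject;     // was the last attempt rejected?
    explicit StepState(double h0) : h(h0), h_old(h0), err_old(1.0), reject(false) {}
};

// One controller update after an attempted step with weighted error norm err.
// On acceptance (err <= 1) it advances t and returns true; otherwise it
// shrinks s.h and returns false so the caller can retry the same step.
bool update_step_size(StepState& s, double err, double& t, double t_end) {
    double h_new = std::pow(err, -1.0 / kOrd);  // classical estimate

    if (err <= 1.0) {
        t += s.h;  // accept (state-vector update omitted in this sketch)
        // Gustafsson predictive estimate; keep the more conservative of the two
        h_new = std::min(h_new, (s.h / s.h_old) *
                                    std::pow(s.err_old / (err * err), 1.0 / kOrd));
        // limit the change to 0.2 <= h_new/h <= 8.0, with a 0.9 safety factor
        h_new = s.h * std::max(std::min(0.9 * h_new, 8.0), 0.2);
        s.err_old = std::max(1.0e-2, err);
        s.h_old = s.h;
        if (s.reject) {  // never grow the step immediately after a rejection
            h_new = std::min(s.h, h_new);
            s.reject = false;
        }
        s.h = std::min(h_new, t_end - t);
        return true;
    }

    // rejected: reduce the step (never increase it) and retry
    h_new = s.h * std::max(std::min(0.9 * h_new, 8.0), 0.2);
    h_new = std::min(h_new, t_end - t);
    s.reject = true;
    s.h = std::min(s.h, h_new);
    return false;
}

A driver would compute err = fmax(EPS, sc_norm(...)) for the attempted step, call update_step_size, and either continue to the next step or retry the same step with the reduced s.h when false is returned.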