accelerInt  v0.1
exp4.cu
Go to the documentation of this file.
1 
17 #include <stdlib.h>
18 #include <stdio.h>
19 #include <math.h>
20 #include <stdbool.h>
21 #include <cuComplex.h>
22 
23 //various mechanism/solver defns
24 //these should be included first
25 #include "header.cuh"
26 #include "solver_options.cuh"
27 #include "solver_props.cuh"
28 
29 #include "dydt.cuh"
30 #ifndef FINITE_DIFFERENCE
31  #include "jacob.cuh"
32 #else
33  #include "fd_jacob.cuh"
34 #endif
35 #include "arnoldi.cuh"
37 #include "solver_init.cuh"
38 #include "gpu_macros.cuh"
39 
40 #ifdef GENERATE_DOCS
41 namespace exp4cu {
42 #endif
43 
44 #ifdef LOG_KRYLOV_AND_STEPSIZES
45  extern __device__ double err_log[MAX_STEPS];
46  extern __device__ int m_log[MAX_STEPS];
47  extern __device__ int m1_log[MAX_STEPS];
48  extern __device__ int m2_log[MAX_STEPS];
49  extern __device__ double t_log[MAX_STEPS];
50  extern __device__ double h_log[MAX_STEPS];
51  extern __device__ bool reject_log[MAX_STEPS];
52  extern __device__ int num_integrator_steps;
53 #endif
54 #ifdef DIVERGENCE_TEST
55  extern __device__ int integrator_steps[DIVERGENCE_TEST];
56 #endif
57 
70 __device__
71 void integrate (const double t_start, const double t_end, const double pr,
72  double* __restrict__ y, const mechanism_memory* __restrict__ mech,
73  const solver_memory* __restrict__ solver) {
74 
75  //initial time
76 #ifdef CONST_TIME_STEP
77  double h = t_end - t_start;
78 #else
79  double h = fmin(1.0e-8, t_end - t_start);
80 #endif
81  double h_new;
82 
83  double err_old = 1.0;
84  double h_old = h;
85  double beta = 0;
86  double err = 0.0;
87 
88  bool reject = false;
89  int failures = 0;
90  int steps = 0;
91 
92  double t = t_start;
93 
94  //arrays
95  double * const __restrict__ sc = solver->sc;
96  double * const __restrict__ work1 = solver->work1;
97  double * const __restrict__ work2 = solver->work2;
98  double * const __restrict__ y1 = solver->work3;
99  cuDoubleComplex * const __restrict__ work4 = solver->work4;
100  double * const __restrict__ fy = mech->dy;
101  double * const __restrict__ A = mech->jac;
102  double * const __restrict__ Hm = solver->Hm;
103  double * const __restrict__ Vm = solver->Vm;
104  double * const __restrict__ phiHm = solver->phiHm;
105  double * const __restrict__ k1 = solver->k1;
106  double * const __restrict__ k2 = solver->k2;
107  double * const __restrict__ k3 = solver->k3;
108  double * const __restrict__ k4 = solver->k4;
109  double * const __restrict__ k5 = solver->k5;
110  double * const __restrict__ k6 = solver->k6;
111  double * const __restrict__ k7 = solver->k7;
112  int * const __restrict__ result = solver->result;
113 
114  // get scaling for weighted norm
115  scale_init(y, sc);
116 
117  //initial krylov subspace sizes
118  while (t < t_end) {
119 
120  //error checking
121  if (failures >= MAX_CONSECUTIVE_ERRORS)
122  {
123  result[T_ID] = EC_consecutive_steps;
124  return;
125  }
126  if (steps++ >= MAX_STEPS)
127  {
128  result[T_ID] = EC_max_steps_exceeded;
129  return;
130  }
131  if (t + h <= t)
132  {
133  result[T_ID] = EC_h_plus_t_equals_h;
134  return;
135  }
136 
137  if (!reject) {
138  dydt (t, pr, y, fy, mech);
139  #ifdef FINITE_DIFFERENCE
140  eval_jacob (t, pr, y, A, mech, work1, work2);
141  #else
142  eval_jacob (t, pr, y, A, mech);
143  #endif
144  }
145 
146  #ifdef DIVERGENCE_TEST
148  #endif
149  int m = arnoldi(1.0 / 3.0, P, h, A, solver, fy, &beta, work1, work4);
150  if (m + P >= STRIDE || m < 0)
151  {
152  //need to reduce h and try again
153  h /= 5.0;
154  failures++;
155  reject = true;
156  continue;
157  }
158 
159  // k1
160  //k1 is partially in the first column of phiHm
161  //k1 = beta * Vm * phiHm(:, 1)
162  matvec_n_by_m_scale(m, beta, Vm, phiHm, k1);
163 
164  // k2
165  //computing phi(2h * A)
166  matvec_m_by_m (m, phiHm, phiHm, work1);
167  //note: work2 will contain hm * phi * phi * e1 for later use
168  matvec_m_by_m (m, Hm, work1, work2);
169  matvec_n_by_m_scale_add(m, beta * (h / 6.0), Vm, work2, k2, k1);
170 
171  // k3
172  //use the stored hm * phi * phi * e1 to get phi(3h * A)
173  matvec_m_by_m (m, phiHm, work2, work1);
174  matvec_m_by_m (m, Hm, work1, work2);
175  matvec_n_by_m_scale_add_subtract(m, beta * (h * h / 27.0), Vm, work2, k3, k2, k1);
176 
177  // d4
178  #pragma unroll
179  for (int i = 0; i < NSP; ++i) {
180  // f4
181  work2[INDEX(i)] = h * ((-7.0 / 300.0) * k1[INDEX(i)] + (97.0 / 150.0) * k2[INDEX(i)] - (37.0 / 300.0) * k3[INDEX(i)]);
182 
183  k4[INDEX(i)] = y[INDEX(i)] + work2[INDEX(i)];
184  }
185 
186  dydt (t, pr, k4, work1, mech);
187  sparse_multiplier (A, work2, k4);
188 
189  #pragma unroll
190  for (int i = 0; i < NSP; ++i) {
191  k4[INDEX(i)] = work1[INDEX(i)] - fy[INDEX(i)] - k4[INDEX(i)];
192  }
193 
194  //do arnoldi
195  int m1 = arnoldi(1.0 / 3.0, P, h, A, solver, k4, &beta, work1, work4);
196  if (m1 + P >= STRIDE || m1 < 0)
197  {
198  //need to reduce h and try again
199  h /= 5.0;
200  failures++;
201  reject = true;
202  continue;
203  }
204  //k4 is partially in the m'th column of phiHm
205  matvec_n_by_m_scale(m1, beta, Vm, phiHm, k4);
206 
207  // k5
208  //computing phi(2h * A)
209  matvec_m_by_m (m1, phiHm, phiHm, work1);
210  //note: work2 will contain hm * phi * phi * e1 for later use
211  matvec_m_by_m (m1, Hm, work1, work2);
212  matvec_n_by_m_scale_add(m1, beta * (h / 6.0), Vm, work2, k5, k4);
213 
214  // k6
215  //use the stored hm * phi * phi * e1 to get phi(3h * A)
216  matvec_m_by_m (m1, phiHm, work2, work1);
217  matvec_m_by_m (m1, Hm, work1, work2);
218  matvec_n_by_m_scale_add_subtract(m1, beta * (h * h / 27.0), Vm, work2, k6, k5, k4);
219 
220  // k7
221  #pragma unroll
222  for (int i = 0; i < NSP; ++i) {
223  // f7
224  work2[INDEX(i)] = h * ((59.0 / 300.0) * k1[INDEX(i)] - (7.0 / 75.0) * k2[INDEX(i)] + (269.0 / 300.0) * k3[INDEX(i)] + (2.0 / 3.0) * (k4[INDEX(i)] + k5[INDEX(i)] + k6[INDEX(i)]));
225 
226  k7[INDEX(i)] = y[INDEX(i)] + work2[INDEX(i)];
227  }
228 
229  dydt (t, pr, k7, work1, mech);
230  sparse_multiplier (A, work2, k7);
231 
232  #pragma unroll
233  for (int i = 0; i < NSP; ++i) {
234  k7[INDEX(i)] = work1[INDEX(i)] - fy[INDEX(i)] - k7[INDEX(i)];
235  }
236 
237  int m2 = arnoldi(1.0 / 3.0, P, h, A, solver, k7, &beta, work1, work4);
238  if (m2 + P >= STRIDE || m2 < 0)
239  {
240  //need to reduce h and try again
241  h /= 5.0;
242  failures++;
243  reject = true;
244  continue;
245  }
246  //k7 is partially in the m'th column of phiHm
247  matvec_n_by_m_scale(m2, beta / (h / 3.0), Vm, &phiHm[GRID_DIM * m2 * STRIDE], k7);
248 
249  // y_n+1
250  #pragma unroll
251  for (int i = 0; i < NSP; ++i) {
252  y1[INDEX(i)] = y[INDEX(i)] + h * (k3[INDEX(i)] + k4[INDEX(i)] - (4.0 / 3.0) * k5[INDEX(i)] + k6[INDEX(i)] + (1.0 / 6.0) * k7[INDEX(i)]);
253  }
254 
255  scale (y, y1, work2);
256 
258  // calculate errors
260 
261  // error of embedded order 3 method
262  #pragma unroll
263  for (int i = 0; i < NSP; ++i) {
264  work1[INDEX(i)] = k3[INDEX(i)] - (2.0 / 3.0) * k5[INDEX(i)] + 0.5 * (k6[INDEX(i)] + k7[INDEX(i)] - k4[INDEX(i)]) - (y1[INDEX(i)] - y[INDEX(i)]) / h;
265  }
266  err = h * sc_norm(work1, work2);
267 
268  // error of embedded W method
269  #pragma unroll
270  for (int i = 0; i < NSP; ++i) {
271  work1[INDEX(i)] = -k1[INDEX(i)] + 2.0 * k2[INDEX(i)] - k4[INDEX(i)] + k7[INDEX(i)] - (y1[INDEX(i)] - y[INDEX(i)]) / h;
272  }
273  //double err_W = h * sc_norm(temp, sc);
274  err = fmax(EPS, fmin(err, h * sc_norm(work1, work2)));
275 
276  // classical step size calculation
277  h_new = pow(err, -1.0 / ORD);
278 
279 #ifndef CONST_TIME_STEP
280  failures = 0;
281  if (err <= 1.0) {
282  // update y, t and scale
283  #pragma unroll
284  for (int i = 0; i < NSP; ++i)
285  {
286  y[INDEX(i)] = y1[INDEX(i)];
287  sc[INDEX(i)] = work2[INDEX(i)];
288  }
289  t += h;
290 
291  // minimum of classical and Gustafsson step size prediction
292  h_new = fmin(h_new, (h / h_old) * pow((err_old / (err * err)), (1.0 / ORD)));
293 
294  // limit to 0.2 <= (h_new/8) <= 8.0
295  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
296 
297 
298  // store time step and error
299  err_old = fmax(1.0e-2, err);
300  h_old = h;
301 
302  // check if last step rejected
303  if (reject) {
304  reject = false;
305  h_new = fmin(h, h_new);
306  }
307  h = fmin(h_new, t_end - t);
308 
309  } else {
310 
311  // limit to 0.2 <= (h_new/8) <= 8.0
312  h_new = h * fmax(fmin(0.9 * h_new, 8.0), 0.2);
313  h_new = fmin(h_new, t_end - t);
314 
315  reject = true;
316  h = fmin(h, h_new);
317  }
318 #else
319  //constant time stepping
320  //update y & t
321  #pragma unroll
322  for (int i = 0; i < NSP; ++i)
323  {
324  y[INDEX(i)] = y1[INDEX(i)];
325  }
326  t += h;
327 #endif
328 
329  } // end while
330 
331  result[T_ID] = EC_success;
332 }
333 
334 #ifdef GENERATE_DOCS
335 }
336 #endif
#define T_ID
The global CUDA thread index.
Definition: gpu_macros.cuh:22
__device__ void scale_init(const double *__restrict__ y0, double *__restrict__ sc)
Get scaling for weighted norm for the initial timestep (used in krylov process)
Defines some simple macros to simplify GPU indexing.
#define P
max order of the phi functions (for error estimation)
Definition: exp4_props.cuh:22
#define EC_consecutive_steps
Maximum number of consecutive internal timesteps with error reached.
void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
An implementation of the RHS of the van der Pol equation.
Definition: dydt.c:22
void eval_jacob(const double t, const double pres, const double *cy, double *jac)
Computes a finite difference Jacobian of order FD_ORD of the RHS function dydt at the given pressure ...
Definition: fd_jacob.c:24
#define DIVERGENCE_TEST
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking.
Definition: solver_main.cu:44
#define NSP
The IVP system size.
Definition: header.cuh:20
#define GRID_DIM
The total number of threads in the Grid, provides an offset between vector entries.
Definition: gpu_macros.cuh:20
Header definitions for solver initialization routins.
#define STRIDE
the matrix dimensions
Implementation of the GPU arnoldi iteration methods.
#define MAX_STEPS
Maximum allowed internal timesteps per integration step.
Definition: exp4_props.cuh:30
Contains header definitions for the CUDA RHS function for the van der Pol example.
Structure containing memory needed for EXP4 algorithm.
Definition: exp4_props.cuh:37
Header definition of CUDA Finite Difference Jacobian.
#define EC_success
Successful time step.
__device__ void matvec_n_by_m_scale_add(const int m, const double scale, const double *__restrict__ A, const double *__restrict__ V, double *__restrict__ Av, const double *__restrict__ add)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
void sparse_multiplier(const double *A, const double *Vm, double *w)
Implements Jacobian \ vector multiplication in sparse (or unrolled) form.
Definitions of various linear algebra functions needed in the exponential integrators.
simple convenience file to include the correct solver properties file
Definition: exp4.cu:41
__device__ void matvec_n_by_m_scale_add_subtract(const int m, const double scale, const double *__restrict__ A, const double *V, double *__restrict__ Av, const double *__restrict__ add, const double *__restrict__ sub)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
__device__ void scale(const double *__restrict__ y0, const double *__restrict__ y1, double *__restrict__ sc)
Get scaling for weighted norm.
#define MAX_CONSECUTIVE_ERRORS
Number of consecutive errors on internal integration steps allowed before exit.
Definition: exp4_props.cuh:32
__device__ void matvec_m_by_m(const int m, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
Matrix-vector multiplication of a matrix sized MxM and a vector Mx1.
An example header file that defines system size, memory functions and other required methods for inte...
__device__ int arnoldi(const double scale, const int p, const double h, const double *__restrict__ A, const solver_memory *__restrict__ solver, const double *__restrict__ v, double *__restrict__ beta, double *__restrict__ work, cuDoubleComplex *__restrict__ work2)
Runs the arnoldi iteration to calculate the Krylov projection.
Definition: arnoldi.cuh:51
__device__ void integrate(const double t_start, const double t_end, const double pr, double *__restrict__ y, const mechanism_memory *__restrict__ mech, const solver_memory *__restrict__ solver)
4th-order exponential integrator function w/ adaptive Kyrlov subspace approximation ...
Definition: exp4.cu:71
#define ORD
order of embedded methods
Definition: exp4_props.cuh:24
A file generated by Scons that specifies various options to the solvers.
#define EC_max_steps_exceeded
Maximum number of internal timesteps exceeded.
__device__ double sc_norm(const double *__restrict__ nums, const double *__restrict__ sc)
Perform weighted norm.
__device__ void matvec_n_by_m_scale(const int m, const double scale, const double *const __restrict__ A, const double *const __restrict__ V, double *const __restrict__ Av)
Matrix-vector multiplication of a matrix sized NSPxM and a vector of size Mx1 scaled by a specified f...
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
Definition: gpu_macros.cuh:24
#define EPS
#define EC_h_plus_t_equals_h
Timescale reduced such that t + h == t in floating point math.
Contains a header definition for the CUDA van der Pol Jacobian evaluation.