accelerInt  v0.1
rkc.cu
Go to the documentation of this file.
1 #include "rkc.cuh"
2 #include "dydt.cuh"
3 
4 #ifdef GENERATE_DOCS
5 namespace rkc_cu {
6 #endif
7 
8 //#define SHARED
9 
11 
12 __device__
13 Real rkc_spec_rad (const Real t, const Real pr, const Real hmax, const Real* y,
14  const Real* F, Real* v, Real* Fv,
15  mechanism_memory const * const __restrict__ mech) {
28  const int itmax = 50;
29  Real small = ONE / hmax;
30 
31  //for (int i = 0; i < NSP; ++i) {
32  // v[i] = F[i];
33  //}
34 
35  Real nrm1 = ZERO;
36  Real nrm2 = ZERO;
37  for (int i = 0; i < NSP; ++i) {
38  nrm1 += (y[INDEX(i)] * y[INDEX(i)]);
39  nrm2 += (v[INDEX(i)] * v[INDEX(i)]);
40  }
41  nrm1 = sqrt(nrm1);
42  nrm2 = sqrt(nrm2);
43 
44  Real dynrm;
45  if ((nrm1 != ZERO) && (nrm2 != ZERO)) {
46  dynrm = nrm1 * sqrt(UROUND);
47  for (int i = 0; i < NSP; ++i) {
48  v[INDEX(i)] = y[INDEX(i)] + v[INDEX(i)] * (dynrm / nrm2);
49  }
50  } else if (nrm1 != ZERO) {
51  dynrm = nrm1 * sqrt(UROUND);
52  for (int i = 0; i < NSP; ++i) {
53  v[INDEX(i)] = y[INDEX(i)] * (ONE + sqrt(UROUND));
54  }
55  } else if (nrm2 != ZERO) {
56  dynrm = UROUND;
57  for (int i = 0; i < NSP; ++i) {
58  v[INDEX(i)] *= (dynrm / nrm2);
59  }
60  } else {
61  dynrm = UROUND;
62  for (int i = 0; i < NSP; ++i) {
63  v[INDEX(i)] = UROUND;
64  }
65  }
66 
67  // now iterate using nonlinear power method
68  Real sigma = ZERO;
69  for (int iter = 1; iter <= itmax; ++iter) {
70 
71  dydt (t, pr, v, Fv, mech);
72 
73  nrm1 = ZERO;
74  for (int i = 0; i < NSP; ++i) {
75  nrm1 += ((Fv[INDEX(i)] - F[INDEX(i)]) * (Fv[INDEX(i)] - F[INDEX(i)]));
76  }
77  nrm1 = sqrt(nrm1);
78  nrm2 = sigma;
79  sigma = nrm1 / dynrm;
80 
81  nrm2 = fabs(sigma - nrm2) / sigma;
82  if ((iter >= 2) && (fabs(sigma - nrm2) <= (fmax(sigma, small) * P01))) {
83  for (int i = 0; i < NSP; ++i) {
84  v[INDEX(i)] -= y[INDEX(i)];
85  }
86  return (ONEP2 * sigma);
87  }
88 
89  if (nrm1 != ZERO) {
90  for (int i = 0; i < NSP; ++i) {
91  v[INDEX(i)] = y[INDEX(i)] + ((Fv[INDEX(i)] - F[INDEX(i)]) * (dynrm / nrm1));
92  }
93  } else {
94  int ind = (iter % NSP);
95  v[INDEX(ind)] = y[INDEX(ind)] - (v[INDEX(ind)] - y[INDEX(ind)]);
96  }
97  }
98  return (ONEP2 * sigma);
99 }
100 
102 
103 __device__
104 void rkc_step (const Real t, const Real pr, const Real h, const Real* y_0,
105  const Real* F_0, const int s, Real* y_j,
106  Real* y_jm1, Real* y_jm2,
107  mechanism_memory const * const __restrict__ mech) {
120  const Real w0 = ONE + TWO / (13.0 * (Real)(s * s));
121  Real temp1 = (w0 * w0) - ONE;
122  Real temp2 = sqrt(temp1);
123  const Real arg = (Real)(s) * log(w0 + temp2);
124  const Real w1 = sinh(arg) * temp1 / (cosh(arg) * (Real)(s) * temp2 - w0 * sinh(arg));
125 
126  Real b_jm1 = ONE / (FOUR * (w0 * w0));
127  Real b_jm2 = b_jm1;
128 
129  //Real y_jm1[NSP];
130  //Real y_jm2[NSP];
131 
132  // calculate y_1
133  Real mu_t = w1 * b_jm1;
134  for (int i = 0; i < NSP; ++i) {
135  y_jm2[INDEX(i)] = y_0[INDEX(i)];
136  y_jm1[INDEX(i)] = y_0[INDEX(i)] + (mu_t * h * F_0[INDEX(i)]);
137  }
138 
139  Real c_jm2 = ZERO;
140  Real c_jm1 = mu_t;
141  Real zjm1 = w0;
142  Real zjm2 = ONE;
143  Real dzjm1 = ONE;
144  Real dzjm2 = ZERO;
145  Real d2zjm1 = ZERO;
146  Real d2zjm2 = ZERO;
147 
148  for (int j = 2; j <= s; ++j) {
149 
150  Real zj = TWO * w0 * zjm1 - zjm2;
151  Real dzj = TWO * w0 * dzjm1 - dzjm2 + TWO * zjm1;
152  Real d2zj = TWO * w0 * d2zjm1 - d2zjm2 + FOUR * dzjm1;
153  Real b_j = d2zj / (dzj * dzj);
154  Real gamma_t = ONE - (zjm1 * b_jm1);
155 
156  Real nu = -b_j / b_jm2;
157  Real mu = TWO * b_j * w0 / b_jm1;
158  mu_t = mu * w1 / w0;
159 
160  // calculate derivative, use y array for temporary storage
161  dydt (t + (h * c_jm1), pr, y_jm1, y_j, mech);
162 
163  for (int i = 0; i < NSP; ++i) {
164  y_j[INDEX(i)] = (ONE - mu - nu) * y_0[INDEX(i)] + (mu * y_jm1[INDEX(i)]) + (nu * y_jm2[INDEX(i)])
165  + h * mu_t * (y_j[INDEX(i)] - (gamma_t * F_0[INDEX(i)]));
166  }
167  Real c_j = (mu * c_jm1) + (nu * c_jm2) + mu_t * (ONE - gamma_t);
168 
169  if (j < s) {
170  for (int i = 0; i < NSP; ++i) {
171  y_jm2[INDEX(i)] = y_jm1[INDEX(i)];
172  y_jm1[INDEX(i)] = y_j[INDEX(i)];
173  }
174  }
175 
176  c_jm2 = c_jm1;
177  c_jm1 = c_j;
178  b_jm2 = b_jm1;
179  b_jm1 = b_j;
180  zjm2 = zjm1;
181  zjm1 = zj;
182  dzjm2 = dzjm1;
183  dzjm1 = dzj;
184  d2zjm2 = d2zjm1;
185  d2zjm1 = d2zj;
186  }
187 
188 } // rkc_step
189 
191 
192 __device__ void integrate (const Real tstart,
193  const Real tEnd,
194  const Real pr,
195  Real *y,
196  mechanism_memory const * const __restrict__ mech,
197  solver_memory const * const __restrict__ solver) {
209  Real t = tstart;
210  int nstep = 0;
211  int mMax = (int)(round(sqrt(RTOL / (10.0 * UROUND))));
212 
213  if (mMax < 2) {
214  mMax = 2;
215  }
216 
217  Real * const __restrict__ y_n = solver->y_n;
218  //Real y_n[INDEX(NSP)];
219  for (int i = 0; i < NSP; ++i) {
220  y_n[INDEX(i)] = y[INDEX(i)];
221  }
222 
223  // calculate F_n for initial y
224  Real * const __restrict__ F_n = solver->F_n;
225  //Real F_n[INDEX(NSP)];
226  dydt (t, pr, y_n, F_n, mech);
227 
228  // load initial estimate for eigenvector
229  // Real work [INDEX(NSP + 4)];
230  Real * const __restrict__ work = solver->work;
231  if (work[INDEX(2)] < UROUND) {
232  for (int i = 0; i < NSP; ++i) {
233  work[INDEX(4 + i)] = F_n[INDEX(i)];
234  }
235  }
236 
237  const Real stepSizeMax = fabs(tEnd - t);
238  Real stepSizeMin = TEN * UROUND * fmax(fabs(t), stepSizeMax);
239 
240  //Real spec_rad;
241  Real * const __restrict__ temp_arr = solver->temp_arr;
242  //Real temp_arr[INDEX(NSP)];
243  Real * const __restrict__ temp_arr2 = solver->temp_arr2;
244  //Real temp_arr2[INDEX(NSP)];
245 
246  Real * const __restrict__ y_jm1 = solver->y_jm1;
247  Real * const __restrict__ y_jm2 = solver->y_jm2;
248 
249  while (t < tEnd) {
250  Real err;
251 
252  // estimate Jacobian spectral radius
253  // only if 25 steps passed
254  if ((nstep % 25) == 0) {
255  //spec_rad = rkc_spec_rad (t, pr, y_n, F_n, temp_arr, temp_arr2);
256  work[INDEX(3)] = rkc_spec_rad (t, pr, stepSizeMax, y_n, F_n, &work[4 * GRID_DIM], temp_arr2, mech);
257  }
258  //Real spec_rad = rkc_spec_rad (t, pr, y_n, F_n, temp_arr, temp_arr2);
259 
260  if (work[INDEX(2)] < UROUND) {
261  // estimate first time step
262  work[INDEX(2)] = stepSizeMax;
263  if ((work[INDEX(3)] * work[INDEX(2)]) > ONE) {
264  work[INDEX(2)] = ONE / work[INDEX(3)];
265  }
266  work[INDEX(2)] = fmax(work[INDEX(2)], stepSizeMin);
267 
268  for (int i = 0; i < NSP; ++i) {
269  temp_arr[INDEX(i)] = y_n[INDEX(i)] + (work[INDEX(2)] * F_n[INDEX(i)]);
270  }
271  dydt (t + work[INDEX(2)], pr, temp_arr, temp_arr2, mech);
272 
273  err = ZERO;
274  for (int i = 0; i < NSP; ++i) {
275  Real est = (temp_arr2[INDEX(i)] - F_n[INDEX(i)]) / (ATOL + RTOL * fabs(y_n[INDEX(i)]));
276  err += est * est;
277  }
278  err = work[INDEX(2)] * sqrt(err / NSP);
279 
280  if ((P1 * work[INDEX(2)]) < (stepSizeMax * sqrt(err))) {
281  work[INDEX(2)] = fmax(P1 * work[INDEX(2)] / sqrt(err), stepSizeMin);
282  } else {
283  work[INDEX(2)] = stepSizeMax;
284  }
285  }
286 
287  // otherwise use value stored in work[INDEX(2)], calculated in previous step
288 
289  // check if last step
290  if ((ONEP1 * work[INDEX(2)]) >= fabs(tEnd - t)) {
291  work[INDEX(2)] = fabs(tEnd - t);
292  }
293 
294  // calculate number of steps
295  int m = 1 + (int)(sqrt(ONEP54 * work[INDEX(2)] * work[INDEX(3)] + ONE));
296 
297  if (m > mMax) {
298  m = mMax;
299  work[INDEX(2)] = ((Real)(m * m - 1)) / (ONEP54 * work[INDEX(3)]);
300  }
301 
302  // perform tentative time step
303  rkc_step (t, pr, work[INDEX(2)], y_n, F_n, m, y, y_jm1, y_jm2, mech);
304 
305  // calculate F_np1 with tenative y_np1
306  dydt (t + work[INDEX(2)], pr, y, temp_arr, mech);
307 
308  // estimate error
309  err = ZERO;
310  for (int i = 0; i < NSP; ++i) {
311  Real est = P8 * (y_n[INDEX(i)] - y[INDEX(i)]) + P4 * work[INDEX(2)] * (F_n[INDEX(i)] + temp_arr[INDEX(i)]);
312  est /= (ATOL + RTOL * fmax(fabs(y[INDEX(i)]), fabs(y_n[INDEX(i)])));
313  err += est * est;
314  }
315  err = sqrt(err / ((Real)NSP));
316 
317  if (err > ONE) {
318  // error too large, step is rejected
319 
320  // select smaller step size
321  work[INDEX(2)] = P8 * work[INDEX(2)] / (pow(err, ONE3RD));
322 
323  // reevaluate spectral radius
324  //spec_rad = rkc_spec_rad (t, pr, y_n, F_n, temp_arr, temp_arr2);
325  work[INDEX(3)] = rkc_spec_rad (t, pr, stepSizeMax, y_n, F_n, &work[GRID_DIM * 4], temp_arr2, mech);
326  } else {
327  // step accepted
328  t += work[INDEX(2)];
329  nstep++;
330 
331  Real fac = TEN;
332  Real temp1, temp2;
333 
334  if (work[INDEX(1)] < UROUND) {
335  temp2 = pow(err, ONE3RD);
336  if (P8 < (fac * temp2)) {
337  fac = P8 / temp2;
338  }
339  } else {
340  temp1 = P8 * work[INDEX(2)] * pow(work[INDEX(0)], ONE3RD);
341  temp2 = work[INDEX(1)] * pow(err, TWO3RD);
342  if (temp1 < (fac * temp2)) {
343  fac = temp1 / temp2;
344  }
345  }
346 
347  // set "old" values to those for current time step
348  work[INDEX(0)] = err;
349  work[INDEX(1)] = work[INDEX(2)];
350 
351  for (int i = 0; i < NSP; ++i) {
352  y_n[INDEX(i)] = y[INDEX(i)];
353  F_n[INDEX(i)] = temp_arr[INDEX(i)];
354  }
355 
356  work[INDEX(2)] *= fmax(P1, fac);
357  work[INDEX(2)] = fmax(stepSizeMin, fmin(stepSizeMax, work[INDEX(2)]));
358 
359  /* for the momentm this isn't supported
360  if (task == 0) {
361  // only perform one step
362  return;
363  }*/
364  }
365 
366  }
367 
368  int * const __restrict__ result = solver->result;
369  result[T_ID] = EC_success;
370 
371 } // rkc_driver
372 
373 
374 #ifdef GENERATE_DOCS
375 }
376 #endif
#define T_ID
The global CUDA thread index.
Definition: gpu_macros.cuh:22
#define TWO
Definition: rkc_props.cuh:26
#define TEN
Definition: rkc_props.cuh:30
#define ONEP1
Definition: rkc_props.cuh:31
Definition: rkc.cu:5
#define UROUND
Definition: rkc_props.cuh:40
#define ONEP54
Definition: rkc_props.cuh:33
void dydt(const double t, const double mu, const double *__restrict__ y, double *__restrict__ dy)
An implementation of the RHS of the van der Pol equation.
Definition: dydt.c:22
#define ZERO
Definition: rkc_props.cuh:24
#define TWO3RD
Definition: rkc_props.cuh:39
#define P4
Definition: rkc_props.cuh:35
#define NSP
The IVP system size.
Definition: header.cuh:20
#define GRID_DIM
The total number of threads in the Grid, provides an offset between vector entries.
Definition: gpu_macros.cuh:20
#define ONEP2
Definition: rkc_props.cuh:32
Contains header definitions for the CUDA RHS function for the van der Pol example.
__device__ Real rkc_spec_rad(const Real t, const Real pr, const Real hmax, const Real *y, const Real *F, Real *v, Real *Fv, mechanism_memory const *const __restrict__ mech)
Definition: rkc.cu:13
__device__ void integrate(const Real tstart, const Real tEnd, const Real pr, Real *y, mechanism_memory const *const __restrict__ mech, solver_memory const *const __restrict__ solver)
Definition: rkc.cu:192
#define RTOL
#define EC_success
Successful time step.
#define ONE
Definition: rkc_props.cuh:25
#define ATOL
#define P1
Definition: rkc_props.cuh:36
#define Real
Definition: rkc_props.cuh:22
#define P01
Definition: rkc_props.cuh:37
__device__ void rkc_step(const Real t, const Real pr, const Real h, const Real *y_0, const Real *F_0, const int s, Real *y_j, Real *y_jm1, Real *y_jm2, mechanism_memory const *const __restrict__ mech)
Definition: rkc.cu:104
#define FOUR
Definition: rkc_props.cuh:28
#define P8
Definition: rkc_props.cuh:34
#define INDEX(i)
Convenience macro to get the value of a vector at index i, calculated as i * GRID_DIM + T_ID...
Definition: gpu_macros.cuh:24
#define ONE3RD
Definition: rkc_props.cuh:38
Memory required for Radau-IIa GPU solver.
Definition: rkc_props.cuh:64