accelerInt  v0.1
solver_main.cu
Go to the documentation of this file.
1 
12 /* Include common code. */
13 #include <stdlib.h>
14 #include <stdio.h>
15 #include <math.h>
16 #include <string.h>
17 #include <stdbool.h>
18 #include <complex.h>
19 #include <sys/types.h>
20 #include <sys/stat.h>
21 #ifdef DEBUG
22 #include <fenv.h>
23 #endif
24 
25 /* Include CUDA libraries. */
26 #include <cuda.h>
27 #include <cuda_runtime.h>
28 #include <helper_cuda.h>
29 #include <cuComplex.h>
30 
31 //our code
32 #include "header.cuh"
33 #include "timer.h"
34 //get our solver stuff
35 #include "solver.cuh"
36 #include "gpu_memory.cuh"
38 #include "launch_bounds.cuh"
39 
40 #ifdef DIVERGENCE_TEST
41  #include <assert.h>
43  // internal integrator steps per thread
44  __device__ int integrator_steps[DIVERGENCE_TEST] = {0};
45 #endif
46 
59 void write_log(int NUM, double t, const double* y_host, FILE* pFile)
60 {
61  fwrite(&t, sizeof(double), 1, pFile);
62  double buffer[NN];
63  for (int j = 0; j < NUM; j++)
64  {
65  double Y_N = 1.0;
66  buffer[0] = y_host[j];
67  for (int i = 1; i < NSP; ++i)
68  {
69  buffer[i] = y_host[NUM * i + j];
70  Y_N -= buffer[i];
71  }
72  #if NN == NSP + 1 //pyjac
73  buffer[NSP] = Y_N;
74  #endif
75  apply_reverse_mask(&buffer[1]);
76  fwrite(buffer, sizeof(double), NN, pFile);
77  }
78 }
79 
96 inline void memcpy2D_in(double* dst, const int pitch_dst, double const * src, const int pitch_src,
97  const int offset, const size_t width, const int height) {
98  for (int i = 0; i < height; ++i)
99  {
100  memcpy(dst, &src[offset], width);
101  dst += pitch_dst;
102  src += pitch_src;
103  }
104 }
105 
122 inline void memcpy2D_out(double* dst, const int pitch_dst, double const * src, const int pitch_src,
123  const int offset, const size_t width, const int height) {
124  for (int i = 0; i < height; ++i)
125  {
126  memcpy(&dst[offset], src, width);
127  dst += pitch_dst;
128  src += pitch_src;
129  }
130 }
131 
133 
150 int main (int argc, char *argv[])
151 {
152 
153 //enable signaling NAN and other bad numerics tracking for easier debugging
154 #ifdef DEBUG
155  feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
156 #endif
157 
158  int NUM = 1;
159 
160  // check for problem size given as command line option
161  if (argc > 1)
162  {
163  int problemsize = NUM;
164  if (sscanf(argv[1], "%i", &problemsize) != 1 || (problemsize <= 0))
165  {
166  printf("Error: Problem size not in correct range\n");
167  printf("Provide number greater than 0\n");
168  exit(1);
169  }
170  NUM = problemsize;
171  }
172 
173  // set & initialize device using command line argument (if any)
174  cudaDeviceProp devProp;
175  if (argc <= 2)
176  {
177  // default device id is 0
178  cudaErrorCheck (cudaSetDevice (0) );
179  cudaErrorCheck (cudaGetDeviceProperties(&devProp, 0));
180  }
181  else
182  {
183  // use second argument for number
184 
185  // get number of devices
186  int num_devices;
187  cudaGetDeviceCount(&num_devices);
188 
189  int id = 0;
190  if (sscanf(argv[2], "%i", &id) == 1 && (id >= 0) && (id < num_devices))
191  {
192  cudaErrorCheck( cudaSetDevice (id) );
193  }
194  else
195  {
196  // not in range, error
197  printf("Error: GPU device number not in correct range\n");
198  printf("Provide number between 0 and %i\n", num_devices - 1);
199  exit(1);
200  }
201  cudaErrorCheck (cudaGetDeviceProperties(&devProp, id));
202  }
203  cudaErrorCheck( cudaDeviceReset() );
204  cudaErrorCheck( cudaPeekAtLastError() );
205  cudaErrorCheck( cudaDeviceSynchronize() );
206  #ifdef DIVERGENCE_TEST
207  NUM = DIVERGENCE_TEST;
208  assert(NUM % 32 == 0);
209  #endif
210  //bump up shared mem bank size
211  cudaErrorCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
212  //and L1 size
213  cudaErrorCheck(cudaDeviceSetCacheConfig(cudaFuncCachePreferL1));
214 
215  size_t size_per_thread = required_mechanism_size() + required_solver_size();
216  size_t free_mem = 0;
217  size_t total_mem = 0;
218  cudaErrorCheck( cudaMemGetInfo (&free_mem, &total_mem) );
219 
220  //conservatively estimate the maximum allowable threads
221  int max_threads = int(floor(0.8 * ((double)free_mem) / ((double)size_per_thread)));
222  int padded = min(NUM, max_threads);
223  //padded is next factor of block size up
224  padded = int(ceil(padded / float(TARGET_BLOCK_SIZE)) * TARGET_BLOCK_SIZE);
225  if (padded == 0)
226  {
227  printf("Mechanism is too large to fit into global CUDA memory... exiting.");
228  exit(-1);
229  }
230 
232  mechanism_memory* host_mech, *device_mech;
233  host_solver = (solver_memory*)malloc(sizeof(solver_memory));
234  host_mech = (mechanism_memory*)malloc(sizeof(mechanism_memory));
235 
238 
239  // print number of threads and block size
240  printf ("# threads: %d \t block size: %d\n", NUM, TARGET_BLOCK_SIZE);
241 
242 #ifdef SHUFFLE
243  const char* filename = "shuffled_data.bin";
244 #elif !defined(SAME_IC)
245  const char* filename = "ign_data.bin";
246 #endif
247 
248  double* y_host;
249  double* var_host;
250 #ifdef SAME_IC
251  set_same_initial_conditions(NUM, &y_host, &var_host);
252 #else
253  read_initial_conditions(filename, NUM, &y_host, &var_host);
254 #endif
255 
256  //grid sizes
257  dim3 dimBlock(TARGET_BLOCK_SIZE, 1);
258  dim3 dimGrid (padded / TARGET_BLOCK_SIZE, 1 );
259  int* result_flag = (int*)malloc(padded * sizeof(int));
260 
261 // flag for ignition
262 #ifdef IGN
263  bool ign_flag = false;
264  // ignition delay time, units [s]
265  double t_ign = 0.0;
266  double T0 = y_host[0];
267 #endif
268 
269 #ifdef LOG_OUTPUT
270  // file for data
271  FILE *pFile;
272  const char* f_name = solver_name();
273  int len = strlen(f_name);
274  char out_name[len + 13];
275  struct stat info;
276  if (stat("./log/", &info) != 0)
277  {
278  printf("Expecting 'log' subdirectory in current working directory. Please run"
279  " mkdir log (or the equivalent) and run again.\n");
280  exit(-1);
281  }
282  sprintf(out_name, "log/%s-log.bin", f_name);
283  pFile = fopen(out_name, "wb");
284 
285  write_log(NUM, 0, y_host, pFile);
286 
287  //initialize integrator specific log
288  init_solver_log();
289 #endif
290 
291  double* y_temp = 0;
292  y_temp = (double*)malloc(padded * NSP * sizeof(double));
293 
295  // start timer
296  StartTimer();
298 
299  // set initial time
300  double t = 0;
301  double t_next = fmin(end_time, t_step);
302  int numSteps = 0;
303 
304  // time integration loop
305  while (t + EPS < end_time)
306  {
307  numSteps++;
308 
309  int num_solved = 0;
310  while (num_solved < NUM)
311  {
312  int num_cond = min(NUM - num_solved, padded);
313 
314  cudaErrorCheck( cudaMemcpy (host_mech->var, &var_host[num_solved],
315  num_cond * sizeof(double), cudaMemcpyHostToDevice));
316 
317  //copy our memory into y_temp
318  memcpy2D_in(y_temp, padded, y_host, NUM,
319  num_solved, num_cond * sizeof(double), NSP);
320  // transfer memory to GPU
321  cudaErrorCheck( cudaMemcpy2D (host_mech->y, padded * sizeof(double),
322  y_temp, padded * sizeof(double),
323  num_cond * sizeof(double), NSP,
324  cudaMemcpyHostToDevice) );
325  intDriver <<< dimGrid, dimBlock, SHARED_SIZE >>> (num_cond, t, t_next, host_mech->var, host_mech->y, device_mech, device_solver);
326  #ifdef DEBUG
327  cudaErrorCheck( cudaPeekAtLastError() );
328  cudaErrorCheck( cudaDeviceSynchronize() );
329  #endif
330  // copy the result flag back
331  cudaErrorCheck( cudaMemcpy(result_flag, host_solver->result, num_cond * sizeof(int), cudaMemcpyDeviceToHost) );
332  check_error(num_cond, result_flag);
333  // transfer memory back to CPU
334  cudaErrorCheck( cudaMemcpy2D (y_temp, padded * sizeof(double),
335  host_mech->y, padded * sizeof(double),
336  num_cond * sizeof(double), NSP,
337  cudaMemcpyDeviceToHost) );
338  memcpy2D_out(y_host, NUM, y_temp, padded,
339  num_solved, num_cond * sizeof(double), NSP);
340 
341  num_solved += num_cond;
342 
343  }
344 
345  t = t_next;
346  t_next = fmin(end_time, (numSteps + 1) * t_step);
347 
348  #if defined(PRINT)
349  printf("%.15le\t%.15le\n", t, y_host[0]);
350  #endif
351  #ifdef DEBUG
352  // check if within bounds
353  if ((y_host[0] < 0.0) || (y_host[0] > 10000.0))
354  {
355  printf("Error, out of bounds.\n");
356  printf("Time: %e, ind %d val %e\n", t, 0, y_host[0]);
357  return 1;
358  }
359  #endif
360  #ifdef LOG_OUTPUT
361  #if !defined(LOG_END_ONLY)
362  write_log(NUM, t, y_host, pFile);
363  solver_log();
364  #endif
365  #endif
366  #ifdef IGN
367  // determine if ignition has occurred
368  if ((y_host[0] >= (T0 + 400.0)) && !(ign_flag)) {
369  ign_flag = true;
370  t_ign = t;
371  }
372  #endif
373  }
374 
375 #ifdef LOG_END_ONLY
376  write_log(NUM, t, y_host, pFile);
377  solver_log();
378 #endif
379 
381  // end timer
382  double runtime = GetTimer();
384 
385  #ifdef DIVERGENCE_TEST
386  int host_integrator_steps[DIVERGENCE_TEST];
387  cudaErrorCheck( cudaMemcpyFromSymbol(host_integrator_steps, integrator_steps, DIVERGENCE_TEST * sizeof(int)) );
388  int warps = NUM / 32;
389 
390  FILE *dFile;
391  const char* f_name = solver_name();
392  int len = strlen(f_name);
393  char out_name[len + 13];
394  sprintf(out_name, "log/%s-div.txt", f_name);
395  dFile = fopen(out_name, "w");
396  int index = 0;
397  for (int i = 0; i < warps; ++i)
398  {
399  double d = 0;
400  int max = 0;
401  for (int j = 0; j < 32; ++j)
402  {
403  int steps = host_integrator_steps[index];
404  d += steps;
405  max = steps > max ? steps : max;
406  index++;
407  }
408  d /= (32.0 * max);
409  fprintf(dFile, "%.15e\n", d);
410  }
411  fclose(dFile);
412  #endif
413 
414  runtime /= 1000.0;
415  printf ("Time: %.15e sec\n", runtime);
416  runtime = runtime / ((double)(numSteps));
417  printf ("Time per step: %e (s)\t%.15e (s/thread)\n", runtime, runtime / NUM);
418 #ifdef IGN
419  printf ("Ig. Delay (s): %e\n", t_ign);
420 #endif
421  printf("TFinal: %e\n", y_host[0]);
422 
423 #ifdef LOG_OUTPUT
424  fclose (pFile);
425 #endif
426 
427 
430  free(y_temp);
431  free(host_mech);
432  free(host_solver);
433  free(result_flag);
434  cudaErrorCheck( cudaDeviceReset() );
435 
436  return 0;
437 }
void write_log(int NUM, double t, const double *y_host, FILE *pFile)
Writes state vectors to file.
Definition: solver_main.cu:59
void memcpy2D_out(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convienience method to copy memory between host pointers of different pitches, widths and heights...
Definition: solver_main.cu:122
#define TARGET_BLOCK_SIZE
The target number of threads per block.
__host__ void check_error(int num_conditions, int *code_arr)
#define DIVERGENCE_TEST
int padded
Padded # of ODEs to solve.
mechanism_memory * device_mech
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking.
Definition: solver_main.cu:44
void solver_log()
Definition: cvodes_init.c:192
#define NSP
The IVP system size.
Definition: header.cuh:20
mechanism_memory * host_mech
The mechanism memory structs.
const char * solver_name()
Returns a descriptive solver name.
Definition: cvodes_init.c:181
Timer interface for Linux.
#define t_step
void apply_reverse_mask(double *y_host)
Not needed for van der Pol.
Definition: ics.cu:53
void initialize_gpu_memory(int padded, mechanism_memory **h_mem, mechanism_memory **d_mem)
Initializes the host and device mechanism_memory structs. This is required in order to enable passing...
Definition: gpu_memory.cu:30
void free_gpu_memory(mechanism_memory **h_mem, mechanism_memory **d_mem)
Frees the host and device mechanism_memory structs.
Definition: gpu_memory.cu:47
void StartTimer()
Definition: timer.h:43
void initialize_solver(int num_threads)
Initializes the solver.
Definition: cvodes_init.c:49
int main(int argc, char *argv[])
Definition: solver_main.cu:150
double * y_temp
temorary storage
size_t required_solver_size()
Returns the total size (in bytes) required for memory storage for a single GPU thread Used in calcula...
Definition: exp4_init.cu:167
dim3 dimBlock
block and grid sizes
the generic main file for all GPU solvers
void set_same_initial_conditions(int NUM, double **y_host, double **var_host)
Set same ICs for all problems.
Definition: ics.cu:25
void memcpy2D_in(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convienience method to copy memory between host pointers of different pitches, widths and heights...
Definition: solver_main.cu:96
An example header file that defines system size, memory functions and other required methods for inte...
void cleanup_solver(int num_threads)
Cleans up the created solvers.
Definition: cvodes_init.c:165
#define cudaErrorCheck(ans)
Definition: gpu_macros.cuh:26
definition of the generic initial condition reader
double GetTimer()
Definition: timer.h:60
Headers for GPU memory initialization.
void init_solver_log()
Definition: cvodes_init.c:190
int * result_flag
result flag
void read_initial_conditions(const char *filename, int NUM, double **y_host, double **variable_host)
Reads initial conditions for IVPs from binary file.
size_t required_mechanism_size()
Calculates and returns the total memory size (in bytes) required by an individual thread for the mech...
Definition: gpu_memory.cu:15
A number of definitions that control CUDA kernel launches.
#define EPS
#define NN
Input vector size (in read_initial_conditions)
Definition: header.cuh:22
#define end_time
solver_memory * host_solver
The solver memory structs.
solver_memory * device_solver