accelerInt v0.1
solver_interface.cu
/*! \file solver_interface.cu
 *  \brief Interface implementation for GPU solvers to be called as a library.
 */

#include "solver_interface.cuh"

#ifdef GENERATE_DOCS
namespace genericcu {
#endif

//! Padded # of ODEs to solve
int padded;
//! The mechanism memory structs
mechanism_memory* host_mech, *device_mech;
//! The solver memory structs
solver_memory* host_solver, *device_solver;
//! block and grid sizes
dim3 dimBlock, dimGrid;
//! result flag
int* result_flag;
//! temporary storage
double* y_temp;
/*! \brief A convenience method to copy memory between host pointers of different
 *         pitches, widths and heights (host array -> local padded buffer).
 */
inline void memcpy2D_in(double* dst, const int pitch_dst, double const * src, const int pitch_src,
                        const int offset, const size_t width, const int height) {
    for (int i = 0; i < height; ++i)
    {
        memcpy(dst, &src[offset], width);
        dst += pitch_dst;
        src += pitch_src;
    }
}

/*! \brief A convenience method to copy memory between host pointers of different
 *         pitches, widths and heights (local padded buffer -> host array).
 */
inline void memcpy2D_out(double* dst, const int pitch_dst, double const * src, const int pitch_src,
                         const int offset, const size_t width, const int height) {
    for (int i = 0; i < height; ++i)
    {
        memcpy(&dst[offset], src, width);
        dst += pitch_dst;
        src += pitch_src;
    }
}
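
/* A minimal sketch of how these helpers behave (the values below are
 * hypothetical). With a host pitch of 4 conditions, a padded pitch of 8,
 * an offset of 2 and a width of 2 * sizeof(double):
 *
 *     memcpy2D_in(y_temp, 8, y_host, 4, 2, 2 * sizeof(double), NSP);
 *
 * copies y_host[4 * j + 2] and y_host[4 * j + 3] into y_temp[8 * j] and
 * y_temp[8 * j + 1] for each row j in [0, NSP); memcpy2D_out performs the
 * inverse scatter. Note that the pitches are in elements (the double pointers
 * advance by pitch per row) while the width is in bytes, as memcpy expects.
 */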

/*! \brief Initializes the solver.
 *  \param NUM     the number of ODEs to integrate
 *  \param device  the CUDA device number to use; if < 0, device 0 is selected
 */
void accelerInt_initialize(int NUM, int device) {
    device = device < 0 ? 0 : device;

    // set & initialize device using command line argument (if any)
    cudaDeviceProp devProp;
    // get number of devices
    int num_devices;
    cudaGetDeviceCount(&num_devices);

    if ((device >= 0) && (device < num_devices))
    {
        cudaErrorCheck( cudaSetDevice(device) );
    }
    else
    {
        // not in range, error
        printf("Error: GPU device number not in correct range\n");
        printf("Provide number between 0 and %i\n", num_devices - 1);
        exit(1);
    }
    cudaErrorCheck( cudaGetDeviceProperties(&devProp, device) );

    // reset device
    cudaErrorCheck( cudaDeviceReset() );
    cudaErrorCheck( cudaPeekAtLastError() );
    cudaErrorCheck( cudaDeviceSynchronize() );

    // bump up shared mem bank size
    cudaErrorCheck( cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte) );
    // and L1 size
    cudaErrorCheck( cudaDeviceSetCacheConfig(cudaFuncCachePreferL1) );

    // get the memory sizes
    size_t size_per_thread = required_mechanism_size() + required_solver_size();
    size_t free_mem = 0;
    size_t total_mem = 0;
    cudaErrorCheck( cudaMemGetInfo(&free_mem, &total_mem) );

    // conservatively estimate the maximum allowable threads;
    // assign to the global `padded` (re-declaring `int padded` here would
    // shadow the global and leave it uninitialized for accelerInt_integrate)
    int max_threads = int(floor(0.8 * ((double)free_mem) / ((double)size_per_thread)));
    padded = min(NUM, max_threads);
    // padded is the next multiple of the block size up
    padded = int(ceil(padded / float(TARGET_BLOCK_SIZE)) * TARGET_BLOCK_SIZE);
    if (padded == 0)
    {
        printf("Mechanism is too large to fit into global CUDA memory... exiting.\n");
        exit(-1);
    }

    // initialize memory
    initialize_gpu_memory(padded, &host_mech, &device_mech);
    initialize_solver(padded, &host_solver, &device_solver);

    // grid sizes
    dimBlock = dim3(TARGET_BLOCK_SIZE, 1);
    dimGrid = dim3(padded / TARGET_BLOCK_SIZE, 1);
    // local storage
    result_flag = (int*)malloc(padded * sizeof(int));
    y_temp = (double*)malloc(padded * NSP * sizeof(double));
}
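
/* A minimal usage sketch (hypothetical caller code; y_host and var_host are
 * caller-allocated host arrays, laid out as described in the note after
 * accelerInt_integrate below):
 *
 *     accelerInt_initialize(NUM, 0);                  // select GPU 0
 *     accelerInt_integrate(NUM, 0.0, 1.0e-3, 1.0e-6,  // t in [0, 1e-3], global steps of 1e-6
 *                          y_host, var_host);
 *     accelerInt_cleanup();
 */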

/*! \brief integrate NUM odes from time t_start to time t_end, using stepsizes of stepsize
 *
 *  \param NUM       the number of ODEs to integrate
 *  \param t_start   the starting time
 *  \param t_end     the end time
 *  \param stepsize  the global integration step size; a negative value requests
 *                   a single step covering the whole interval
 *  \param y_host    the host state vectors (pitched by NUM, NSP rows)
 *  \param var_host  the host parameter array (one entry per IVP)
 */
void accelerInt_integrate(const int NUM, const double t_start, const double t_end, const double stepsize,
                          double * __restrict__ y_host, const double * __restrict__ var_host)
{
    double step = stepsize < 0 ? t_end - t_start : stepsize;
    double t = t_start;
    double t_next = fmin(t_end, t + step);
    int numSteps = 0;

    // time integration loop
    while (t + EPS < t_end)
    {
        numSteps++;
        int num_solved = 0;
        while (num_solved < NUM)
        {
            // solve at most `padded` IVPs per kernel launch
            int num_cond = min(NUM - num_solved, padded);

            cudaErrorCheck( cudaMemcpy(host_mech->var, &var_host[num_solved],
                            num_cond * sizeof(double), cudaMemcpyHostToDevice) );

            // copy our memory into y_temp
            memcpy2D_in(y_temp, padded, y_host, NUM,
                        num_solved, num_cond * sizeof(double), NSP);
            // transfer memory to GPU
            cudaErrorCheck( cudaMemcpy2D(host_mech->y, padded * sizeof(double),
                            y_temp, padded * sizeof(double),
                            num_cond * sizeof(double), NSP,
                            cudaMemcpyHostToDevice) );
            intDriver <<< dimGrid, dimBlock, SHARED_SIZE >>> (num_cond, t, t_next,
                            host_mech->var, host_mech->y, device_mech, device_solver);
            #ifdef DEBUG
            cudaErrorCheck( cudaPeekAtLastError() );
            cudaErrorCheck( cudaDeviceSynchronize() );
            #endif
            // copy the result flag back
            cudaErrorCheck( cudaMemcpy(result_flag, host_solver->result,
                            num_cond * sizeof(int), cudaMemcpyDeviceToHost) );
            check_error(num_cond, result_flag);
            // transfer memory back to CPU
            cudaErrorCheck( cudaMemcpy2D(y_temp, padded * sizeof(double),
                            host_mech->y, padded * sizeof(double),
                            num_cond * sizeof(double), NSP,
                            cudaMemcpyDeviceToHost) );
            memcpy2D_out(y_host, NUM, y_temp, padded,
                         num_solved, num_cond * sizeof(double), NSP);

            num_solved += num_cond;
        }
        t = t_next;
        // measure from t_start to avoid drift when t_start != 0
        t_next = fmin(t_end, t_start + (numSteps + 1) * step);
    }
}
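
/* Layout note implied by the pitched copies above: y_host stores species j of
 * IVP i at y_host[i + NUM * j], and the padded buffers use the same scheme
 * with pitch `padded`. A hypothetical two-IVP, three-species case (NUM = 2,
 * NSP = 3) is laid out as:
 *
 *     y_host = { y0_0, y1_0,    // species 0 of IVPs 0 and 1
 *                y0_1, y1_1,    // species 1
 *                y0_2, y1_2 };  // species 2
 */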

/*! \brief Cleans up the solver.
 */
void accelerInt_cleanup() {
    free_gpu_memory(&host_mech, &device_mech);
    cleanup_solver(&host_solver, &device_solver);
    free(y_temp);
    free(host_mech);
    free(host_solver);
    free(result_flag);
    cudaErrorCheck( cudaDeviceReset() );
}

#ifdef GENERATE_DOCS
}
#endif