inline void memcpy2D_in(double* dst, const int pitch_dst,
                        double const * src, const int pitch_src,
                        const int offset, const size_t width,
                        const int height) {
    for (int i = 0; i < height; ++i) {
        /* copy one row of `width` bytes, starting at the source offset */
        memcpy(dst, &src[offset], width);
        /* advance both pointers by their respective pitches (in doubles) */
        dst += pitch_dst;
        src += pitch_src;
    }
}
inline void memcpy2D_out(double* dst, const int pitch_dst,
                         double const * src, const int pitch_src,
                         const int offset, const size_t width,
                         const int height) {
    for (int i = 0; i < height; ++i) {
        /* copy one row of `width` bytes into the destination at the offset */
        memcpy(&dst[offset], src, width);
        dst += pitch_dst;
        src += pitch_src;
    }
}
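To make the calling convention concrete, here is a small, self-contained illustration (hypothetical sizes, not taken from the library): the pitches and the column offset are given in elements, while the width is given in bytes, so gathering a 5-column block that starts at column 2 of a 3 x 8 row-major array into a tightly packed 3 x 5 buffer looks like this.

    #include <string.h>
    #include <stdio.h>

    /* assumes the memcpy2D_in definition shown above */
    int main(void) {
        double src[3 * 8], dst[3 * 5];
        for (int i = 0; i < 3 * 8; ++i)
            src[i] = (double)i;
        /* dst pitch = 5, src pitch = 8, offset = 2 columns,
           width = 5 doubles (in bytes), height = 3 rows */
        memcpy2D_in(dst, 5, src, 8, 2, 5 * sizeof(double), 3);
        printf("%g %g %g\n", dst[0], dst[5], dst[10]);  /* prints 2 10 18 */
        return 0;
    }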
device = device < 0 ? 0 : device;
cudaDeviceProp devProp;
cudaGetDeviceCount(&num_devices);
if ((device >= 0) && (device < num_devices))
printf("Error: GPU device number not in correct range\n");
printf("Provide number between 0 and %i\n", num_devices - 1);
cudaErrorCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
size_t total_mem = 0;
int max_threads = int(floor(0.8 * ((double)free_mem) / ((double)size_per_thread)));
int padded = min(NUM, max_threads);
printf("Mechanism is too large to fit into global CUDA memory... exiting.");
double * __restrict__ y_host, const double * __restrict__ var_host)
double step = stepsize < 0 ? t_end - t_start : stepsize;
double t_next = fmin(end_time, t + step);
while (t + EPS < t_end)
while (num_solved < NUM)
int num_cond = min(NUM - num_solved, padded);
num_cond * sizeof(double), cudaMemcpyHostToDevice));
num_solved, num_cond * sizeof(double), NSP);
num_cond * sizeof(double), NSP, cudaMemcpyHostToDevice) );
num_cond * sizeof(double), NSP, cudaMemcpyDeviceToHost) );
num_solved, num_cond * sizeof(double), NSP);
num_solved += num_cond;
t_next = fmin(t_end, (numSteps + 1) * step);
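Pieced together, these fragments outline a two-level driver loop: the outer loop advances global time in increments of step, and the inner loop sweeps over the NUM conditions in batches of at most padded systems, staging each batch on the device, running the solver, and copying the results back. A hedged reconstruction of that control flow follows; the memcpy2D_in/out argument order, the device parameter pointer d_var, and the elided device copy and kernel launch are assumptions, not the library's exact code.

    while (t + EPS < t_end)
    {
        int num_solved = 0;
        while (num_solved < NUM)
        {
            /* size of the next batch, capped by the padded GPU capacity */
            int num_cond = min(NUM - num_solved, padded);

            /* stage this batch's parameters on the device
               (d_var is a hypothetical name for the device parameter array) */
            cudaErrorCheck( cudaMemcpy(d_var, &var_host[num_solved],
                            num_cond * sizeof(double), cudaMemcpyHostToDevice) );

            /* pack the batch's state into the padded host staging buffer */
            memcpy2D_in(y_temp, padded, y_host, NUM,
                        num_solved, num_cond * sizeof(double), NSP);

            /* ... copy y_temp to the device, launch the solver kernel for
               this batch, and copy the device state back into y_temp ... */

            /* scatter the integrated batch back into the full host array */
            memcpy2D_out(y_host, NUM, y_temp, padded,
                         num_solved, num_cond * sizeof(double), NSP);

            num_solved += num_cond;
        }

        /* advance to the next global output time */
        numSteps++;
        t = t_next;
        t_next = fmin(t_end, (numSteps + 1) * step);
    }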
Interface implementation for GPU solvers to be called as a library.
#define TARGET_BLOCK_SIZE
The target number of threads per block.
void memcpy2D_in(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
__host__ void check_error(int num_conditions, int *code_arr)
void accelerInt_integrate(const int NUM, const double t_start, const double t_end, const double stepsize, double *__restrict__ y_host, const double *__restrict__ var_host)
Integrate NUM ODEs from time t_start to time t_end, using step sizes of stepsize; see the usage sketch at the end of this section.
int padded
Padded # of ODEs to solve.
mechanism_memory * device_mech
mechanism_memory * host_mech
The mechanism memory structs.
void cleanup_solver(solver_memory **, solver_memory **)
void accelerInt_initialize(int NUM, int device)
Initializes the solver.
void initialize_gpu_memory(int padded, mechanism_memory **h_mem, mechanism_memory **d_mem)
Initializes the host and device mechanism_memory structs. This is required in order to enable passing...
void free_gpu_memory(mechanism_memory **h_mem, mechanism_memory **d_mem)
Frees the host and device mechanism_memory structs.
void memcpy2D_out(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
double * y_temp
temporary storage
size_t required_solver_size()
Returns the total size (in bytes) required for memory storage for a single GPU thread. Used in calcula...
dim3 dimBlock
block and grid sizes
#define cudaErrorCheck(ans)
void initialize_solver(const int, solver_memory **, solver_memory **)
int * result_flag
result flag
size_t required_mechanism_size()
Calculates and returns the total memory size (in bytes) required by an individual thread for the mech...
void accelerInt_cleanup()
Cleans up the solver.
solver_memory * host_solver
The solver memory structs.
solver_memory * device_solver
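Taken together, accelerInt_initialize, accelerInt_integrate, and accelerInt_cleanup form the library's external interface. A minimal calling sequence might look like the sketch below; the problem sizes, the choice of device 0, and the assumption that the state array holds NSP values per system with leading dimension NUM are illustrative, not prescribed by the header.

    #include <stdlib.h>

    int main(void) {
        const int NUM = 1024;   /* number of ODE systems to solve (hypothetical) */
        const int device = 0;   /* CUDA device to run on */

        /* y: NSP entries per system, leading dimension NUM;
           var: one parameter value per system */
        double *y   = (double*) malloc(NSP * NUM * sizeof(double));
        double *var = (double*) malloc(NUM * sizeof(double));
        /* ... fill y and var with initial conditions ... */

        accelerInt_initialize(NUM, device);
        /* a negative stepsize means a single step from t_start to t_end */
        accelerInt_integrate(NUM, 0.0, 1.0e-3, -1.0, y, var);
        accelerInt_cleanup();

        free(y);
        free(var);
        return 0;
    }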