inline void memcpy2D_in(double* dst, const int pitch_dst,
                        double const * src, const int pitch_src,
                        const int offset, const size_t width,
                        const int height) {
    for (int i = 0; i < height; ++i) {
        /* copy one row of `width` bytes, starting at the source offset */
        memcpy(dst, &src[offset], width);
        /* advance both pointers by their respective pitches (in doubles) */
        dst += pitch_dst;
        src += pitch_src;
    }
}
inline void memcpy2D_out(double* dst, const int pitch_dst,
                         double const * src, const int pitch_src,
                         const int offset, const size_t width,
                         const int height) {
    for (int i = 0; i < height; ++i) {
        /* copy one row of `width` bytes into the destination at the offset */
        memcpy(&dst[offset], src, width);
        dst += pitch_dst;
        src += pitch_src;
    }
}
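To make the calling convention concrete, here is a small, self-contained illustration (hypothetical sizes, not taken from the library): the pitches and the column offset are given in elements, while the width is given in bytes, so gathering a 5-column block that starts at column 2 of a 3 x 8 row-major array into a tightly packed 3 x 5 buffer looks like this.

    #include <string.h>
    #include <stdio.h>

    /* assumes the memcpy2D_in definition shown above */
    int main(void) {
        double src[3 * 8], dst[3 * 5];
        for (int i = 0; i < 3 * 8; ++i)
            src[i] = (double)i;
        /* dst pitch = 5, src pitch = 8, offset = 2 columns,
           width = 5 doubles (in bytes), height = 3 rows */
        memcpy2D_in(dst, 5, src, 8, 2, 5 * sizeof(double), 3);
        printf("%g %g %g\n", dst[0], dst[5], dst[10]);  /* prints 2 10 18 */
        return 0;
    }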
device = device < 0 ? 0 : device;
cudaDeviceProp devProp;
cudaGetDeviceCount(&num_devices);
if ((device >= 0) && (device < num_devices))
printf("Error: GPU device number not in correct range\n");
printf("Provide number between 0 and %i\n", num_devices - 1);
cudaErrorCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
size_t total_mem = 0;
int max_threads = int(floor(0.8 * ((double)free_mem) / ((double)size_per_thread)));
int padded = min(NUM, max_threads);
printf("Mechanism is too large to fit into global CUDA memory... exiting.");
double * __restrict__ y_host, const double * __restrict__ var_host)
double step = stepsize < 0 ? t_end - t_start : stepsize;
double t_next = fmin(end_time, t + step);
while (t + EPS < t_end)
while (num_solved < NUM)
int num_cond = min(NUM - num_solved, padded);
num_cond * sizeof(double), cudaMemcpyHostToDevice));
num_solved, num_cond * sizeof(double), NSP);
num_cond * sizeof(double), NSP, cudaMemcpyHostToDevice) );
num_cond * sizeof(double), NSP, cudaMemcpyDeviceToHost) );
num_solved, num_cond * sizeof(double), NSP);
num_solved += num_cond;
t_next = fmin(t_end, (numSteps + 1) * step);
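Pieced together, these fragments outline a two-level driver loop: the outer loop advances global time in increments of step, and the inner loop sweeps over the NUM conditions in batches of at most padded systems, staging each batch on the device, running the solver, and copying the results back. A hedged reconstruction of that control flow follows; the memcpy2D_in/out argument order, the device parameter pointer d_var, and the elided device copy and kernel launch are assumptions, not the library's exact code.

    while (t + EPS < t_end)
    {
        int num_solved = 0;
        while (num_solved < NUM)
        {
            /* size of the next batch, capped by the padded GPU capacity */
            int num_cond = min(NUM - num_solved, padded);

            /* stage this batch's parameters on the device
               (d_var is a hypothetical name for the device parameter array) */
            cudaErrorCheck( cudaMemcpy(d_var, &var_host[num_solved],
                            num_cond * sizeof(double), cudaMemcpyHostToDevice) );

            /* pack the batch's state into the padded host staging buffer */
            memcpy2D_in(y_temp, padded, y_host, NUM,
                        num_solved, num_cond * sizeof(double), NSP);

            /* ... copy y_temp to the device, launch the solver kernel for
               this batch, and copy the device state back into y_temp ... */

            /* scatter the integrated batch back into the full host array */
            memcpy2D_out(y_host, NUM, y_temp, padded,
                         num_solved, num_cond * sizeof(double), NSP);

            num_solved += num_cond;
        }

        /* advance to the next global output time */
        numSteps++;
        t = t_next;
        t_next = fmin(t_end, (numSteps + 1) * step);
    }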
Interface implementation for GPU solvers to be called as a library.
#define TARGET_BLOCK_SIZE
The target number of threads per block.
void memcpy2D_in(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
__host__ void check_error(int num_conditions, int *code_arr)
void accelerInt_integrate(const int NUM, const double t_start, const double t_end, const double stepsize, double *__restrict__ y_host, const double *__restrict__ var_host)
Integrate NUM ODEs from time t_start to time t_end, using step sizes of stepsize; see the usage sketch at the end of this section.
int padded
Padded # of ODEs to solve.
mechanism_memory * device_mech
mechanism_memory * host_mech
The mechanism memory structs.
void cleanup_solver(solver_memory **, solver_memory **)
void accelerInt_initialize(int NUM, int device)
Initializes the solver.
void initialize_gpu_memory(int padded, mechanism_memory **h_mem, mechanism_memory **d_mem)
Initializes the host and device mechanism_memory structs. This is required in order to enable passing...
void free_gpu_memory(mechanism_memory **h_mem, mechanism_memory **d_mem)
Frees the host and device mechanism_memory structs.
void memcpy2D_out(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
double * y_temp
temporary storage
size_t required_solver_size()
Returns the total size (in bytes) required for memory storage for a single GPU thread. Used in calcula...
dim3 dimBlock
block and grid sizes
#define cudaErrorCheck(ans)
void initialize_solver(const int, solver_memory **, solver_memory **)
int * result_flag
result flag
size_t required_mechanism_size()
Calculates and returns the total memory size (in bytes) required by an individual thread for the mech...
void accelerInt_cleanup()
Cleans up the solver.
solver_memory * host_solver
The solver memory structs.
solver_memory * device_solver
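Taken together, accelerInt_initialize, accelerInt_integrate, and accelerInt_cleanup form the library's external interface. A minimal calling sequence might look like the sketch below; the problem sizes, the choice of device 0, and the assumption that the state array holds NSP values per system with leading dimension NUM are illustrative, not prescribed by the header.

    #include <stdlib.h>

    int main(void) {
        const int NUM = 1024;   /* number of ODE systems to solve (hypothetical) */
        const int device = 0;   /* CUDA device to run on */

        /* y: NSP entries per system, leading dimension NUM;
           var: one parameter value per system */
        double *y   = (double*) malloc(NSP * NUM * sizeof(double));
        double *var = (double*) malloc(NUM * sizeof(double));
        /* ... fill y and var with initial conditions ... */

        accelerInt_initialize(NUM, device);
        /* a negative stepsize means a single step from t_start to t_end */
        accelerInt_integrate(NUM, 0.0, 1.0e-3, -1.0, y, var);
        accelerInt_cleanup();

        free(y);
        free(var);
        return 0;
    }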