// NOTE(review): this chunk is an extraction-garbled fragment — the decimal
// numbers fused into the text (19, 27, 40, 59, ...) are the ORIGINAL file's
// line numbers, and interior lines (braces, the declaration of `buffer`,
// several #if branches) are missing from this view. Code is kept byte-identical.
//
// write_log: appends one binary record to pFile — the current simulation time
// t, followed by one NN-double state vector per problem (NUM problems total).
// y_host is laid out variable-major: the value of variable i for problem j
// lives at y_host[NUM * i + j].
19 #include <sys/types.h> 27 #include <cuda_runtime.h> 28 #include <helper_cuda.h> 29 #include <cuComplex.h> 40 #ifdef DIVERGENCE_TEST 59 void write_log(
int NUM,
double t,
const double* y_host, FILE* pFile)
// Write the timestamp once per record.
61 fwrite(&t,
sizeof(
double), 1, pFile);
// Gather problem j's strided state into the contiguous `buffer` (declared on
// a line not visible in this fragment — TODO confirm its size is >= NN) ...
63 for (
int j = 0; j < NUM; j++)
66 buffer[0] = y_host[j];
67 for (
int i = 1; i <
NSP; ++i)
69 buffer[i] = y_host[NUM * i + j];
// ... then emit NN contiguous doubles for this problem.
// NOTE(review): the #if NN == NSP + 1 branch (the pyJac case, where the last
// mass fraction is reconstructed) is partially elided here — confirm against
// the full source before editing.
72 #if NN == NSP + 1 //pyjac 76 fwrite(buffer,
sizeof(
double),
NN, pFile);
// memcpy2D_in: host-side convenience copy between 2D buffers of differing
// pitches (a host-pointer analogue of cudaMemcpy2D, used to stage data into
// the padded device layout). Copies `height` rows of `width` BYTES each
// (memcpy's size argument is a byte count), reading each row from src at
// element offset `offset`.
// NOTE(review): the per-row pointer advances (presumably dst += pitch_dst;
// src += pitch_src;) sit on original lines elided from this fragment
// (numbering jumps 98 -> 100 -> 122) — confirm against the full source;
// without them this loop would overwrite the same row, which is almost
// certainly an extraction artifact, not a real bug.
96 inline void memcpy2D_in(
double* dst,
const int pitch_dst,
double const * src,
const int pitch_src,
97 const int offset,
const size_t width,
const int height) {
98 for (
int i = 0; i < height; ++i)
100 memcpy(dst, &src[offset], width);
// memcpy2D_out: mirror of memcpy2D_in — copies results back out of the padded
// layout, writing each of `height` rows (`width` bytes per row, memcpy byte
// count) into dst starting at element offset `offset`.
// NOTE(review): as with memcpy2D_in, the per-row pointer advances
// (presumably dst += pitch_dst; src += pitch_src;) are on original lines
// elided from this fragment (numbering jumps 124 -> 126 -> 150) — confirm
// against the full source before treating this as a complete body.
122 inline void memcpy2D_out(
double* dst,
const int pitch_dst,
double const * src,
const int pitch_src,
123 const int offset,
const size_t width,
const int height) {
124 for (
int i = 0; i < height; ++i)
126 memcpy(&dst[offset], src, width);
// main: generic driver for the GPU ODE solvers. Parses the problem count and
// CUDA device id from argv, sizes device memory, reads initial conditions,
// integrates the problems in batches, logs state to ./log/, and prints timing
// plus an ignition-delay estimate.
// NOTE(review): this fragment elides large spans of the original file
// (numbering jumps such as 167->174, 227->234, 341->349 mark missing error
// checks, allocations, and the solver-kernel invocations themselves).
150 int main (
int argc,
char *argv[])
// Trap floating-point div-by-zero / invalid / overflow on the host so silent
// NaNs become immediate signals (glibc feenableexcept).
155 feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
// argv[1]: number of ODE problems to solve; must be a positive integer.
163 int problemsize = NUM;
164 if (sscanf(argv[1],
"%i", &problemsize) != 1 || (problemsize <= 0))
166 printf(
"Error: Problem size not in correct range\n");
167 printf(
"Provide number greater than 0\n");
// argv[2] (optional): CUDA device id, validated against the device count.
174 cudaDeviceProp devProp;
187 cudaGetDeviceCount(&num_devices);
190 if (sscanf(argv[2],
"%i", &
id) == 1 && (
id >= 0) && (
id < num_devices))
197 printf(
"Error: GPU device number not in correct range\n");
198 printf(
"Provide number between 0 and %i\n", num_devices - 1);
// Divergence test tracks per-warp statistics, so NUM must be a whole number
// of 32-thread warps.
206 #ifdef DIVERGENCE_TEST 208 assert(NUM % 32 == 0);
// 8-byte shared-memory banks: this code works in doubles.
211 cudaErrorCheck(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte));
// Cap the number of concurrently-resident problems at 80% of free device
// memory; `padded` is the per-batch problem count.
217 size_t total_mem = 0;
221 int max_threads = int(floor(0.8 * ((
double)free_mem) / ((
double)size_per_thread)));
222 int padded = min(NUM, max_threads);
227 printf(
"Mechanism is too large to fit into global CUDA memory... exiting.");
234 host_mech = (mechanism_memory*)malloc(
sizeof(mechanism_memory));
// Initial-condition file chosen by compile flags (SHUFFLE / SAME_IC).
243 const char* filename =
"shuffled_data.bin";
244 #elif !defined(SAME_IC) 245 const char* filename =
"ign_data.bin";
// Ignition-detection state. T0 = initial y_host[0]; the +400 test below
// suggests y_host[0] is temperature [K] for kinetics runs — TODO confirm.
263 bool ign_flag =
false;
266 double T0 = y_host[0];
// Binary log file: log/<solver name>-log.bin; requires ./log/ to exist.
273 int len = strlen(f_name);
274 char out_name[len + 13];
276 if (stat(
"./log/", &info) != 0)
278 printf(
"Expecting 'log' subdirectory in current working directory. Please run" 279 " mkdir log (or the equivalent) and run again.\n");
282 sprintf(out_name,
"log/%s-log.bin", f_name);
283 pFile = fopen(out_name,
"wb");
// Batch loop: solve up to `padded` problems at a time until all NUM are done.
// Each pass stages host data into the padded layout (memcpy2D_in), copies it
// H2D, integrates (solver call elided from this fragment), copies D2H, and
// unpacks (memcpy2D_out).
310 while (num_solved < NUM)
312 int num_cond = min(NUM - num_solved,
padded);
315 num_cond *
sizeof(
double), cudaMemcpyHostToDevice));
319 num_solved, num_cond *
sizeof(
double),
NSP);
323 num_cond *
sizeof(
double),
NSP,
324 cudaMemcpyHostToDevice) );
336 num_cond *
sizeof(
double),
NSP,
337 cudaMemcpyDeviceToHost) );
339 num_solved, num_cond *
sizeof(
double),
NSP);
341 num_solved += num_cond;
// Progress print plus a sanity check on y_host[0] (physical bounds 0..10000).
349 printf(
"%.15le\t%.15le\n", t, y_host[0]);
353 if ((y_host[0] < 0.0) || (y_host[0] > 10000.0))
355 printf(
"Error, out of bounds.\n");
356 printf(
"Time: %e, ind %d val %e\n", t, 0, y_host[0]);
// Ignition detected on the first 400-unit rise of y_host[0] above T0.
361 #if !defined(LOG_END_ONLY) 368 if ((y_host[0] >= (T0 + 400.0)) && !(ign_flag)) {
// DIVERGENCE_TEST post-processing: per-warp step-count statistics written to
// log/<solver name>-div.txt; `max` is the largest step count within a warp.
385 #ifdef DIVERGENCE_TEST 388 int warps = NUM / 32;
392 int len = strlen(f_name);
393 char out_name[len + 13];
394 sprintf(out_name,
"log/%s-div.txt", f_name);
395 dFile = fopen(out_name,
"w");
397 for (
int i = 0; i < warps; ++i)
401 for (
int j = 0; j < 32; ++j)
403 int steps = host_integrator_steps[index];
405 max = steps > max ? steps : max;
409 fprintf(dFile,
"%.15e\n", d);
// Timing summary: total wall time, then per-step / per-thread averages,
// the ignition delay, and the final y_host[0] value.
415 printf (
"Time: %.15e sec\n", runtime);
416 runtime = runtime / ((double)(numSteps));
417 printf (
"Time per step: %e (s)\t%.15e (s/thread)\n", runtime, runtime / NUM);
419 printf (
"Ig. Delay (s): %e\n", t_ign);
421 printf(
"TFinal: %e\n", y_host[0]);
void write_log(int NUM, double t, const double *y_host, FILE *pFile)
Writes state vectors to file.
void memcpy2D_out(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
#define TARGET_BLOCK_SIZE
The target number of threads per block.
__host__ void check_error(int num_conditions, int *code_arr)
int padded
Padded # of ODEs to solve.
mechanism_memory * device_mech
__device__ int integrator_steps[DIVERGENCE_TEST]
If DIVERGENCE_TEST is defined, this creates a device array for tracking.
mechanism_memory * host_mech
The mechanism memory structs.
const char * solver_name()
Returns a descriptive solver name.
Timer interface for Linux.
void apply_reverse_mask(double *y_host)
Not needed for van der Pol.
void initialize_gpu_memory(int padded, mechanism_memory **h_mem, mechanism_memory **d_mem)
Initializes the host and device mechanism_memory structs. This is required in order to enable passing...
void free_gpu_memory(mechanism_memory **h_mem, mechanism_memory **d_mem)
Frees the host and device mechanism_memory structs.
void initialize_solver(int num_threads)
Initializes the solver.
int main(int argc, char *argv[])
double * y_temp
temporary storage
size_t required_solver_size()
Returns the total size (in bytes) required for memory storage for a single GPU thread Used in calcula...
dim3 dimBlock
block and grid sizes
the generic main file for all GPU solvers
void set_same_initial_conditions(int NUM, double **y_host, double **var_host)
Set same ICs for all problems.
void memcpy2D_in(double *dst, const int pitch_dst, double const *src, const int pitch_src, const int offset, const size_t width, const int height)
A convenience method to copy memory between host pointers of different pitches, widths and heights...
void cleanup_solver(int num_threads)
Cleans up the created solvers.
#define cudaErrorCheck(ans)
definition of the generic initial condition reader
Headers for GPU memory initialization.
int * result_flag
result flag
void read_initial_conditions(const char *filename, int NUM, double **y_host, double **variable_host)
Reads initial conditions for IVPs from binary file.
size_t required_mechanism_size()
Calculates and returns the total memory size (in bytes) required by an individual thread for the mech...
A number of definitions that control CUDA kernel launches.
solver_memory * host_solver
The solver memory structs.
solver_memory * device_solver