Source code for pyjac.core.CUDAParams
"""
Module containing parameters that control CUDA code generation

Parameters
----------
Jacob_Unroll : int
    The number of reactions to attempt to place in each Jacobian reaction
    update subfile
Jacob_Spec_Unroll : int
    The number of species to attempt to place in each Jacobian species
    update subfile
Rates_Unroll : int
    The number of reactions to limit each reaction rate subfile to
Max_Lines : int
    The number of lines to attempt to limit each Jacobian reaction update
    subfile to
Max_Spec_Lines : int
    The number of lines to attempt to limit each Jacobian species update
    subfile to
"""
# Standard libraries
import os
from math import floor
# Target number of reactions to place in each Jacobian reaction update subfile
Jacob_Unroll = 40
# Target number of species to place in each Jacobian species update subfile
Jacob_Spec_Unroll = 40
# Limit on the number of reactions in each reaction rate subfile
Rates_Unroll = 250
# Target line limit for each Jacobian reaction update subfile
Max_Lines = 10000
# Target line limit for each Jacobian species update subfile
Max_Spec_Lines = 5000
def get_L1_size(L1_Preferred):
    """
    Returns the size (in number of doubles) of the L1 cache for sm_20

    Parameters
    ----------
    L1_Preferred : bool
        If true, prefer a larger L1 cache over more shared memory (recommended)

    Returns
    -------
    int
        The L1 cache size as a whole number of doubles: 6144 when
        `L1_Preferred` is truthy, 2048 otherwise
    """
    l1_size = 49152 if L1_Preferred else 16384
    # Floor division keeps the result an integer count; on Python 3 a plain
    # `/` would return a float such as 6144.0.
    return l1_size // 8  # doubles
def get_shared_size(L1_Preferred):
    """
    Returns the size (in number of doubles) of shared memory for sm_20

    Parameters
    ----------
    L1_Preferred : bool
        If true, prefer a larger L1 cache over more shared memory (recommended)

    Returns
    -------
    int
        The shared memory size as a whole number of doubles: 2048 when
        `L1_Preferred` is truthy, 6144 otherwise
    """
    # Shared memory gets the larger portion only when L1 is NOT preferred.
    shared_size = 16384 if L1_Preferred else 49152
    # Floor division keeps the result an integer count; on Python 3 a plain
    # `/` would return a float such as 2048.0.
    return shared_size // 8  # doubles
def get_register_count(num_blocks, num_threads):
    """
    Returns the number of registers available per thread for sm_20

    The value is 32768 divided by `num_blocks` and then by `num_threads`
    (rounding down), clamped to the range [1, 63].

    Parameters
    ----------
    num_blocks : int
        The number of blocks to target per kernel launch
    num_threads : int
        The number of threads to target per kernel launch

    Returns
    -------
    int
        The register count, always between 1 and 63 inclusive
    """
    # Floor division restores integer semantics on Python 3; true division
    # would produce floats such as 32.0 (or fractional counts), which are
    # not valid register counts when written out as text.
    registers = (32768 // num_blocks) // num_threads
    return max(min(registers, 63), 1)
def write_launch_bounds(builddir, blocks_per_sm=8, num_threads=64,
                        L1_PREFERRED=True, no_shared=False
                        ):
    """Creates the launch_bounds.cuh file that may be included by CUDA solvers

    A second file named ``regcount``, holding the result of
    `get_register_count(blocks_per_sm, num_threads)` as plain text, is
    written to the same directory.

    Parameters
    ----------
    builddir : str
        The directory to place the source file in
    blocks_per_sm : int, optional
        The number of blocks to target per kernel launch
    num_threads : int, optional
        The number of threads per block in the per kernel launch
    L1_PREFERRED : bool, optional
        If true, prefer a larger L1 cache over more shared memory (recommended)
    no_shared : bool, optional
        If true, turn off shared memory (SHARED_SIZE is emitted as zero)

    Returns
    -------
    None
    """
    if no_shared:
        shared_per_block = 0
    else:
        # Split the available shared memory evenly between the targeted
        # blocks, rounding down to a whole number of doubles.
        shared_per_block = int(
            floor(get_shared_size(L1_PREFERRED) / blocks_per_sm))

    if L1_PREFERRED:
        cache_config = '//Large L1 cache active\n#define PREFERL1\n'
    else:
        cache_config = '//Large shared memory active\n'

    header_text = ''.join([
        '#ifndef LAUNCH_BOUNDS_CUH\n',
        '#define LAUNCH_BOUNDS_CUH\n',
        '#define TARGET_BLOCK_SIZE ({})\n'.format(num_threads),
        '#define TARGET_BLOCKS ({})\n'.format(blocks_per_sm),
        '' if no_shared else '//shared memory active\n',
        '#define SHARED_SIZE ({}'.format(shared_per_block),
        ' * sizeof(double))\n',
        cache_config,
        '#endif\n',
    ])
    with open(os.path.join(builddir, 'launch_bounds.cuh'), 'w') as header:
        header.write(header_text)

    # Coerce to int so the file never contains float text such as "32.0",
    # which true division in the helper can produce on Python 3.
    reg_count = int(get_register_count(blocks_per_sm, num_threads))
    with open(os.path.join(builddir, 'regcount'), 'w') as regfile:
        regfile.write('{}'.format(reg_count))