#include <rmm/cuda_stream_pool.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <thrust/reduce.h>
template <typename ctype, int d, typename NT, typename KERNEL, typename... T>
__global__ void gridreduce_1d_finiteTx0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
                                        const ctype *x0_quadrature_p, const ctype *x0_quadrature_w,
                                        const ctype x_extent, const ctype x0_extent, const uint x0_summands,
                                        const ctype m_T, const ctype k, T... t)
{
  // One thread per (x, x0) quadrature node; idx is the flattened index into dest.
  uint len_x = gridDim.x * blockDim.x;
  uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
  uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
  uint idx = idx_y * len_x + idx_x;

  // Spatial momentum from the x = p^2/k^2 quadrature node.
  const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
  // ... (elided in the excerpt: the (d-1)-sphere surface constant S_dm1, cf. S_d_prec)
  NT res{};
  if (idx_y >= x0_summands) {
    // Continuum tail: q0 runs on a logarithmic grid from 2*pi*T*x0_summands up to k * x0_extent.
    // The 1e-3 shifts regularize the T = 0 case, where the lower bound would vanish.
    const ctype integral_start = (2 * x0_summands * (ctype)M_PI * m_T) / k;
    const ctype log_start = log(integral_start + (m_T == 0) * ctype(1e-3));
    const ctype log_ext = log(x0_extent / (integral_start + (m_T == 0) * ctype(1e-3)));

    const ctype q0 = k * (exp(log_start + log_ext * x0_quadrature_p[idx_y - x0_summands]) - (m_T == 0) * ctype(1e-3));

    const ctype int_element = S_dm1; // ... (elided: remaining phase-space factors)
    // log_ext * q0 / k is the Jacobian of the logarithmic substitution.
    const ctype weight = x_quadrature_w[idx_x] * x_extent * (x0_quadrature_w[idx_y - x0_summands] * log_ext * q0 / k);

    res = int_element * weight * (KERNEL::kernel(q, q0, k, t...) + KERNEL::kernel(q, -q0, k, t...));
  } else {
    // Discrete Matsubara frequencies q0 = 2*pi*T*n for n = idx_y < x0_summands; n = 0 is counted once.
    const ctype q0 = 2 * (ctype)M_PI * m_T * idx_y;
    const ctype int_element = m_T * S_dm1; // ... (elided: remaining phase-space factors)
    const ctype weight = x_quadrature_w[idx_x] * x_extent;
    res = int_element * weight *
          (idx_y == 0 ? KERNEL::kernel(q, (ctype)0, k, t...)
                      : KERNEL::kernel(q, q0, k, t...) + KERNEL::kernel(q, -q0, k, t...));
  }

  dest[idx] = res;
}
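Schematically, and up to the phase-space factors elided from int_element, each x column of the grid assembles the standard finite-temperature split of the q0 integration into a discrete Matsubara sum plus a logarithmically discretized continuum tail. With N standing for x0_summands and f(q, q0) for KERNEL::kernel(q, q0, k, t...):

\[
  T \sum_{n=-(N-1)}^{N-1} f(q,\, 2\pi T n)
  \;+\; \int_{2\pi T N}^{k\, x_{0,\mathrm{ext}}} \mathrm{d}q_0 \, \big[ f(q, q_0) + f(q, -q_0) \big],
\]

where x_{0,ext} is x0_extent and q = k * sqrt(x * x_extent) at the spatial quadrature nodes; the tail is integrated in the logarithmic variable introduced via log_start and log_ext.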
template <int d, typename NT, typename KERNEL> class IntegratorFiniteTx0GPU
{
public:
  using ctype = typename get_type::ctype<NT>;

  IntegratorFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 2> grid_sizes,
                         const ctype x_extent, const ctype x0_extent, const uint x0_summands, const JSONValue &json)
      : IntegratorFiniteTx0GPU(quadrature_provider, grid_sizes, x_extent, x0_extent, x0_summands,
                               json.get_double("/physical/T"), json.get_uint("/integration/cudathreadsperblock"))
  {
  }
  IntegratorFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 2> _grid_sizes,
                         const ctype x_extent, const ctype x0_extent, const uint _x0_summands, const ctype T,
                         const uint max_block_size = 256)
      : quadrature_provider(quadrature_provider), grid_sizes(_grid_sizes),
        device_data_size(grid_sizes[0] * grid_sizes[1]), x_extent(x_extent), x0_extent(x0_extent),
        original_x0_summands(_x0_summands),
        pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
  {
    ptr_x_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[0]);
    ptr_x_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[0]);
    // ... (elided; presumably the initial set_T(T) call, since m_T is not in the initializer list)
    // Start from the largest block and shrink until the block fits max_block_size
    // and evenly divides the grid in both dimensions, alternating the dimension to shrink.
    block_sizes = {max_block_size, max_block_size};
    uint optimize_dim = 1;
    while (block_sizes[0] * block_sizes[1] > max_block_size || block_sizes[0] > grid_sizes[0] ||
           block_sizes[1] > grid_sizes[1]) {
      if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
      while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
        block_sizes[optimize_dim]--;
      optimize_dim = (optimize_dim + 1) % 2;
    }

    uint blocks1 = grid_sizes[0] / block_sizes[0];
    uint threads1 = block_sizes[0];
    uint blocks2 = grid_sizes[1] / block_sizes[1];
    uint threads2 = block_sizes[1];

    num_blocks = dim3(blocks1, blocks2);
    threads_per_block = dim3(threads1, threads2);
  }
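The divide-evenly search can be traced in isolation. The following is a standalone host-side sketch (hypothetical main, illustrative sizes), not library code; for grid_sizes = {64, 32} and max_block_size = 256 it settles on 32 x 8 = 256 threads per block, i.e. a 2 x 4 grid of blocks:

#include <array>
#include <cstdio>
using uint = unsigned int;

int main()
{
  const std::array<uint, 2> grid_sizes = {64, 32};
  const uint max_block_size = 256;
  std::array<uint, 2> block_sizes = {max_block_size, max_block_size};

  // Alternate between the two dimensions, shrinking each block length to the
  // next divisor of the grid length until the block fits max_block_size.
  uint optimize_dim = 1;
  while (block_sizes[0] * block_sizes[1] > max_block_size || block_sizes[0] > grid_sizes[0] ||
         block_sizes[1] > grid_sizes[1]) {
    if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
    while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
      block_sizes[optimize_dim]--;
    optimize_dim = (optimize_dim + 1) % 2;
  }

  std::printf("threads per block: %u x %u, blocks: %u x %u\n", block_sizes[0], block_sizes[1],
              grid_sizes[0] / block_sizes[0], grid_sizes[1] / block_sizes[1]);
  return 0;
}

Requiring the block lengths to divide the grid exactly is what allows the kernel above to omit bounds checks on idx_x and idx_y.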
  void set_T(const ctype T)
  {
    // ... (elided: m_T is updated from T)
    x0_summands = original_x0_summands;
    ptr_x0_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[1] - x0_summands);
    ptr_x0_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[1] - x0_summands);
  }

  void set_x0_extent(const ctype val) { x0_extent = val; }
  IntegratorFiniteTx0GPU(const IntegratorFiniteTx0GPU &other)
      : quadrature_provider(other.quadrature_provider), grid_sizes(other.grid_sizes),
        device_data_size(other.device_data_size), ptr_x_quadrature_p(other.ptr_x_quadrature_p),
        ptr_x_quadrature_w(other.ptr_x_quadrature_w), ptr_x0_quadrature_p(other.ptr_x0_quadrature_p),
        ptr_x0_quadrature_w(other.ptr_x0_quadrature_w), x_extent(other.x_extent), x0_extent(other.x0_extent),
        original_x0_summands(other.original_x0_summands), x0_summands(other.x0_summands), m_T(other.m_T),
        pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
  {
    // ... (elided: the launch geometry, block_sizes, num_blocks and threads_per_block, is copied as well)
  }
  template <typename... T> NT get(const ctype k, const T &...t) const
  {
    const auto cuda_stream = cuda_stream_pool.get_stream();
    rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
    gridreduce_1d_finiteTx0<ctype, d, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
        device_data.data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_x0_quadrature_p, ptr_x0_quadrature_w,
        x_extent, x0_extent, x0_summands, m_T, k, t...);
    check_cuda();
    return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
                                                      device_data.end(), NT(0), thrust::plus<NT>());
  }
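With res as written per node by the kernel, the returned value is

\[ \mathrm{get}(k, t\ldots) = \mathrm{KERNEL{::}constant}(k, t\ldots) + \sum_{\mathrm{idx}} \mathrm{dest}[\mathrm{idx}], \]

and request below produces the same quantity, deferring only the reduction step.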
  template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
  {
    const auto cuda_stream = cuda_stream_pool.get_stream();
    std::shared_ptr<rmm::device_uvector<NT>> device_data =
        std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
    gridreduce_1d_finiteTx0<ctype, d, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
        (*device_data).data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_x0_quadrature_p, ptr_x0_quadrature_w,
        x_extent, x0_extent, x0_summands, m_T, k, t...);
    check_cuda();
    const NT constant = KERNEL::constant(k, t...);

    // The shared_ptr keeps the device buffer alive until the deferred reduction runs.
    return std::async(std::launch::deferred, [=, this]() {
      return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
                                       (*device_data).end(), NT(0), thrust::plus<NT>());
    });
  }
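For reference, a minimal usage sketch. LorentzianKernel and all numeric values are hypothetical stand-ins; the only interface the integrator actually requires of KERNEL is the pair of statics invoked above:

// Hypothetical kernel type modelling the KERNEL::kernel / KERNEL::constant interface.
struct LorentzianKernel {
  static __forceinline__ __host__ __device__ double kernel(const double q, const double q0, const double k)
  {
    return 1. / (q * q + q0 * q0 + k * k); // integrand at spatial momentum q and frequency q0
  }
  static double constant(const double /*k*/) { return 0.; } // additive constant outside the integral
};

// Assuming an existing QuadratureProvider instance named quadrature_provider:
// IntegratorFiniteTx0GPU<4, double, LorentzianKernel> integrator(
//     quadrature_provider, {64, 32}, /*x_extent=*/10., /*x0_extent=*/10.,
//     /*x0_summands=*/8, /*T=*/0.1);
// const double value = integrator.get(/*k=*/1.);        // synchronous
// std::future<double> f = integrator.request(/*k=*/1.); // reduction deferred until f.get()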
private:
  QuadratureProvider &quadrature_provider;

  const std::array<uint, 2> grid_sizes;
  std::array<uint, 2> block_sizes;

  const uint device_data_size;

  const ctype *ptr_x_quadrature_p;
  const ctype *ptr_x_quadrature_w;
  const ctype *ptr_x0_quadrature_p;
  const ctype *ptr_x0_quadrature_w;

  const ctype x_extent;
  ctype x0_extent;
  const uint original_x0_summands;
  uint x0_summands;
  ctype m_T;

  dim3 num_blocks;
  dim3 threads_per_block;

  using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
  mutable PoolMR pool; // mutable: get() and request() are const but allocate from the pool
  const rmm::cuda_stream_pool cuda_stream_pool;
};
// ... (elided: preprocessor branch; when CUDA is unavailable the GPU integrator
//      falls back to the TBB implementation)

template <int d, typename NT, typename KERNEL> class IntegratorFiniteTx0GPU;

template <int d, typename NT, typename KERNEL>
class IntegratorFiniteTx0GPU : public IntegratorFiniteTx0TBB<d, NT, KERNEL>
{
  // ... (elided: constructors mirroring the GPU interface, whose signatures again
  //      end in const uint max_block_size = 256)
};
Referenced symbols:

- JSONValue (json.hh:19): a wrapper around the boost JSON value class.
- QuadratureProvider (quadrature_provider.hh:139): provides quadrature points and weights, in host and device memory.
- QuadratureProvider::get_device_points(const size_t order, const int device = 0, const QuadratureType type = QuadratureType::legendre) (quadrature_provider.hh:198): returns the device-side quadrature points for a quadrature of the given order.
- get_type::ctype (types.hh:106): typename internal::_ctype<CT>::value, the scalar type underlying NT.
- powr (math.hh:45): a compile-time evaluatable power function for whole-number exponents; constexpr, __host__ __device__.
- is_close (math.hh:160): tests whether two floats are equal to numerical precision, both relative and absolute.
- S_d_prec (math.hh:104): consteval surface of a d-dimensional sphere (precompiled).
- check_cuda(std::string prefix = ""): checks whether a CUDA error occurred and prints an error message if it did.
- uint (utils.hh:22): unsigned int.