DiFfRG
integrator_4D_2ang_gpu.hh
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
20 template <typename ctype, typename NT, typename KERNEL, typename... T>
21 __global__ void gridreduce_4d_2ang(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
22 const ctype *ang_quadrature_p, const ctype *ang_quadrature_w, const ctype x_extent,
23 const ctype k, T... t)
24 {
25 const uint len_x = gridDim.x * blockDim.x;
26 const uint len_y = gridDim.y * blockDim.y;
27 const uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
28 const uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
29 const uint idx_z = (blockIdx.z * blockDim.z) + threadIdx.z;
30 const uint idx = idx_z * len_x * len_y + idx_y * len_x + idx_x;
31
32 const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
33 const ctype cos1 = 2 * (ang_quadrature_p[idx_y] - (ctype)0.5);
34 const ctype int_element = (powr<2>(q) * (ctype)0.5 * powr<2>(k)) // x = p^2 / k^2 integral
35 * sqrt(1. - powr<2>(cos1)) // cos1 integral jacobian
36 * 2 * (ctype)M_PI // evaluated phi integral
37 / powr<4>(2 * (ctype)M_PI); // fourier factor
38 const ctype weight = 2 * ang_quadrature_w[idx_y] // cos1 weight
39 * x_quadrature_w[idx_x] * x_extent // x weight
40 * 2 * ang_quadrature_w[idx_z]; // cos2 weight
41 const ctype cos2 = 2 * (ang_quadrature_p[idx_z] - (ctype)0.5);
42
43 dest[idx] = KERNEL::kernel(q, cos1, cos2, k, t...) // kernel
44 * int_element * weight; // weight and integration element
45 }
46
58 template <typename NT, typename KERNEL> class Integrator4D2AngGPU
59 {
60 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
61
62 public:
66 using ctype = typename get_type::ctype<NT>;
67
68    Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
69                        const ctype x_extent, const JSONValue &json)
70        : Integrator4D2AngGPU(quadrature_provider, grid_sizes, x_extent,
71                              json.get_uint("/integration/cudathreadsperblock"))
72    {
73    }
74
75    Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes, const ctype x_extent,
76                        const uint max_block_size = 256)
77        : quadrature_provider(quadrature_provider), grid_sizes(grid_sizes),
78          device_data_size(grid_sizes[0] * grid_sizes[1] * grid_sizes[2]), x_extent(x_extent), evaluations(0)
79 {
80 cudaGetDeviceCount(&n_devices);
81 if (n_devices == 0) throw std::runtime_error("No CUDA devices found!");
82
83 for (int device = 0; device < n_devices; ++device) {
84 const rmm::cuda_device_id device_id(device);
85 pool.emplace_back(
86 std::make_shared<PoolMR>(rmm::mr::get_per_device_resource(device_id), (device_data_size / 256 + 1) * 256));
87 }
88
89 if (grid_sizes[2] != grid_sizes[1])
90 throw std::runtime_error("Grid sizes must be currently equal for all angular dimensions!");
91
92 block_sizes = {max_block_size, max_block_size, max_block_size};
93 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
94 // are as close to each other as possible
95 uint optimize_dim = 2;
96 while (block_sizes[0] * block_sizes[1] * block_sizes[2] > max_block_size || block_sizes[0] > grid_sizes[0] ||
97 block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
98 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
99 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
100 block_sizes[optimize_dim]--;
101 optimize_dim = (optimize_dim + 2) % 3;
102 }
103
104 uint blocks1 = grid_sizes[0] / block_sizes[0];
105 uint threads1 = block_sizes[0];
106 uint blocks2 = grid_sizes[1] / block_sizes[1];
107 uint threads2 = block_sizes[1];
108 uint blocks3 = grid_sizes[2] / block_sizes[2];
109 uint threads3 = block_sizes[2];
110
111 num_blocks = dim3(blocks1, blocks2, blocks3);
112 threads_per_block = dim3(threads1, threads2, threads3);
113
114 cudaSetDevice(0);
115 }
116
127
138 template <typename... T> NT get(const ctype k, const T &...t) const
139 {
140 const int m_device = evaluations++ % n_devices;
141 cudaSetDevice(m_device);
142
147
148 const auto cuda_stream = cuda_stream_pool.get_stream();
149 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, pool[m_device].get());
150      gridreduce_4d_2ang<ctype, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
151          device_data.data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_ang_quadrature_p, ptr_ang_quadrature_w,
152          x_extent, k, t...);
153 check_cuda();
154      const NT result = KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()),
155                                                                   device_data.begin(), device_data.end(), NT(0.), thrust::plus<NT>());
156      cudaSetDevice(0);
157      return result;
158 }
159
170 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
171 {
172 const int m_device = evaluations++ % n_devices;
173 cudaSetDevice(m_device);
174
179
180 const auto cuda_stream = cuda_stream_pool.get_stream();
181 std::shared_ptr<rmm::device_uvector<NT>> device_data =
182 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, pool[m_device].get());
183      gridreduce_4d_2ang<ctype, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
184          (*device_data).data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_ang_quadrature_p, ptr_ang_quadrature_w,
185          x_extent, k, t...);
186 check_cuda();
187 const NT constant = KERNEL::constant(k, t...);
188
189      cudaSetDevice(0);
190
191      return std::async(std::launch::deferred, [=, this]() {
192        cudaSetDevice(m_device);
193
194        return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
195                                         (*device_data).end(), NT(0.), thrust::plus<NT>());
196      });
197 }
198
199  private:
200    QuadratureProvider &quadrature_provider;
201    const std::array<uint, 3> grid_sizes;
202    std::array<uint, 3> block_sizes;
203
204    const uint device_data_size;
205
206    const ctype *ptr_x_quadrature_p;
207    const ctype *ptr_x_quadrature_w;
208    const ctype *ptr_ang_quadrature_p;
209    const ctype *ptr_ang_quadrature_w;
210
211    const ctype x_extent;
212
213    dim3 num_blocks;
214    dim3 threads_per_block;
215
216    int n_devices;
217    mutable std::vector<std::shared_ptr<PoolMR>> pool;
218    const rmm::cuda_stream_pool cuda_stream_pool;
219
220    mutable std::atomic_ullong evaluations;
221 };
222} // namespace DiFfRG
223
224#else
225
226#ifdef USE_CUDA
227
228namespace DiFfRG
229{
230 template <typename NT, typename KERNEL> class Integrator4D2AngGPU;
231}
232
233#else
234
236
237namespace DiFfRG
238{
239 template <typename NT, typename KERNEL> class Integrator4D2AngGPU : public Integrator4D2AngTBB<NT, KERNEL>
240 {
241 public:
242 using ctype = typename get_type::ctype<NT>;
243
244 Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes, const ctype x_extent,
245 const uint max_block_size = 256)
246 : Integrator4D2AngTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent)
247 {
248 }
249
250 Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
251 const ctype x_extent, const JSONValue &)
252 : Integrator4D2AngTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent)
253 {
254 }
255 };
256} // namespace DiFfRG
257
258#endif
259
260#endif
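For orientation, the integration element and weights assembled in gridreduce_4d_2ang correspond, as the inline comments indicate, to a 4D momentum integral in which the trivial azimuthal angle is carried out analytically and the radial variable is substituted by x = q^2/k^2,
\[ \int \frac{d^4 q}{(2\pi)^4}\, f(q,\cos\theta_1,\cos\theta_2) = \frac{2\pi}{(2\pi)^4} \int_0^\infty dq\, q^3 \int_{-1}^{1} d\cos\theta_1\, \sqrt{1-\cos^2\theta_1} \int_{-1}^{1} d\cos\theta_2\, f \,. \]
With x = q^2/k^2 one has q^3 dq = (1/2) q^2 k^2 dx, which reproduces the int_element factor powr<2>(q) * 0.5 * powr<2>(k) * sqrt(1 - cos1^2) * 2π / (2π)^4 in the kernel. The weight collects the Gauss quadrature weights together with a factor x_extent for the length of the x interval and a factor 2 per angle, since the angular quadrature nodes are mapped from [0,1] to [-1,1].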
GPU integrator for the integration of a 4D function with two angles with CUDA. Calculates the integral of the kernel over the 4D momentum measure d^4q/(2π)^4, resolving two angles by quadrature and carrying out the remaining trivial angle analytically.
Definition integrator_4D_2ang_gpu.hh:59
const ctype x_extent
Definition integrator_4D_2ang_gpu.hh:211
const ctype * ptr_x_quadrature_p
Definition integrator_4D_2ang_gpu.hh:206
QuadratureProvider & quadrature_provider
Definition integrator_4D_2ang_gpu.hh:200
NT get(const ctype k, const T &...t) const
Get the integral of the kernel.
Definition integrator_4D_2ang_gpu.hh:138
Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, std::array< uint, 3 > grid_sizes, const ctype x_extent, const uint max_block_size=256)
Definition integrator_4D_2ang_gpu.hh:75
std::array< uint, 3 > block_sizes
Definition integrator_4D_2ang_gpu.hh:202
int n_devices
Definition integrator_4D_2ang_gpu.hh:216
Integrator4D2AngGPU(const Integrator4D2AngGPU &other)
Definition integrator_4D_2ang_gpu.hh:117
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_4D_2ang_gpu.hh:218
const ctype * ptr_x_quadrature_w
Definition integrator_4D_2ang_gpu.hh:207
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks, e.g. the argument or possible Jacobians.
Definition integrator_4D_2ang_gpu.hh:66
const ctype * ptr_ang_quadrature_w
Definition integrator_4D_2ang_gpu.hh:209
dim3 num_blocks
Definition integrator_4D_2ang_gpu.hh:213
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_4D_2ang_gpu.hh:60
dim3 threads_per_block
Definition integrator_4D_2ang_gpu.hh:214
const uint device_data_size
Definition integrator_4D_2ang_gpu.hh:204
Integrator4D2AngGPU(QuadratureProvider &quadrature_provider, const std::array< uint, 3 > grid_sizes, const ctype x_extent, const JSONValue &json)
Definition integrator_4D_2ang_gpu.hh:68
const std::array< uint, 3 > grid_sizes
Definition integrator_4D_2ang_gpu.hh:201
const ctype * ptr_ang_quadrature_p
Definition integrator_4D_2ang_gpu.hh:208
std::vector< std::shared_ptr< PoolMR > > pool
Definition integrator_4D_2ang_gpu.hh:217
std::atomic_ullong evaluations
Definition integrator_4D_2ang_gpu.hh:220
std::future< NT > request(const ctype k, const T &...t) const
Request a future for the integral of the kernel.
Definition integrator_4D_2ang_gpu.hh:170
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of the given order.
Definition quadrature_provider.hh:211
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of the given order.
Definition quadrature_provider.hh:198
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
__global__ void gridreduce_4d_2ang(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *ang_quadrature_p, const ctype *ang_quadrature_w, const ctype x_extent, const ctype k, T... t)
Definition integrator_4D_2ang_gpu.hh:21
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22
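As a rough usage sketch, a kernel type only needs to provide the static kernel() and constant() members used by the integrator above. The following is illustrative, not taken from the library: MyKernel, the grid sizes and the x_extent value are made up, nvcc compilation with the relevant DiFfRG headers is assumed, and constructing the QuadratureProvider is left to the caller.

#include <future>

// Hypothetical example kernel; only the static kernel()/constant() interface is taken
// from the integrator above, the integrand itself is an arbitrary illustration.
struct MyKernel {
  // evaluated on the device at every quadrature node (q, cos1, cos2)
  template <typename... T>
  static __device__ double kernel(const double q, const double cos1, const double cos2, const double k, T...)
  {
    return q * q / (q * q + k * k) * (1. + cos1 * cos2);
  }
  // added on the host to the reduced sum
  template <typename... T> static double constant(const double /*k*/, T...) { return 0.; }
};

void example(DiFfRG::QuadratureProvider &quadrature_provider)
{
  // 64 nodes in x = q^2/k^2 and 16 nodes per resolved angle; x_extent = 10 (illustrative values).
  // The two angular grid sizes must be equal, as enforced in the constructor.
  DiFfRG::Integrator4D2AngGPU<double, MyKernel> integrator(quadrature_provider, {{64, 16, 16}}, 10.);

  const double k = 1.;
  const double direct = integrator.get(k);               // blocking evaluation
  std::future<double> deferred = integrator.request(k);  // kernel launched now, reduction deferred
  const double later = deferred.get();
  (void)direct;
  (void)later;
}

Note that request() launches the reduction kernel immediately on the chosen device but defers the thrust::reduce until the future is evaluated, so several devices can be filled with work before any result is collected.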