DiFfRG
Loading...
Searching...
No Matches
integrator_2Dpx0_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
27 template <typename ctype, typename NT, typename KERNEL, typename... T>
28 __global__ void gridreduce_2dpx0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
29 const ctype *ang_quadrature_p, const ctype *ang_quadrature_w,
30 const ctype *x0_quadrature_p, const ctype *x0_quadrature_w, const ctype x_extent,
31 const ctype x0_extent, const ctype k, T... t)
32 {
33 uint len_x = gridDim.x * blockDim.x;
34 uint len_y = gridDim.y * blockDim.y;
35 uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
36 uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
37 uint idx_z = (blockIdx.z * blockDim.z) + threadIdx.z;
38 uint idx = idx_z * len_x * len_y + idx_y * len_x + idx_x;
39
40 const ctype cos = 2 * (ang_quadrature_p[idx_y] - (ctype)0.5);
41 const ctype q0 = k * x0_quadrature_p[idx_z] * x0_extent;
42 const ctype weight =
43 x0_quadrature_w[idx_z] * x0_extent * 2 * ang_quadrature_w[idx_y] * x_quadrature_w[idx_x] * x_extent;
44 const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
45 const ctype S_2d = 2 * (ctype)M_PI;
46 const ctype int_element = (1 / (ctype)2 * powr<2>(k)) // x = p^2 / k^2 integral
47 * (S_2d) * (1 / (ctype)2) // S_2 and divide the cos integral out
48 * (k) // x0 = q0 / k integral
49 / powr<3>(2 * (ctype)M_PI); // fourier factor
50
51 NT res = KERNEL::kernel(q, cos, q0, k, t...) + KERNEL::kernel(q, cos, -q0, k, t...);
52
53 dest[idx] = int_element * res * weight;
54 }
55
66 template <typename NT, typename KERNEL> class Integrator2Dpx0GPU
67 {
68 public:
72 using ctype = typename get_type::ctype<NT>;
73
83 Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
84 const ctype x_extent, const ctype x0_extent, const JSONValue &json)
85 : Integrator2Dpx0GPU(quadrature_provider, grid_sizes, x_extent, x0_extent,
86 json.get_uint("/integration/cudathreadsperblock"))
87 {
88 }
89
99 Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes, const ctype x_extent,
100 const ctype x0_extent, const uint max_block_size = 256)
102 x0_extent(x0_extent), pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
103 {
104 ptr_x_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[0]);
105 ptr_x_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[0]);
106 ptr_ang_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[1]);
107 ptr_ang_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[1]);
108 ptr_x0_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[2]);
109 ptr_x0_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[2]);
110
111 block_sizes = {max_block_size, max_block_size, max_block_size};
112 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
113 // are as close to each other as possible
114 uint optimize_dim = 2;
115 while (block_sizes[0] * block_sizes[1] * block_sizes[2] > max_block_size || block_sizes[0] > grid_sizes[0] ||
116 block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
117 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
118 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
119 block_sizes[optimize_dim]--;
120 optimize_dim = (optimize_dim + 2) % 3;
121 }
122
123 uint blocks1 = grid_sizes[0] / block_sizes[0];
124 uint threads1 = block_sizes[0];
125 uint blocks2 = grid_sizes[1] / block_sizes[1];
126 uint threads2 = block_sizes[1];
127 uint blocks3 = grid_sizes[2] / block_sizes[2];
128 uint threads3 = block_sizes[2];
129
130 num_blocks = dim3(blocks1, blocks2, blocks3);
131 threads_per_block = dim3(threads1, threads2, threads3);
132 }
133
146
157 template <typename... T> NT get(const ctype k, const T &...t) const
158 {
159 const auto cuda_stream = cuda_stream_pool.get_stream();
160 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
164 check_cuda();
165 return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
166 device_data.end(), NT(0.), thrust::plus<NT>());
167 }
168
179 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
180 {
181 const auto cuda_stream = cuda_stream_pool.get_stream();
182 std::shared_ptr<rmm::device_uvector<NT>> device_data =
183 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
187 check_cuda();
188 const NT constant = KERNEL::constant(k, t...);
189
190 return std::async(std::launch::deferred, [=, this]() {
191 return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
192 (*device_data).end(), NT(0.), thrust::plus<NT>());
193 });
194 }
195
196 private:
197 const std::array<uint, 3> grid_sizes;
198 std::array<uint, 3> block_sizes;
199
201
208
211
214
215 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
216 mutable PoolMR pool;
217 const rmm::cuda_stream_pool cuda_stream_pool;
218 };
219} // namespace DiFfRG
220
221#else
222
223#ifdef USE_CUDA
224
225#else
226
228
229namespace DiFfRG
230{
231 template <typename NT, typename KERNEL> class Integrator2Dpx0GPU : public Integrator2Dpx0TBB<NT, KERNEL>
232 {
233 public:
234 using ctype = typename get_type::ctype<NT>;
235
236 Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes, const ctype x_extent,
237 const ctype x0_extent, const uint max_block_size = 256)
238 : Integrator2Dpx0TBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, x0_extent)
239 {
240 }
241
242 Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
243 const ctype x_extent, const ctype x0_extent, const JSONValue &)
244 : Integrator2Dpx0TBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, x0_extent)
245 {
246 }
247 };
248} // namespace DiFfRG
249
250#endif
251
252#endif
Integrator for 2+1D integrals over p, x0 and an angle using the GPU. Calculates.
Definition integrator_2Dpx0_gpu.hh:67
Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 3 > grid_sizes, const ctype x_extent, const ctype x0_extent, const JSONValue &json)
Construct a new Integrator2Dpx0GPU object.
Definition integrator_2Dpx0_gpu.hh:83
NT get(const ctype k, const T &...t) const
Get the integral of the kernel.
Definition integrator_2Dpx0_gpu.hh:157
std::future< NT > request(const ctype k, const T &...t) const
Request a future for the integral of the kernel.
Definition integrator_2Dpx0_gpu.hh:179
const uint device_data_size
Definition integrator_2Dpx0_gpu.hh:200
const ctype x_extent
Definition integrator_2Dpx0_gpu.hh:209
Integrator2Dpx0GPU(const Integrator2Dpx0GPU &other)
Definition integrator_2Dpx0_gpu.hh:134
const ctype * ptr_ang_quadrature_w
Definition integrator_2Dpx0_gpu.hh:205
const ctype * ptr_ang_quadrature_p
Definition integrator_2Dpx0_gpu.hh:204
dim3 num_blocks
Definition integrator_2Dpx0_gpu.hh:212
std::array< uint, 3 > block_sizes
Definition integrator_2Dpx0_gpu.hh:198
dim3 threads_per_block
Definition integrator_2Dpx0_gpu.hh:213
const ctype * ptr_x_quadrature_p
Definition integrator_2Dpx0_gpu.hh:202
PoolMR pool
Definition integrator_2Dpx0_gpu.hh:216
const ctype * ptr_x_quadrature_w
Definition integrator_2Dpx0_gpu.hh:203
const std::array< uint, 3 > grid_sizes
Definition integrator_2Dpx0_gpu.hh:197
const ctype * ptr_x0_quadrature_w
Definition integrator_2Dpx0_gpu.hh:207
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition integrator_2Dpx0_gpu.hh:72
Integrator2Dpx0GPU(QuadratureProvider &quadrature_provider, std::array< uint, 3 > grid_sizes, const ctype x_extent, const ctype x0_extent, const uint max_block_size=256)
Construct a new Integrator2Dpx0GPU object.
Definition integrator_2Dpx0_gpu.hh:99
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_2Dpx0_gpu.hh:215
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_2Dpx0_gpu.hh:217
const ctype * ptr_x0_quadrature_p
Definition integrator_2Dpx0_gpu.hh:206
const ctype x0_extent
Definition integrator_2Dpx0_gpu.hh:210
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:211
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:198
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22
__global__ void gridreduce_2dpx0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *ang_quadrature_p, const ctype *ang_quadrature_w, const ctype *x0_quadrature_p, const ctype *x0_quadrature_w, const ctype x_extent, const ctype x0_extent, const ctype k, T... t)
GPU kernel for the integration of a function dependent on p, an angle cos1 and q0.
Definition integrator_2Dpx0_gpu.hh:28