DiFfRG
Loading...
Searching...
No Matches
integrator_3D_cartesian_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
20 template <typename ctype, typename NT, typename KERNEL, typename... T>
21 __global__ void gridreduce_3d_cartesian(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
22 const ctype *y_quadrature_p, const ctype *y_quadrature_w,
23 const ctype *z_quadrature_p, const ctype *z_quadrature_w, const ctype qx_min,
24 const ctype qy_min, const ctype qz_min, const ctype qx_extent,
25 const ctype qy_extent, const ctype qz_extent, const ctype k, T... t)
26 {
27 uint len_x = gridDim.x * blockDim.x;
28 uint len_y = gridDim.y * blockDim.y;
29 uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
30 uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
31 uint idx_z = (blockIdx.z * blockDim.z) + threadIdx.z;
32 uint idx = idx_z * len_x * len_y + idx_y * len_x + idx_x;
33
34 constexpr int d = 3;
35 constexpr ctype int_element = powr<-d>(2 * (ctype)M_PI); // fourier factor
36
37 const ctype qx = qx_min + qx_extent * x_quadrature_p[idx_x];
38 const ctype qy = qy_min + qy_extent * y_quadrature_p[idx_y];
39 const ctype qz = qz_min + qz_extent * z_quadrature_p[idx_z];
40 const ctype weight =
41 qz_extent * z_quadrature_w[idx_z] * qy_extent * y_quadrature_w[idx_y] * qx_extent * x_quadrature_w[idx_x];
42
43 NT res = KERNEL::kernel(qx, qy, qz, k, t...);
44
45 dest[idx] = int_element * res * weight;
46 }
47
48 template <typename NT, typename KERNEL> class Integrator3DCartesianGPU
49 {
50 public:
54 using ctype = typename get_type::ctype<NT>;
55
56 Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
57 const ctype x_extent, const JSONValue &json)
58 : Integrator3DCartesianGPU(quadrature_provider, grid_sizes, x_extent,
59 json.get_uint("/integration/cudathreadsperblock"),
60 json.get_double("/discretization/integration/qx_min", -M_PI),
61 json.get_double("/discretization/integration/qy_min", -M_PI),
62 json.get_double("/discretization/integration/qz_min", -M_PI),
63 json.get_double("/discretization/integration/qx_max", M_PI),
64 json.get_double("/discretization/integration/qy_max", M_PI),
65 json.get_double("/discretization/integration/qz_max", M_PI))
66 {
67 }
68
69 Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes,
70 const ctype x_extent, const uint max_block_size = 256, const ctype qx_min = -M_PI,
71 const ctype qy_min = -M_PI, const ctype qz_min = -M_PI, const ctype qx_max = M_PI,
72 const ctype qy_max = M_PI, const ctype qz_max = M_PI)
74 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
75 {
76 ptr_x_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[0]);
77 ptr_x_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[0]);
78 ptr_y_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[1]);
79 ptr_y_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[1]);
80 ptr_z_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[2]);
81 ptr_z_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[2]);
82
83 block_sizes = {max_block_size, max_block_size, max_block_size};
84 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
85 // are as close to each other as possible
86 uint optimize_dim = 2;
87 while (block_sizes[0] * block_sizes[1] * block_sizes[2] > max_block_size || block_sizes[0] > grid_sizes[0] ||
88 block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
89 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
90 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
91 block_sizes[optimize_dim]--;
92 optimize_dim = (optimize_dim + 2) % 3;
93 }
94
95 uint blocks1 = grid_sizes[0] / block_sizes[0];
96 uint threads1 = block_sizes[0];
97 uint blocks2 = grid_sizes[1] / block_sizes[1];
98 uint threads2 = block_sizes[1];
99 uint blocks3 = grid_sizes[2] / block_sizes[2];
100 uint threads3 = block_sizes[2];
101
102 num_blocks = dim3(blocks1, blocks2, blocks3);
103 threads_per_block = dim3(threads1, threads2, threads3);
104
105 this->qx_min = qx_min;
106 this->qy_min = qy_min;
107 this->qz_min = qz_min;
108 this->qx_extent = qx_max - qx_min;
109 this->qy_extent = qy_max - qy_min;
110 this->qz_extent = qz_max - qz_min;
111 }
112
131
136 {
137 this->qx_extent = this->qx_extent - qx_min + this->qx_min;
138 this->qx_min = qx_min;
139 }
140
145 {
146 this->qy_extent = this->qy_extent - qy_min + this->qy_min;
147 this->qy_min = qy_min;
148 }
149
154 {
155 this->qz_extent = this->qz_extent - qz_min + this->qz_min;
156 this->qz_min = qz_min;
157 }
158
162 void set_qx_max(const ctype qx_max) { this->qx_extent = qx_max - qx_min; }
163
167 void set_qy_max(const ctype qy_max) { this->qy_extent = qy_max - qy_min; }
168
172 void set_qz_max(const ctype qz_max) { this->qz_extent = qz_max - qz_min; }
173
174 template <typename... T> NT get(const ctype k, const T &...t) const
175 {
176 const auto cuda_stream = cuda_stream_pool.get_stream();
177 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
181 check_cuda();
182 return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
183 device_data.end(), NT(0.), thrust::plus<NT>());
184 }
185
196 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
197 {
198 const auto cuda_stream = cuda_stream_pool.get_stream();
199 std::shared_ptr<rmm::device_uvector<NT>> device_data =
200 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
204 check_cuda();
205 const NT constant = KERNEL::constant(k, t...);
206
207 return std::async(std::launch::deferred, [=, this]() {
208 return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
209 (*device_data).end(), NT(0.), thrust::plus<NT>());
210 });
211 }
212
213 private:
214 const std::array<uint, 3> grid_sizes;
215 std::array<uint, 3> block_sizes;
216
218
225
226 ctype qx_min = -M_PI;
227 ctype qy_min = -M_PI;
228 ctype qx_extent = 2 * M_PI;
229 ctype qy_extent = 2 * M_PI;
230 ctype qz_min = -M_PI;
231 ctype qz_extent = 2 * M_PI;
232
235
236 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
237 mutable PoolMR pool;
238 const rmm::cuda_stream_pool cuda_stream_pool;
239 };
240} // namespace DiFfRG
241
242#else
243
244#ifdef USE_CUDA
245
246#else
247
249
250namespace DiFfRG
251{
252 template <typename NT, typename KERNEL> class Integrator3DCartesianGPU : public Integrator3DCartesianTBB<NT, KERNEL>
253 {
254 public:
255 using ctype = typename get_type::ctype<NT>;
256
257 Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, std::array<uint, 3> grid_sizes,
258 const ctype x_extent, const uint max_block_size = 256, const ctype qx_min = -M_PI,
259 const ctype qy_min = -M_PI, const ctype qz_min = -M_PI, const ctype qx_max = M_PI,
260 const ctype qy_max = M_PI, const ctype qz_max = M_PI)
261 : Integrator3DCartesianTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, max_block_size, qx_min,
262 qy_min, qz_min, qx_max, qy_max, qz_max)
263 {
264 }
265
266 Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> grid_sizes,
267 const ctype x_extent, const JSONValue &json)
268 : Integrator3DCartesianTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, json)
269 {
270 }
271 };
272} // namespace DiFfRG
273
274#endif
275
276#endif
Definition integrator_3D_cartesian_gpu.hh:49
const ctype * ptr_z_quadrature_p
Definition integrator_3D_cartesian_gpu.hh:223
ctype qx_extent
Definition integrator_3D_cartesian_gpu.hh:228
void set_qz_min(const ctype qz_min)
Set the minimum value of the qz integration range.
Definition integrator_3D_cartesian_gpu.hh:153
const ctype * ptr_y_quadrature_w
Definition integrator_3D_cartesian_gpu.hh:222
const ctype * ptr_z_quadrature_w
Definition integrator_3D_cartesian_gpu.hh:224
ctype qx_min
Definition integrator_3D_cartesian_gpu.hh:226
void set_qx_min(const ctype qx_min)
Set the minimum value of the qx integration range.
Definition integrator_3D_cartesian_gpu.hh:135
Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array< uint, 3 > grid_sizes, const ctype x_extent, const JSONValue &json)
Definition integrator_3D_cartesian_gpu.hh:56
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition integrator_3D_cartesian_gpu.hh:54
Integrator3DCartesianGPU(QuadratureProvider &quadrature_provider, std::array< uint, 3 > grid_sizes, const ctype x_extent, const uint max_block_size=256, const ctype qx_min=-M_PI, const ctype qy_min=-M_PI, const ctype qz_min=-M_PI, const ctype qx_max=M_PI, const ctype qy_max=M_PI, const ctype qz_max=M_PI)
Definition integrator_3D_cartesian_gpu.hh:69
dim3 threads_per_block
Definition integrator_3D_cartesian_gpu.hh:234
NT get(const ctype k, const T &...t) const
Definition integrator_3D_cartesian_gpu.hh:174
void set_qy_min(const ctype qy_min)
Set the minimum value of the qy integration range.
Definition integrator_3D_cartesian_gpu.hh:144
const ctype * ptr_x_quadrature_w
Definition integrator_3D_cartesian_gpu.hh:220
ctype qz_min
Definition integrator_3D_cartesian_gpu.hh:230
void set_qx_max(const ctype qx_max)
Set the maximum value of the qx integration range.
Definition integrator_3D_cartesian_gpu.hh:162
const uint device_data_size
Definition integrator_3D_cartesian_gpu.hh:217
ctype qy_min
Definition integrator_3D_cartesian_gpu.hh:227
PoolMR pool
Definition integrator_3D_cartesian_gpu.hh:237
void set_qz_max(const ctype qz_max)
Set the maximum value of the qz integration range.
Definition integrator_3D_cartesian_gpu.hh:172
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_3D_cartesian_gpu.hh:236
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_3D_cartesian_gpu.hh:238
const ctype * ptr_x_quadrature_p
Definition integrator_3D_cartesian_gpu.hh:219
ctype qz_extent
Definition integrator_3D_cartesian_gpu.hh:231
void set_qy_max(const ctype qy_max)
Set the maximum value of the qy integration range.
Definition integrator_3D_cartesian_gpu.hh:167
const std::array< uint, 3 > grid_sizes
Definition integrator_3D_cartesian_gpu.hh:214
std::future< NT > request(const ctype k, const T &...t) const
Request a future for the integral of the kernel.
Definition integrator_3D_cartesian_gpu.hh:196
const ctype * ptr_y_quadrature_p
Definition integrator_3D_cartesian_gpu.hh:221
Integrator3DCartesianGPU(const Integrator3DCartesianGPU &other)
Definition integrator_3D_cartesian_gpu.hh:113
std::array< uint, 3 > block_sizes
Definition integrator_3D_cartesian_gpu.hh:215
ctype qy_extent
Definition integrator_3D_cartesian_gpu.hh:229
dim3 num_blocks
Definition integrator_3D_cartesian_gpu.hh:233
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:211
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:198
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
__global__ void gridreduce_3d_cartesian(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *y_quadrature_p, const ctype *y_quadrature_w, const ctype *z_quadrature_p, const ctype *z_quadrature_w, const ctype qx_min, const ctype qy_min, const ctype qz_min, const ctype qx_extent, const ctype qy_extent, const ctype qz_extent, const ctype k, T... t)
Definition integrator_3D_cartesian_gpu.hh:21
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22