DiFfRG
Loading...
Searching...
No Matches
integrator_2D_cartesian_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
27 template <typename ctype, typename NT, typename KERNEL, typename... T>
28 __global__ void gridreduce_2d_cartesian(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
29 const ctype *y_quadrature_p, const ctype *y_quadrature_w, const ctype qx_min,
30 const ctype qy_min, const ctype qx_extent, const ctype qy_extent,
31 const ctype k, T... t)
32 {
33 uint len_x = gridDim.x * blockDim.x;
34 uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
35 uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
36 uint idx = idx_y * len_x + idx_x;
37
38 constexpr int d = 2;
39 constexpr ctype int_element = powr<-d>(2 * (ctype)M_PI); // fourier factor
40
41 const ctype qx = qx_min + qx_extent * x_quadrature_p[idx_x];
42 const ctype qy = qy_min + qy_extent * y_quadrature_p[idx_y];
43 const ctype weight = qy_extent * y_quadrature_w[idx_y] * qx_extent * x_quadrature_w[idx_x];
44
45 dest[idx] = int_element * weight * KERNEL::kernel(qx, qy, k, t...);
46 }
47
54 template <typename NT, typename KERNEL> class Integrator2DCartesianGPU
55 {
56 public:
60 using ctype = typename get_type::ctype<NT>;
61
70 Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 2> grid_sizes,
71 const ctype x_extent, const JSONValue &json)
72 : Integrator2DCartesianGPU(quadrature_provider, grid_sizes, x_extent,
73 json.get_uint("/integration/cudathreadsperblock"))
74 {
75 }
76
88 Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, std::array<uint, 2> grid_sizes,
89 const ctype x_extent = 0., const uint max_block_size = 256, const ctype qx_min = -M_PI,
90 const ctype qy_min = -M_PI, const ctype qx_max = M_PI, const ctype qy_max = M_PI)
92 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
93 {
94 ptr_x_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[0]);
95 ptr_x_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[0]);
96 ptr_y_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[1]);
97 ptr_y_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[1]);
98
99 block_sizes = {max_block_size, max_block_size};
100 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
101 // are as close to each other as possible
102 uint optimize_dim = 1;
103 while (block_sizes[0] * block_sizes[1] > max_block_size || block_sizes[0] > grid_sizes[0] ||
104 block_sizes[1] > grid_sizes[1]) {
105 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
106 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
107 block_sizes[optimize_dim]--;
108 optimize_dim = (optimize_dim + 1) % 2;
109 }
110
111 uint blocks1 = grid_sizes[0] / block_sizes[0];
112 uint threads1 = block_sizes[0];
113 uint blocks2 = grid_sizes[1] / block_sizes[1];
114 uint threads2 = block_sizes[1];
115
116 num_blocks = dim3(blocks1, blocks2);
117 threads_per_block = dim3(threads1, threads2);
118
119 this->qx_min = qx_min;
120 this->qy_min = qy_min;
121 this->qx_extent = qx_max - qx_min;
122 this->qy_extent = qy_max - qy_min;
123 }
124
134 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
135 {
137 block_sizes = other.block_sizes;
138 num_blocks = other.num_blocks;
140
141 qx_min = other.qx_min;
142 qy_min = other.qy_min;
143 qx_extent = other.qx_extent;
144 qy_extent = other.qy_extent;
145 }
146
151 {
152 this->qx_extent = this->qx_extent - qx_min + this->qx_min;
153 this->qx_min = qx_min;
154 }
155
160 {
161 this->qy_extent = this->qy_extent - qy_min + this->qy_min;
162 this->qy_min = qy_min;
163 }
164
168 void set_qx_max(const ctype qx_max) { this->qx_extent = qx_max - qx_min; }
169
173 void set_qy_max(const ctype qy_max) { this->qy_extent = qy_max - qy_min; }
174
182 template <typename... T> NT get(const ctype k, const T &...t) const
183 {
184 const auto cuda_stream = cuda_stream_pool.get_stream();
185 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream.value(), &pool);
188 qy_min, qx_extent, qy_extent, k, t...);
189 check_cuda();
190 return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
191 device_data.end(), NT(0.), thrust::plus<NT>());
192 }
193
201 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
202 {
203 const auto cuda_stream = cuda_stream_pool.get_stream();
204 std::shared_ptr<rmm::device_uvector<NT>> device_data =
205 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
208 qy_min, qx_extent, qy_extent, k, t...);
209 check_cuda();
210 const NT constant = KERNEL::constant(k, t...);
211
212 return std::async(std::launch::deferred, [=, this]() {
213 return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
214 (*device_data).end(), NT(0.), thrust::plus<NT>());
215 });
216 }
217
218 private:
219 const std::array<uint, 2> grid_sizes;
220 std::array<uint, 2> block_sizes;
221
223
224 ctype qx_min = -M_PI;
225 ctype qy_min = -M_PI;
226 ctype qx_extent = 2 * M_PI;
227 ctype qy_extent = 2 * M_PI;
228
233
236
237 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
238 mutable PoolMR pool;
239 const rmm::cuda_stream_pool cuda_stream_pool;
240 };
241} // namespace DiFfRG
242
243#else
244
245#ifdef USE_CUDA
246
247namespace DiFfRG
248{
249 template <typename NT, typename KERNEL> class Integrator2DCartesianGPU;
250}
251
252#else
253
255
256namespace DiFfRG
257{
258 template <utypename NT, typename KERNEL> class Integrator2DCartesianGPU : public Integrator2DCartesianTBB<NT, KERNEL>
259 {
260 public:
261 using ctype = typename get_type::ctype<NT>;
262
263 Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, std::array<uint, 2> grid_sizes,
264 const ctype x_extent, const uint max_block_size = 256, const ctype qx_min = -M_PI,
265 const ctype qy_min = -M_PI, const ctype qx_max = M_PI, const ctype qy_max = M_PI)
266 : Integrator2DCartesianTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, max_block_size, qx_min,
267 qy_min, qx_max, qy_max)
268 {
269 }
270
271 Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array<uint, 2> grid_sizes,
272 const ctype x_extent, const JSONValue &json)
273 : Integrator2DCartesianTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, json)
274 {
275 }
276 };
277} // namespace DiFfRG
278
279#endif
280
281#endif
Integration of an arbitrary 2D function from (qx_min, qy_min) to (qx_max, qy_max) using TBB.
Definition integrator_2D_cartesian_gpu.hh:55
const std::array< uint, 2 > grid_sizes
Definition integrator_2D_cartesian_gpu.hh:219
void set_qy_max(const ctype qy_max)
Set the maximum value of the qy integration range.
Definition integrator_2D_cartesian_gpu.hh:173
const ctype * ptr_y_quadrature_w
Definition integrator_2D_cartesian_gpu.hh:232
ctype qx_min
Definition integrator_2D_cartesian_gpu.hh:224
dim3 num_blocks
Definition integrator_2D_cartesian_gpu.hh:234
const uint device_data_size
Definition integrator_2D_cartesian_gpu.hh:222
void set_qx_min(const ctype qx_min)
Set the minimum value of the qx integration range.
Definition integrator_2D_cartesian_gpu.hh:150
const ctype * ptr_x_quadrature_p
Definition integrator_2D_cartesian_gpu.hh:229
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_2D_cartesian_gpu.hh:237
Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, const std::array< uint, 2 > grid_sizes, const ctype x_extent, const JSONValue &json)
Construct a new Integrator2DCartesianGPU object.
Definition integrator_2D_cartesian_gpu.hh:70
const ctype * ptr_x_quadrature_w
Definition integrator_2D_cartesian_gpu.hh:230
Integrator2DCartesianGPU(QuadratureProvider &quadrature_provider, std::array< uint, 2 > grid_sizes, const ctype x_extent=0., const uint max_block_size=256, const ctype qx_min=-M_PI, const ctype qy_min=-M_PI, const ctype qx_max=M_PI, const ctype qy_max=M_PI)
Construct a new Integrator2DCartesianGPU object.
Definition integrator_2D_cartesian_gpu.hh:88
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_2D_cartesian_gpu.hh:239
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition integrator_2D_cartesian_gpu.hh:60
Integrator2DCartesianGPU(const Integrator2DCartesianGPU &other)
Copy a Integrator2DCartesianGPU object.
Definition integrator_2D_cartesian_gpu.hh:130
dim3 threads_per_block
Definition integrator_2D_cartesian_gpu.hh:235
ctype qy_min
Definition integrator_2D_cartesian_gpu.hh:225
const ctype * ptr_y_quadrature_p
Definition integrator_2D_cartesian_gpu.hh:231
std::future< NT > request(const ctype k, const T &...t) const
Get the result of the integration asynchronously.
Definition integrator_2D_cartesian_gpu.hh:201
NT get(const ctype k, const T &...t) const
Get the result of the integration.
Definition integrator_2D_cartesian_gpu.hh:182
ctype qy_extent
Definition integrator_2D_cartesian_gpu.hh:227
void set_qx_max(const ctype qx_max)
Set the maximum value of the qx integration range.
Definition integrator_2D_cartesian_gpu.hh:168
PoolMR pool
Definition integrator_2D_cartesian_gpu.hh:238
std::array< uint, 2 > block_sizes
Definition integrator_2D_cartesian_gpu.hh:220
void set_qy_min(const ctype qy_min)
Set the minimum value of the qy integration range.
Definition integrator_2D_cartesian_gpu.hh:159
ctype qx_extent
Definition integrator_2D_cartesian_gpu.hh:226
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:211
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:198
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
__global__ void gridreduce_2d_cartesian(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *y_quadrature_p, const ctype *y_quadrature_w, const ctype qx_min, const ctype qy_min, const ctype qx_extent, const ctype qy_extent, const ctype k, T... t)
GPU kernel for the integration of an arbitrary 2D function from qx_min to qx_max and qy_min to qy_max...
Definition integrator_2D_cartesian_gpu.hh:28
unsigned int uint
Definition utils.hh:22