DiFfRG
Loading...
Searching...
No Matches
integrator_4D_finiteTx0_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
20 template <typename ctype, typename NT, typename KERNEL, typename... T>
21 __global__ void gridreduce_4d_finiteTx0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
22 const ctype *ang_quadrature_p, const ctype *ang_quadrature_w,
23 const ctype *x0_quadrature_p, const ctype *x0_quadrature_w,
24 const ctype x_extent, const ctype x0_extent, const uint x0_quadrature_size,
25 const uint x0_summands, const ctype m_T, const ctype k, T... t)
26 {
27 constexpr uint d = 4;
28
29 uint len_x = gridDim.x * blockDim.x;
30 uint len_y = gridDim.y * blockDim.y;
31 uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
32 uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
33 uint idx_z = (blockIdx.z * blockDim.z) + threadIdx.z;
34 uint idx = idx_z * len_x * len_y + idx_y * len_x + idx_x;
35
36 const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
37 const ctype cos = 2 * (ang_quadrature_p[idx_y] - (ctype)0.5);
38 const ctype phi = 2 * (ctype)M_PI * ang_quadrature_p[idx_z];
39 const ctype weight =
40 2 * (ctype)M_PI * ang_quadrature_w[idx_z] * 2 * ang_quadrature_w[idx_y] * x_quadrature_w[idx_x] * x_extent;
41
42 NT res = 0;
43
44 // integral
45 const ctype int_element_int = (powr<d - 3>(q) / (ctype)2 * powr<2>(k)) // x = p^2 / k^2 integral
46 * (k) // x0 = q0 / k integral
47 / powr<d>(2 * (ctype)M_PI); // fourier factor
48
49 const ctype integral_start = (2 * x0_summands * (ctype)M_PI * m_T) / k;
50 const ctype log_start = log(integral_start + (m_T == 0) * ctype(1e-3));
51 const ctype log_ext = log(x0_extent / (integral_start + (m_T == 0) * ctype(1e-3)));
52
53 for (uint idx_0 = 0; idx_0 < x0_quadrature_size; ++idx_0) {
54 const ctype q0 = k * (exp(log_start + log_ext * x0_quadrature_p[idx_0]) - (m_T == 0) * ctype(1e-3));
55
56 const ctype m_weight = weight * (x0_quadrature_w[idx_0] * log_ext * q0 / k);
57
58 res += int_element_int * m_weight *
59 (KERNEL::kernel(q, cos, phi, q0, k, t...) + KERNEL::kernel(q, cos, phi, -q0, k, t...));
60 }
61
62 // sum
63 const ctype int_element_sum = m_T // solid nd angle
64 * (powr<d - 3>(q) / (ctype)2 * powr<2>(k)) // x = p^2 / k^2 integral
65 / powr<d - 1>(2 * (ctype)M_PI); // fourier factor
66 for (uint idx_0 = 0; idx_0 < x0_summands; ++idx_0) {
67 const ctype q0 = 2 * (ctype)M_PI * m_T * idx_0;
68 res += int_element_sum * weight *
69 (idx_0 == 0 ? KERNEL::kernel(q, cos, phi, (ctype)0, k, t...)
70 : KERNEL::kernel(q, cos, phi, q0, k, t...) + KERNEL::kernel(q, cos, phi, -q0, k, t...));
71 }
72
73 dest[idx] = res;
74 }
75
76 template <typename NT, typename KERNEL> class Integrator4DFiniteTx0GPU
77 {
78 public:
79 using ctype = typename get_type::ctype<NT>;
80
82 const ctype x_extent, const ctype x0_extent, const uint x0_summands, const JSONValue &json)
84 json.get_double("/physical/T"), json.get_uint("/integration/cudathreadsperblock"))
85 {
86 }
87
88 Integrator4DFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 4> _grid_sizes,
89 const ctype x_extent, const ctype x0_extent, const uint _x0_summands, const ctype T,
90 const uint max_block_size = 256)
92 grid_sizes({_grid_sizes[0], _grid_sizes[1], _grid_sizes[2], _grid_sizes[3] + _x0_summands}),
94 original_x0_summands(_x0_summands),
95 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256)
96 {
97 if (grid_sizes[2] != grid_sizes[1])
98 throw std::runtime_error("Grid sizes must be currently equal for all angular dimensions!");
99
104
105 set_T(T);
106
107 block_sizes = {max_block_size, max_block_size, max_block_size};
108 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
109 // are as close to each other as possible
110 uint optimize_dim = 2;
111 while (block_sizes[0] * block_sizes[1] * block_sizes[2] > max_block_size || block_sizes[0] > grid_sizes[0] ||
112 block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
113 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
114 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
115 block_sizes[optimize_dim]--;
116 optimize_dim = (optimize_dim + 2) % 3;
117 }
118
119 uint blocks1 = grid_sizes[0] / block_sizes[0];
120 uint threads1 = block_sizes[0];
121 uint blocks2 = grid_sizes[1] / block_sizes[1];
122 uint threads2 = block_sizes[1];
123 uint blocks3 = grid_sizes[2] / block_sizes[2];
124 uint threads3 = block_sizes[2];
125
126 num_blocks = dim3(blocks1, blocks2, blocks3);
127 threads_per_block = dim3(threads1, threads2, threads3);
128 }
129
140
141 void set_x0_extent(const ctype val) { x0_extent = val; }
142
156
157 template <typename... T> NT get(const ctype k, const T &...t) const
158 {
159 const auto cuda_stream = cuda_stream_pool.get_stream();
160 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
164 k, t...);
165 check_cuda();
166 return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
167 device_data.end(), NT(0.), thrust::plus<NT>());
168 }
169
170 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
171 {
172 const auto cuda_stream = cuda_stream_pool.get_stream();
173 std::shared_ptr<rmm::device_uvector<NT>> device_data =
174 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
178 k, t...);
179 check_cuda();
180 const NT constant = KERNEL::constant(k, t...);
181
182 return std::async(std::launch::deferred, [=, this]() {
183 return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
184 (*device_data).end(), NT(0.), thrust::plus<NT>());
185 });
186 }
187
188 private:
190
191 const std::array<uint, 4> grid_sizes;
192 std::array<uint, 3> block_sizes;
193
195
202
208
211
212 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
213 mutable PoolMR pool;
214 const rmm::cuda_stream_pool cuda_stream_pool;
215 };
216} // namespace DiFfRG
217
218#else
219
220#ifdef USE_CUDA
221
222namespace DiFfRG
223{
224 template <typename NT, typename KERNEL> class Integrator4DFiniteTx0GPU;
225}
226
227#else
228
230
231namespace DiFfRG
232{
233 template <typename NT, typename KERNEL> class Integrator4DFiniteTx0GPU : public Integrator4DFiniteTx0TBB<NT, KERNEL>
234 {
235 public:
236 using ctype = typename get_type::ctype<NT>;
237
238 Integrator4DFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 4> grid_sizes,
239 const ctype x_extent, const ctype x0_extent, const uint x0_summands, const ctype T,
240 const uint max_block_size = 256)
241 : Integrator4DFiniteTx0TBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, x0_extent, x0_summands, T,
242 max_block_size)
243 {
244 }
245 };
246} // namespace DiFfRG
247
248#endif
249
250#endif
Definition integrator_4D_finiteTx0_gpu.hh:77
const ctype * ptr_x_quadrature_w
Definition integrator_4D_finiteTx0_gpu.hh:197
dim3 num_blocks
Definition integrator_4D_finiteTx0_gpu.hh:209
uint device_data_size
Definition integrator_4D_finiteTx0_gpu.hh:194
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_4D_finiteTx0_gpu.hh:214
NT get(const ctype k, const T &...t) const
Definition integrator_4D_finiteTx0_gpu.hh:157
Integrator4DFiniteTx0GPU(const Integrator4DFiniteTx0GPU &other)
Definition integrator_4D_finiteTx0_gpu.hh:143
const ctype x_extent
Definition integrator_4D_finiteTx0_gpu.hh:203
ctype x0_extent
Definition integrator_4D_finiteTx0_gpu.hh:204
const ctype * ptr_x_quadrature_p
Definition integrator_4D_finiteTx0_gpu.hh:196
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_4D_finiteTx0_gpu.hh:212
Integrator4DFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 4 > _grid_sizes, const ctype x_extent, const ctype x0_extent, const uint _x0_summands, const ctype T, const uint max_block_size=256)
Definition integrator_4D_finiteTx0_gpu.hh:88
dim3 threads_per_block
Definition integrator_4D_finiteTx0_gpu.hh:210
typename get_type::ctype< NT > ctype
Definition integrator_4D_finiteTx0_gpu.hh:79
PoolMR pool
Definition integrator_4D_finiteTx0_gpu.hh:213
const ctype * ptr_ang_quadrature_p
Definition integrator_4D_finiteTx0_gpu.hh:198
std::future< NT > request(const ctype k, const T &...t) const
Definition integrator_4D_finiteTx0_gpu.hh:170
const uint original_x0_summands
Definition integrator_4D_finiteTx0_gpu.hh:205
uint x0_summands
Definition integrator_4D_finiteTx0_gpu.hh:206
const ctype * ptr_ang_quadrature_w
Definition integrator_4D_finiteTx0_gpu.hh:199
const ctype * ptr_x0_quadrature_w
Definition integrator_4D_finiteTx0_gpu.hh:201
std::array< uint, 3 > block_sizes
Definition integrator_4D_finiteTx0_gpu.hh:192
void set_x0_extent(const ctype val)
Definition integrator_4D_finiteTx0_gpu.hh:141
const std::array< uint, 4 > grid_sizes
Definition integrator_4D_finiteTx0_gpu.hh:191
void set_T(const ctype T)
Definition integrator_4D_finiteTx0_gpu.hh:130
QuadratureProvider & quadrature_provider
Definition integrator_4D_finiteTx0_gpu.hh:189
ctype m_T
Definition integrator_4D_finiteTx0_gpu.hh:207
Integrator4DFiniteTx0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 4 > grid_sizes, const ctype x_extent, const ctype x0_extent, const uint x0_summands, const JSONValue &json)
Definition integrator_4D_finiteTx0_gpu.hh:81
const ctype * ptr_x0_quadrature_p
Definition integrator_4D_finiteTx0_gpu.hh:200
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:211
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:198
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
__global__ void gridreduce_4d_finiteTx0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *ang_quadrature_p, const ctype *ang_quadrature_w, const ctype *x0_quadrature_p, const ctype *x0_quadrature_w, const ctype x_extent, const ctype x0_extent, const uint x0_quadrature_size, const uint x0_summands, const ctype m_T, const ctype k, T... t)
Definition integrator_4D_finiteTx0_gpu.hh:21
bool __forceinline__ __host__ __device__ is_close(T1 a, T2 b, T3 eps_)
Function to evaluate whether two floats are equal to numerical precision. Tests for both relative and...
Definition math.hh:160
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22