DiFfRG
Loading...
Searching...
No Matches
integrator_4D_gpu_fq.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13#include <thrust/transform_reduce.h>
14
15// DiFfRG
18
19#include <DiFfRG/common/quadrature/quadratures.hh>
20
21namespace DiFfRG
22{
23 template <typename NT, typename KERNEL, size_t q1 = 32, size_t q2 = 8> class Integrator4DGPU_fq
24 {
25 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
26
27 public:
31 using ctype = typename get_type::ctype<NT>;
32
33 template <typename... T> struct functor {
34
35 public:
36 functor(const ctype x_extent, const ctype k, T... t) : x_extent(x_extent), k(k), t(t...) {}
37
38 __device__ NT operator()(const uint idx) const
39 {
40 const uint idx_x = idx / (q2 * q2 * q2);
41 const uint idx_y = (idx % (q2 * q2 * q2)) / (q2 * q2);
42 const uint idx_z = (idx % (q2 * q2)) / (q2);
43 const uint idx_cos2 = idx % q2;
44
45 static constexpr GLQuadrature<q1, ctype> x_quadrature{};
46 static constexpr GLQuadrature<q2, ctype> ang_quadrature{};
47
48 const ctype q = k * sqrt(x_quadrature.x[idx_x] * x_extent);
49 const ctype cos1 = 2 * (ang_quadrature.x[idx_y] - (ctype)0.5);
50 const ctype phi = 2 * (ctype)M_PI * ang_quadrature.x[idx_z];
51 const ctype int_element = (powr<2>(q) * (ctype)0.5 * powr<2>(k)) // x = p^2 / k^2 integral
52 * sqrt(1. - powr<2>(cos1)) // cos1 integral jacobian
53 / powr<4>(2 * (ctype)M_PI); // fourier factor
54 const ctype weight = 2 * (ctype)M_PI * ang_quadrature.w[idx_z] // phi weight
55 * 2 * ang_quadrature.w[idx_y] // cos1 weight
56 * x_quadrature.w[idx_x] * x_extent; // x weight
57 const ctype cos2 = 2 * (ang_quadrature.x[idx_cos2] - (ctype)0.5);
58 return std::apply([&](auto &&...args) { return KERNEL::kernel(q, cos1, cos2, phi, k, args...); }, t) // kernel
59 * int_element * weight // other weights and integration elements
60 * 2 * ang_quadrature.w[idx_cos2]; // cos2 weight
61 }
62
63 private:
65 const ctype k;
66 const std::tuple<T...> t;
67 };
68
70 const ctype x_extent, const JSONValue &json)
72 json.get_uint("/integration/cudathreadsperblock"))
73 {
74 }
75
    // NOTE(review): the first signature line and the member-initializer list of this
    // constructor (original hh lines 76, 78-79) were lost in the documentation
    // extraction; only the trailing parameter and the body survive below.
                       const uint max_block_size = 256)
    {
      // Discover all CUDA devices; work is later round-robined across them.
      cudaGetDeviceCount(&n_devices);
      if (n_devices == 0) throw std::runtime_error("No CUDA devices found!");

      // One RMM pool memory resource per device, with the initial pool size
      // rounded up to the next multiple of 256 bytes.
      for (int device = 0; device < n_devices; ++device) {
        const rmm::cuda_device_id device_id(device);
        pool.emplace_back(
            std::make_shared<PoolMR>(rmm::mr::get_per_device_resource(device_id), (device_data_size / 256 + 1) * 256));
      }

      // Current limitation: all three angular grid dimensions must agree.
      if (grid_sizes[2] != grid_sizes[1] || grid_sizes[3] != grid_sizes[1])
        throw std::runtime_error("Grid sizes must be currently equal for all angular dimensions!");

      block_sizes = {max_block_size, max_block_size, max_block_size};
      // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
      // are as close to each other as possible
      uint optimize_dim = 2;
      while (block_sizes[0] * block_sizes[1] * block_sizes[2] > max_block_size || block_sizes[0] > grid_sizes[0] ||
             block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
        if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
        // Keep each block size an exact divisor of its grid size so the grid
        // decomposes without a remainder tail.
        while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
          block_sizes[optimize_dim]--;
        // Cycle the dimension being shrunk: 2 -> 1 -> 0 -> 2 -> ...
        optimize_dim = (optimize_dim + 2) % 3;
      }

      // Blocks per dimension = grid size / block size (exact by construction above).
      uint blocks1 = grid_sizes[0] / block_sizes[0];
      uint threads1 = block_sizes[0];
      uint blocks2 = grid_sizes[1] / block_sizes[1];
      uint threads2 = block_sizes[1];
      uint blocks3 = grid_sizes[2] / block_sizes[2];
      uint threads3 = block_sizes[2];

      num_blocks = dim3(blocks1, blocks2, blocks3);
      threads_per_block = dim3(threads1, threads2, threads3);

      // Leave device 0 selected after construction.
      cudaSetDevice(0);
    }
117
128
    /**
     * @brief Synchronously compute the integral of KERNEL at RG scale k.
     *
     * Reduces KERNEL::kernel over all q1 * q2^3 quadrature nodes on a stream from
     * the pool and adds KERNEL::constant(k, t...).
     *
     * @param k RG scale passed through to the kernel.
     * @param t Additional arguments forwarded to KERNEL::kernel / KERNEL::constant.
     * @return The value of the integral.
     */
    template <typename... T> NT get(const ctype k, const T &...t) const
    {
      // NOTE(review): m_device is computed from the round-robin counter but the call
      // below pins device 0 (request() uses m_device instead). The per-device buffer
      // path is commented out together with the explicit kernel launch, so this may
      // be a deliberate single-device fallback — confirm against request().
      const int m_device = evaluations++ % n_devices;
      cudaSetDevice(0);

      const auto cuda_stream = cuda_stream_pool.get_stream();
      // Previous implementation: explicit grid-reduce kernel into a per-device
      // RMM buffer, then a thrust::reduce over that buffer.
      // rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, pool[m_device].get());
      // gridreduce_4d_fq<q1, q2, ctype, NT, KERNEL>
      //     <<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(device_data.data(), x_extent, k, t...);
      // check_cuda();
      // return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()),
      //                                                   device_data.begin(),
      //                                                   device_data.end(), NT(0.), thrust::plus<NT>());
      // Current implementation: fuse evaluation and reduction over a counting
      // iterator, avoiding the intermediate device buffer entirely.
      return KERNEL::constant(k, t...) +
             thrust::transform_reduce(thrust::cuda::par.on(cuda_stream), thrust::make_counting_iterator<uint>(0),
                                      thrust::make_counting_iterator<uint>(q1 * powr<3>(q2)),
                                      functor<T...>(x_extent, k, t...), NT(0), thrust::plus<NT>());
    }
157
168 template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
169 {
170 const int m_device = evaluations++ % n_devices;
171 cudaSetDevice(m_device);
172
173 const auto cuda_stream = cuda_stream_pool.get_stream();
174
175 return std::async(std::launch::deferred, [=, this]() {
176 cudaSetDevice(m_device);
177
178 return KERNEL::constant(k, t...) +
179 thrust::transform_reduce(thrust::cuda::par.on(cuda_stream), thrust::make_counting_iterator<uint>(0),
180 thrust::make_counting_iterator<uint>(q1 * powr<3>(q2)),
181 functor<T...>(x_extent, k, t...), NT(0), thrust::plus<NT>());
182 });
183
184 cudaSetDevice(0);
185 }
186
  private:
    // NOTE(review): several member declarations (original hh lines 188, 192-204:
    // quadrature_provider, device_data_size, quadrature pointers, x_extent,
    // num_blocks, threads_per_block, n_devices) were dropped by the extraction.
    const std::array<uint, 4> grid_sizes; // requested nodes per dimension (x, cos1, phi, cos2)
    std::array<uint, 3> block_sizes;      // CUDA block sizes chosen in the constructor

    mutable std::vector<std::shared_ptr<PoolMR>> pool; // one RMM pool memory resource per device
    const rmm::cuda_stream_pool cuda_stream_pool;      // stream pool used by get()/request()

    mutable std::atomic_ullong evaluations; // round-robin counter for device selection
  };
209 };
210} // namespace DiFfRG
211
212#else
213
214#ifdef USE_CUDA
215
216namespace DiFfRG
217{
218 template <typename NT, typename KERNEL> class Integrator4DGPU_fq;
219}
220
221#else
222
224
225namespace DiFfRG
226{
  /**
   * @brief CPU fallback used when neither __CUDACC__ nor USE_CUDA is defined:
   *        presents the GPU integrator's interface but forwards all work to the
   *        TBB-based 4D integrator.
   */
  template <typename NT, typename KERNEL> class Integrator4DGPU_fq : public Integrator4DTBB<NT, KERNEL>
  {
  public:
    // Numerical type to be used for integration tasks, derived from NT.
    using ctype = typename get_type::ctype<NT>;

    // max_block_size is accepted for interface compatibility with the GPU
    // version but has no meaning for the TBB backend and is ignored.
    Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, std::array<uint, 4> grid_sizes, const ctype x_extent,
                       const uint max_block_size = 256)
        : Integrator4DTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent)
    {
    }

    // JSON overload: the configuration value (GPU threads per block) is likewise
    // irrelevant for the TBB backend and is ignored.
    Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, const std::array<uint, 4> grid_sizes,
                       const ctype x_extent, const JSONValue &)
        : Integrator4DTBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent)
    {
    }
  };
244} // namespace DiFfRG
245
246#endif
247
248#endif
Definition integrator_4D_gpu_fq.hh:24
std::array< uint, 3 > block_sizes
Definition integrator_4D_gpu_fq.hh:190
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_4D_gpu_fq.hh:25
const ctype x_extent
Definition integrator_4D_gpu_fq.hh:199
dim3 threads_per_block
Definition integrator_4D_gpu_fq.hh:202
dim3 num_blocks
Definition integrator_4D_gpu_fq.hh:201
const ctype * ptr_ang_quadrature_w
Definition integrator_4D_gpu_fq.hh:197
const ctype * ptr_x_quadrature_p
Definition integrator_4D_gpu_fq.hh:194
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition integrator_4D_gpu_fq.hh:31
Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, const std::array< uint, 4 > grid_sizes, const ctype x_extent, const JSONValue &json)
Definition integrator_4D_gpu_fq.hh:69
const ctype * ptr_ang_quadrature_p
Definition integrator_4D_gpu_fq.hh:196
QuadratureProvider & quadrature_provider
Definition integrator_4D_gpu_fq.hh:188
std::atomic_ullong evaluations
Definition integrator_4D_gpu_fq.hh:208
int n_devices
Definition integrator_4D_gpu_fq.hh:204
Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, std::array< uint, 4 > grid_sizes, const ctype x_extent, const uint max_block_size=256)
Definition integrator_4D_gpu_fq.hh:76
const ctype * ptr_x_quadrature_w
Definition integrator_4D_gpu_fq.hh:195
NT get(const ctype k, const T &...t) const
Get the integral of the kernel.
Definition integrator_4D_gpu_fq.hh:139
std::future< NT > request(const ctype k, const T &...t) const
Request a future for the integral of the kernel.
Definition integrator_4D_gpu_fq.hh:168
const std::array< uint, 4 > grid_sizes
Definition integrator_4D_gpu_fq.hh:189
Integrator4DGPU_fq(const Integrator4DGPU_fq &other)
Definition integrator_4D_gpu_fq.hh:118
const uint device_data_size
Definition integrator_4D_gpu_fq.hh:192
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_4D_gpu_fq.hh:206
std::vector< std::shared_ptr< PoolMR > > pool
Definition integrator_4D_gpu_fq.hh:205
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
unsigned int uint
Definition utils.hh:22
Definition gauss_legendre.hh:7
Definition integrator_4D_gpu_fq.hh:33
const ctype k
Definition integrator_4D_gpu_fq.hh:65
const ctype x_extent
Definition integrator_4D_gpu_fq.hh:64
const std::tuple< T... > t
Definition integrator_4D_gpu_fq.hh:66
functor(const ctype x_extent, const ctype k, T... t)
Definition integrator_4D_gpu_fq.hh:36
__device__ NT operator()(const uint idx) const
Definition integrator_4D_gpu_fq.hh:38