#include <rmm/cuda_stream_pool.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>
#include <thrust/reduce.h>
#include <thrust/transform_reduce.h>
// ...
#include <DiFfRG/common/quadrature/quadratures.hh>
// ...
  using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
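// The PoolMR alias above is RMM's suballocating pool resource layered on top of a device
// memory resource. As a minimal sketch of how such a pool is built and used (assuming a
// recent RMM release; the upstream resource and the sizes are illustrative, not taken
// from this header):

#include <cstddef>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

inline void pool_sketch()
{
  rmm::mr::cuda_memory_resource upstream; // plain cudaMalloc/cudaFree upstream resource
  // initial pool size rounded up to a 256-byte multiple, mirroring the
  // (device_data_size / 256 + 1) * 256 expression used in the constructor further down
  const std::size_t initial_size = (10'000'000 / 256 + 1) * 256;
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr(&upstream, initial_size);

  void *p = pool_mr.allocate(4096); // suballocated from the pre-reserved pool
  pool_mr.deallocate(p, 4096);
}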
    // Device-side evaluation of one quadrature node: idx enumerates the flattened
    // (radial x angular^3) grid of q1 * q2^3 points.
    __device__ NT operator()(const uint idx) const
    {
      // decompose the flat index into the radial index and the three angular indices
      const uint idx_x = idx / (q2 * q2 * q2);
      const uint idx_y = (idx % (q2 * q2 * q2)) / (q2 * q2);
      const uint idx_z = (idx % (q2 * q2)) / (q2);
      const uint idx_cos2 = idx % q2;
      // ... (q and int_element are set up on the elided lines)

      // map the [0,1] quadrature nodes to cos1, cos2 in [-1,1] and phi in [0, 2*pi]
      const ctype cos1 = 2 * (ang_quadrature.x[idx_y] - (ctype)0.5);
      const ctype phi = 2 * (ctype)M_PI * ang_quadrature.x[idx_z];
      // ...
      const ctype weight = 2 * (ctype)M_PI * ang_quadrature.w[idx_z]
                           * 2 * ang_quadrature.w[idx_y]
                           /* ... */;
      // ...
      const ctype cos2 = 2 * (ang_quadrature.x[idx_cos2] - (ctype)0.5);
      // kernel value at this node, weighted by the quadrature weights and the integration element
      return std::apply([&](auto &&...args) { return KERNEL::kernel(q, cos1, cos2, phi, k, args...); }, t)
             * int_element * weight
             * 2 * ang_quadrature.w[idx_cos2];
    }
    // ...
    // functor state (cf. the member reference below): integration extent, the scale k,
    // and the extra arguments forwarded to KERNEL::kernel via std::apply
    const ctype x_extent;
    const ctype k;
    const std::tuple<T...> t;
  };
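// From the call sites above, a kernel type has to provide two static members:
// KERNEL::kernel(q, cos1, cos2, phi, k, args...), evaluated on the device for every
// quadrature node, and KERNEL::constant(k, args...), added once on the host (see get()
// further down). A hedged sketch of such a kernel; the integrand itself is made up and
// not part of DiFfRG:

struct ExampleKernel {
  // device-side integrand, called once per 4D quadrature node
  static __host__ __device__ double kernel(const double q, const double cos1, const double cos2,
                                           const double /*phi*/, const double k)
  {
    return q * q * q * (1. + 0.1 * cos1 * cos2) / (q * q + k * k); // placeholder integrand
  }
  // analytic contribution that needs no quadrature sum
  static __host__ __device__ double constant(const double /*k*/) { return 0.; }
};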
  // Constructor taking the quadrature provider, the grid sizes, the x extent and the JSON
  // configuration; the CUDA block size is read from "/integration/cudathreadsperblock".
  Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, const std::array<uint, 4> grid_sizes,
                     const ctype x_extent, const JSONValue &json)
      // ...
                         json.get_uint("/integration/cudathreadsperblock"))
  // ...

  Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, std::array<uint, 4> grid_sizes, const ctype x_extent,
                     const uint max_block_size = 256)
  // ...
  {
    // ...
    if (n_devices == 0) throw std::runtime_error("No CUDA devices found!");
    // one RMM pool memory resource is set up per visible CUDA device
    for (int device = 0; device < n_devices; ++device) {
      const rmm::cuda_device_id device_id(device);
      // ...
          std::make_shared<PoolMR>(rmm::mr::get_per_device_resource(device_id), (device_data_size / 256 + 1) * 256));
    }
    // ...
      throw std::runtime_error("Grid sizes must be currently equal for all angular dimensions!");
    // ...
    block_sizes = {max_block_size, max_block_size, max_block_size};
    // ...
    // the block dimensions are then adjusted one at a time, cycling 2 -> 1 -> 0 -> 2 -> ...
    uint optimize_dim = 2;
    // ...
      optimize_dim = (optimize_dim + 2) % 3;
    // ...
  }
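// Only fragments of the block-size selection are visible above. A hedged sketch of the
// cycling logic, assuming the goal is to fit the 3D block into the device's thread limit;
// the halving rule and the use of cudaDeviceProp::maxThreadsPerBlock are assumptions made
// for illustration, not taken from the header:

#include <array>
#include <cuda_runtime.h>

inline std::array<unsigned int, 3> fit_block_sizes(unsigned int max_block_size, int device)
{
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, device);

  std::array<unsigned int, 3> block_sizes = {max_block_size, max_block_size, max_block_size};
  unsigned int optimize_dim = 2;
  while (block_sizes[0] * block_sizes[1] * block_sizes[2] > (unsigned int)prop.maxThreadsPerBlock) {
    if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim] /= 2; // shrink one dimension
    optimize_dim = (optimize_dim + 2) % 3; // same cycling expression as above: 2 -> 1 -> 0 -> 2 -> ...
  }
  return block_sizes;
}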
  // ...

  // Get the integral of the kernel.
  template <typename... T> NT get(const ctype k, const T &...t) const
  {
    // ...
    return KERNEL::constant(k, t...) +
           thrust::transform_reduce(thrust::cuda::par.on(cuda_stream), thrust::make_counting_iterator<uint>(0),
                                    thrust::make_counting_iterator<uint>(q1 * powr<3>(q2)),
                                    /* unary functor, initial value and reduction operator on the elided lines */);
  }
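// get() evaluates the sum over all q1 * q2^3 flat indices with a single
// thrust::transform_reduce on a dedicated CUDA stream. A self-contained illustration of
// that reduction pattern (the toy functor and the sizes are not DiFfRG's):

#include <thrust/functional.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/transform_reduce.h>

struct square_op {
  __device__ double operator()(unsigned int i) const { return double(i) * double(i); }
};

// sum_{i=0}^{n-1} i^2, computed on the GPU on the given stream
inline double sum_of_squares(cudaStream_t stream, unsigned int n)
{
  return thrust::transform_reduce(thrust::cuda::par.on(stream),
                                  thrust::make_counting_iterator<unsigned int>(0),
                                  thrust::make_counting_iterator<unsigned int>(n),
                                  square_op{}, 0.0, thrust::plus<double>());
}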
  // ...

  // Request a future for the integral of the kernel.
  template <typename... T> std::future<NT> request(const ctype k, const T &...t) const
  {
    // ...
    cudaSetDevice(m_device);
    // ...
    // std::launch::deferred: the reduction is not started here but on the thread that
    // later waits on the returned future, which first re-selects the device.
    return std::async(std::launch::deferred, [=, this]() {
      cudaSetDevice(m_device);
      // ...
      return KERNEL::constant(k, t...) +
             thrust::transform_reduce(thrust::cuda::par.on(cuda_stream), thrust::make_counting_iterator<uint>(0),
                                      thrust::make_counting_iterator<uint>(q1 * powr<3>(q2)),
                                      /* unary functor, initial value and reduction operator on the elided lines */);
    });
  }

  // ...

  mutable std::vector<std::shared_ptr<PoolMR>> pool;
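// A hedged usage sketch for get()/request(): the template arguments, the ExampleKernel
// from the sketch further up and the list of scales are hypothetical placeholders; only
// the call pattern and the deferred-future semantics follow from the code above.

#include <future>
#include <vector>

inline double sum_over_scales(const Integrator4DGPU_fq<double, ExampleKernel> &integrator)
{
  std::vector<std::future<double>> results;
  for (const double k : {0.1, 1.0, 10.0})
    results.push_back(integrator.request(k)); // deferred: nothing is launched yet

  double total = 0.;
  for (auto &f : results)
    total += f.get(); // the reductions actually run here, one after the other
  return total;
}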
// ...

template <typename NT, typename KERNEL> class Integrator4DGPU_fq;

// ...

// CPU fallback: this definition of Integrator4DGPU_fq derives from the TBB-based
// integrator and exposes the same constructor interface.
template <typename NT, typename KERNEL> class Integrator4DGPU_fq : public Integrator4DTBB<NT, KERNEL>
{
  // ...
                     const uint max_block_size = 256)
  // ...
};
Member reference (from the generated documentation):

class Integrator4DGPU_fq (integrator_4D_gpu_fq.hh:24)
  using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;  (l. 25)
  using ctype = typename get_type::ctype<NT>;  (l. 31)
      Numerical type to be used for integration tasks, e.g. the argument or possible jacobians.
  Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, const std::array<uint, 4> grid_sizes,
                     const ctype x_extent, const JSONValue &json)  (l. 69)
  Integrator4DGPU_fq(QuadratureProvider &quadrature_provider, std::array<uint, 4> grid_sizes,
                     const ctype x_extent, const uint max_block_size = 256)  (l. 76)
  Integrator4DGPU_fq(const Integrator4DGPU_fq &other)  (l. 118)
  NT get(const ctype k, const T &...t) const  (l. 139)
      Get the integral of the kernel.
  std::future<NT> request(const ctype k, const T &...t) const  (l. 168)
      Request a future for the integral of the kernel.
  QuadratureProvider &quadrature_provider  (l. 188)
  const std::array<uint, 4> grid_sizes  (l. 189)
  std::array<uint, 3> block_sizes  (l. 190)
  const uint device_data_size  (l. 192)
  const ctype *ptr_x_quadrature_p  (l. 194)
  const ctype *ptr_x_quadrature_w  (l. 195)
  const ctype *ptr_ang_quadrature_p  (l. 196)
  const ctype *ptr_ang_quadrature_w  (l. 197)
  const ctype x_extent  (l. 199)
  dim3 num_blocks  (l. 201)
  dim3 threads_per_block  (l. 202)
  int n_devices  (l. 204)
  std::vector<std::shared_ptr<PoolMR>> pool  (l. 205)
  const rmm::cuda_stream_pool cuda_stream_pool  (l. 206)
  std::atomic_ullong evaluations  (l. 208)

struct functor (integrator_4D_gpu_fq.hh:33)
  functor(const ctype x_extent, const ctype k, T... t)  (l. 36)
  __device__ NT operator()(const uint idx) const  (l. 38)
  const ctype x_extent  (l. 64)
  const ctype k  (l. 65)
  const std::tuple<T...> t  (l. 66)

Related symbols
  JSONValue (json.hh:19): a wrapper around the boost json value class.
  QuadratureProvider (quadrature_provider.hh:139): a class that provides quadrature points and weights,
      in host and device memory. The quadrature points...
  ctype (types.hh:106): typename internal::_ctype<CT>::value
  powr (math.hh:45): constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
      A compile-time evaluatable power function for whole number exponents.
  uint (utils.hh:22): unsigned int
  Further referenced definitions: complex_math.hh:14, gauss_legendre.hh:7.