DiFfRG
Loading...
Searching...
No Matches
integrator_4D_finiteTq0_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
// Kernel: evaluates KERNEL::kernel on a 3D quadrature grid (radial momentum
// x two angles), summing the Matsubara-frequency direction inside each
// thread, and writes one partial result per grid node into `dest` for a
// subsequent device-side reduction (thrust::reduce in the host class below).
//
// Grid/block layout: 3D launch. x indexes the radial quadrature
// (x = p^2/k^2, scaled by x_extent), y the cosine-angle quadrature,
// z the azimuthal-angle quadrature.
// NOTE(review): there is no `if (idx < n)` bounds guard — the launch
// configuration must tile the quadrature sizes exactly (the host-side
// reinit() divides grid_sizes by block_sizes without remainder).
20 template <typename ctype, typename NT, typename KERNEL, typename... T>
21 __global__ void gridreduce_4d_finiteTq0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
22 const ctype *ang_quadrature_p, const ctype *ang_quadrature_w,
23 const ctype *matsubara_quadrature_p, const ctype *matsubara_quadrature_w,
24 const ctype x_extent, const uint matsubara_size, const ctype m_T,
25 const ctype k, T... t)
26 {
// Spacetime dimension; d = 4 fixes the power-counting factors below.
27 constexpr uint d = 4;
28
// Per-dimension thread indices and the flattened linear output index.
29 uint len_x = gridDim.x * blockDim.x;
30 uint len_y = gridDim.y * blockDim.y;
31 uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
32 uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
33 uint idx_z = (blockIdx.z * blockDim.z) + threadIdx.z;
34 uint idx = idx_z * len_x * len_y + idx_y * len_x + idx_x;
35
// Map quadrature nodes to physical variables: radial momentum q,
// polar cosine, azimuthal angle phi. The affine maps assume the
// quadrature nodes lie in [0, 1] — TODO(review): confirm against
// QuadratureProvider's node convention.
36 const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
37 const ctype cos = 2 * (ang_quadrature_p[idx_y] - (ctype)0.5);
38 const ctype phi = 2 * (ctype)M_PI * ang_quadrature_p[idx_z];
// Combined quadrature weight, including the Jacobian factors of the
// three affine node maps above (2*pi for phi, 2 for cos, x_extent for x).
39 const ctype x_weight =
40 2 * (ctype)M_PI * ang_quadrature_w[idx_z] * 2 * ang_quadrature_w[idx_y] * x_quadrature_w[idx_x] * x_extent;
41
42 // sum
43 const ctype int_element = (powr<d - 3>(q) / (ctype)2 * powr<2>(k)) // x = p^2 / k^2 integral
44 / powr<d - 1>(2 * (ctype)M_PI); // fourier factor
45
// Zero-frequency Matsubara mode carries weight m_T (the temperature);
// the loop then adds the symmetric +q0 / -q0 frequency pairs with
// their Matsubara quadrature weights.
46 NT res = int_element * x_weight * m_T * KERNEL::kernel(q, cos, phi, (ctype)0, k, t...);
47 for (uint idx_0 = 0; idx_0 < matsubara_size; ++idx_0) {
48 const ctype q0 = matsubara_quadrature_p[idx_0];
49 const ctype weight = x_weight * matsubara_quadrature_w[idx_0];
50 res +=
51 int_element * weight * (KERNEL::kernel(q, cos, phi, q0, k, t...) + KERNEL::kernel(q, cos, phi, -q0, k, t...));
52 }
53
// One partial sum per quadrature node; the host reduces over all nodes.
54 dest[idx] = res;
55 }
56
// GPU integrator for 4D momentum integrals at finite temperature with an
// explicit q0 (Matsubara) sum, dispatching gridreduce_4d_finiteTq0 and
// reducing the per-node partials with thrust::reduce.
// NOTE(review): this listing was recovered from a Doxygen page and several
// source lines are missing (gaps in the embedded line numbers); the gaps
// are marked below rather than reconstructed.
57 template <typename NT, typename KERNEL> class Integrator4DFiniteTq0GPU
58 {
59 public:
// Scalar (field) type underlying NT, e.g. double for NT = complex<double>.
60 using ctype = typename get_type::ctype<NT>;
61
// JSON-configured constructor; delegates to the explicit constructor,
// reading the temperature and the CUDA block size from the JSON tree.
// NOTE(review): source line 62 (the constructor signature's first line)
// is missing from this extraction.
63 const ctype x_extent, const JSONValue &json)
64 : Integrator4DFiniteTq0GPU(quadrature_provider, grid_sizes, x_extent, json.get_double("/physical/T"),
65 json.get_uint("/integration/cudathreadsperblock"))
66 {
67 }
68
// Explicit constructor. grid_sizes holds {radial, cos-angle, phi-angle};
// the stored 4th entry (Matsubara size) starts at 0 and is set by set_T().
// NOTE(review): source lines 72 and 74 of the initializer list are
// missing from this extraction.
69 Integrator4DFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 3> _grid_sizes,
70 const ctype x_extent, const ctype T, const uint max_block_size = 256)
71 : quadrature_provider(quadrature_provider), grid_sizes({_grid_sizes[0], _grid_sizes[1], _grid_sizes[2], 0}),
73 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256), manual_E(false),
75 {
// The kernel applies identical node maps to both angular directions,
// so their quadrature sizes must currently agree.
76 if (grid_sizes[2] != grid_sizes[1])
77 throw std::runtime_error("Grid sizes must be currently equal for all angular dimensions!");
78
79 set_T(T);
80 }
81
// Recompute the launch configuration (num_blocks / threads_per_block)
// for the current grid sizes.
// NOTE(review): source lines 84-91 and 95 (including the quadrature
// pointer refresh and the loop head of the block-size search) are
// missing from this extraction.
82 void reinit()
83 {
85
90
92 // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
93 // are as close to each other as possible
// Shrink one dimension at a time (cycling z -> y -> x) until each
// block size evenly divides its grid size and fits the limits.
94 uint optimize_dim = 2;
96 block_sizes[1] > grid_sizes[1] || block_sizes[2] > grid_sizes[2]) {
97 if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
98 while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
99 block_sizes[optimize_dim]--;
100 optimize_dim = (optimize_dim + 2) % 3;
101 }
102
// Exact tiling: divisions are remainder-free by construction above,
// which is why the kernel needs no bounds guard.
103 uint blocks1 = grid_sizes[0] / block_sizes[0];
104 uint threads1 = block_sizes[0];
105 uint blocks2 = grid_sizes[1] / block_sizes[1];
106 uint threads2 = block_sizes[1];
107 uint blocks3 = grid_sizes[2] / block_sizes[2];
108 uint threads3 = block_sizes[2];
109
110 num_blocks = dim3(blocks1, blocks2, blocks3);
111 threads_per_block = dim3(threads1, threads2, threads3);
112 }
113
// Set the temperature and typical energy scale and recompute the
// Matsubara quadrature rule. E = 0 selects an automatic scale of
// 10 * T. Skips the (expensive) rebuild when T is unchanged and the
// requested E is within 2.5% of the current one.
121 void set_T(const ctype T, const ctype E = 0)
122 {
123 if (is_close(T, m_T) && E != 0 && (std::abs(E - m_E) / std::max(E, m_E) < 2.5e-2)) return;
124
125 m_T = T;
126 // the default typical energy scale will default the matsubara size to 11.
127 m_E = is_close(E, 0.) ? 10 * m_T : E;
128 manual_E = !is_close(E, 0.);
129
// NOTE(review): source lines 130-133 are missing from this extraction;
// presumably they fetch the Matsubara quadrature from
// quadrature_provider and update the stored sizes — verify upstream.
131
134 reinit();
135 }
136
// Set the typical energy scale only, keeping the current temperature.
142 void set_E(const ctype E) { set_T(m_T, E); }
143
// NOTE(review): source lines 144-157 (per the tooltip, a copy
// constructor at line 144) are missing from this extraction.
158
// Synchronously compute the integral at RG scale k, forwarding t... to
// the kernel, and add KERNEL::constant. When no manual energy scale is
// set, re-tune the Matsubara rule if k drifted more than 2.5% from m_E.
159 template <typename... T> NT get(const ctype k, const T &...t)
160 {
161 if (!manual_E && (std::abs(k - m_E) / std::max(k, m_E) > 2.5e-2)) {
162 set_T(m_T, k);
163 manual_E = false;
164 }
165
// Per-call scratch buffer from the pooled allocator, one NT per node.
166 const auto cuda_stream = cuda_stream_pool.get_stream();
167 rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
// NOTE(review): source lines 168 and 170-171 (the kernel name and its
// argument list) are missing from this extraction. Also note the
// launch runs on cuda_stream_per_thread while device_data lives on
// a pool stream — confirm the intended stream semantics upstream.
169 <<<num_blocks, threads_per_block, 0, rmm::cuda_stream_per_thread.value()>>>(
172 check_cuda();
// Blocking reduction over all per-node partial sums.
173 return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(rmm::cuda_stream_per_thread.value()),
174 device_data.begin(), device_data.end(), NT(0),
175 thrust::plus<NT>());
176 }
177
// Asynchronous variant of get(): launches the kernel now and defers the
// reduction into a std::future. The buffer is held by shared_ptr so it
// outlives this scope until the deferred lambda runs.
178 template <typename... T> std::future<NT> request(const ctype k, const T &...t)
179 {
180 if (!manual_E && (std::abs(k - m_E) / std::max(k, m_E) > 2.5e-2)) {
181 set_T(m_T, k);
182 manual_E = false;
183 }
184
185 const auto cuda_stream = cuda_stream_pool.get_stream();
186 std::shared_ptr<rmm::device_uvector<NT>> device_data =
187 std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
// NOTE(review): source lines 188-190 (the kernel launch) are missing
// from this extraction.
191 check_cuda();
192 const NT constant = KERNEL::constant(k, t...);
193
// std::launch::deferred: the reduction executes on the caller's
// thread when future.get() is invoked, on the pooled stream.
194 return std::async(std::launch::deferred, [=, this]() {
195 return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
196 (*device_data).end(), NT(0), thrust::plus<NT>());
197 });
198 }
199
200 private:
// NOTE(review): source lines 201-202 and 206-222 (quadrature_provider,
// device_data_size, the cached device quadrature pointers, x_extent,
// m_T/m_E, manual_E, max_block_size, num_blocks, threads_per_block per
// the tooltips) are missing from this extraction.
202
// {radial, cos-angle, phi-angle, Matsubara} quadrature sizes.
203 std::array<uint, 4> grid_sizes;
204 std::array<uint, 3> block_sizes;
205
207
214
218
222
// Pooled device allocator for the per-call scratch buffers and the
// stream pool used to issue work.
223 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
224 mutable PoolMR pool;
225 const rmm::cuda_stream_pool cuda_stream_pool;
226 };
227} // namespace DiFfRG
228
229#else
230
231#ifdef USE_CUDA
232
// Compiled with CUDA enabled (USE_CUDA) but not by nvcc (__CUDACC__ unset):
// only forward-declare the class so host-only translation units can hold
// pointers/references to it without seeing device code.
233namespace DiFfRG
234{
235 template <typename NT, typename KERNEL> class Integrator4DFiniteTq0GPU;
236}
237
238#else
239
241
// CPU fallback: without CUDA, the "GPU" integrator is a thin alias over the
// TBB implementation so calling code compiles unchanged.
242namespace DiFfRG
243{
244 template <typename NT, typename KERNEL> class Integrator4DFiniteTq0GPU : public Integrator4DFiniteTq0TBB<NT, KERNEL>
245 {
246 public:
247 using ctype = typename get_type::ctype<NT>;
248
// NOTE(review): this fallback takes std::array<uint, 4> whereas the CUDA
// variant's constructors take std::array<uint, 3> — callers switching
// between builds will hit a signature mismatch; confirm intended.
249 Integrator4DFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 4> grid_sizes,
250 const ctype x_extent, const ctype T, const uint max_block_size = 256)
251 : Integrator4DFiniteTq0TBB<NT, KERNEL>(quadrature_provider, grid_sizes, x_extent, T, max_block_size)
252 {
253 }
254 };
255}
256
257#endif
258
259#endif
Definition integrator_4D_finiteTq0_gpu.hh:58
const ctype * ptr_ang_quadrature_w
Definition integrator_4D_finiteTq0_gpu.hh:211
dim3 num_blocks
Definition integrator_4D_finiteTq0_gpu.hh:220
Integrator4DFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 3 > grid_sizes, const ctype x_extent, const JSONValue &json)
Definition integrator_4D_finiteTq0_gpu.hh:62
Integrator4DFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 3 > _grid_sizes, const ctype x_extent, const ctype T, const uint max_block_size=256)
Definition integrator_4D_finiteTq0_gpu.hh:69
const ctype * ptr_matsubara_quadrature_w
Definition integrator_4D_finiteTq0_gpu.hh:213
const ctype * ptr_x_quadrature_p
Definition integrator_4D_finiteTq0_gpu.hh:208
const uint max_block_size
Definition integrator_4D_finiteTq0_gpu.hh:219
void set_E(const ctype E)
Set the typical energy scale of the integrator and recompute the Matsubara quadrature rule.
Definition integrator_4D_finiteTq0_gpu.hh:142
void reinit()
Definition integrator_4D_finiteTq0_gpu.hh:82
typename get_type::ctype< NT > ctype
Definition integrator_4D_finiteTq0_gpu.hh:60
Integrator4DFiniteTq0GPU(const Integrator4DFiniteTq0GPU &other)
Definition integrator_4D_finiteTq0_gpu.hh:144
std::array< uint, 4 > grid_sizes
Definition integrator_4D_finiteTq0_gpu.hh:203
bool manual_E
Definition integrator_4D_finiteTq0_gpu.hh:217
void set_T(const ctype T, const ctype E=0)
Set the temperature and typical energy scale of the integrator and recompute the Matsubara quadrature...
Definition integrator_4D_finiteTq0_gpu.hh:121
const ctype * ptr_ang_quadrature_p
Definition integrator_4D_finiteTq0_gpu.hh:210
QuadratureProvider & quadrature_provider
Definition integrator_4D_finiteTq0_gpu.hh:201
ctype m_T
Definition integrator_4D_finiteTq0_gpu.hh:216
std::future< NT > request(const ctype k, const T &...t)
Definition integrator_4D_finiteTq0_gpu.hh:178
const ctype x_extent
Definition integrator_4D_finiteTq0_gpu.hh:215
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_4D_finiteTq0_gpu.hh:225
std::array< uint, 3 > block_sizes
Definition integrator_4D_finiteTq0_gpu.hh:204
NT get(const ctype k, const T &...t)
Definition integrator_4D_finiteTq0_gpu.hh:159
const ctype * ptr_x_quadrature_w
Definition integrator_4D_finiteTq0_gpu.hh:209
dim3 threads_per_block
Definition integrator_4D_finiteTq0_gpu.hh:221
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_4D_finiteTq0_gpu.hh:223
const ctype * ptr_matsubara_quadrature_p
Definition integrator_4D_finiteTq0_gpu.hh:212
PoolMR pool
Definition integrator_4D_finiteTq0_gpu.hh:224
uint device_data_size
Definition integrator_4D_finiteTq0_gpu.hh:206
ctype m_E
Definition integrator_4D_finiteTq0_gpu.hh:216
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
const NT * get_device_weights(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:211
const NT * get_device_matsubara_points(const NT T, const NT typical_E, const int device=0)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:225
const std::vector< NT > & get_matsubara_points(const NT T, const NT typical_E)
Get the quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:174
const NT * get_device_points(const size_t order, const int device=0, const QuadratureType type=QuadratureType::legendre)
Get the device-side quadrature points for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:198
const NT * get_device_matsubara_weights(const NT T, const NT typical_E, const int device=0)
Get the device-side quadrature weights for a quadrature of size quadrature_size.
Definition quadrature_provider.hh:237
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
bool __forceinline__ __host__ __device__ is_close(T1 a, T2 b, T3 eps_)
Function to evaluate whether two floats are equal to numerical precision. Tests for both relative and...
Definition math.hh:160
__global__ void gridreduce_4d_finiteTq0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *ang_quadrature_p, const ctype *ang_quadrature_w, const ctype *matsubara_quadrature_p, const ctype *matsubara_quadrature_w, const ctype x_extent, const uint matsubara_size, const ctype m_T, const ctype k, T... t)
Definition integrator_4D_finiteTq0_gpu.hh:21
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22