DiFfRG
Loading...
Searching...
No Matches
integrator_finiteTq0_gpu.hh
Go to the documentation of this file.
1#pragma once
2
3#ifdef __CUDACC__
4
5// standard library
6#include <future>
7
8// external libraries
9#include <rmm/cuda_stream_pool.hpp>
10#include <rmm/device_uvector.hpp>
11#include <rmm/mr/device/pool_memory_resource.hpp>
12#include <thrust/reduce.h>
13
14// DiFfRG
17
18namespace DiFfRG
19{
  /**
   * @brief CUDA kernel: evaluate KERNEL on the 2D (momentum x Matsubara) quadrature grid,
   *        one thread per quadrature node, writing the weighted integrand into dest.
   *
   * Thread layout: the x grid dimension indexes the momentum (x = p^2/k^2) quadrature node,
   * the y dimension the Matsubara quadrature node. The launch configuration is chosen so the
   * block sizes divide the grid sizes exactly (see reinit() in the host class), hence no
   * bounds check is performed here.
   *
   * @param dest                   device array of size (len_x * len_y) receiving one partial result per node
   * @param x_quadrature_p         device pointer to momentum quadrature points in [0, 1]
   * @param x_quadrature_w         device pointer to momentum quadrature weights
   * @param matsubara_quadrature_p device pointer to (positive) Matsubara quadrature points
   * @param matsubara_quadrature_w device pointer to Matsubara quadrature weights
   * @param x_extent               extent of the x = p^2/k^2 integration domain
   * @param m_T                    temperature
   * @param k                      RG scale
   * @param t                      additional arguments forwarded to KERNEL::kernel
   */
  template <typename ctype, int d, typename NT, typename KERNEL, typename... T>
  __global__ void gridreduce_1d_finiteTq0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w,
                                          const ctype *matsubara_quadrature_p, const ctype *matsubara_quadrature_w,
                                          const ctype x_extent, const ctype m_T, const ctype k, T... t)
  {
    // Flattened destination index, row-major over (matsubara, momentum).
    uint len_x = gridDim.x * blockDim.x;
    uint idx_x = (blockIdx.x * blockDim.x) + threadIdx.x;
    uint idx_y = (blockIdx.y * blockDim.y) + threadIdx.y;
    uint idx = idx_y * len_x + idx_x;

    // Physical momentum from the substitution x = p^2 / k^2 (node scaled by x_extent).
    const ctype q = k * sqrt(x_quadrature_p[idx_x] * x_extent);
    constexpr ctype S_dm1 = S_d_prec<ctype>(d - 1);

    const ctype x_weight = x_quadrature_w[idx_x] * x_extent;
    const ctype int_element = S_dm1 // solid nd angle
                              * (powr<d - 3>(q) / (ctype)2 * powr<2>(k)) // x = p^2 / k^2 integral
                              / powr<d - 1>(2 * (ctype)M_PI); // fourier factor
    const ctype q0 = matsubara_quadrature_p[idx_y];
    const ctype weight = x_weight * matsubara_quadrature_w[idx_y];

    // The Matsubara nodes are sampled at +q0; add the mirrored -q0 frequency explicitly.
    NT res = int_element * weight * (KERNEL::kernel(q, q0, k, t...) + KERNEL::kernel(q, -q0, k, t...));
    // At finite T the q0 = 0 mode enters once with weight T; only the idx_y == 0 thread
    // adds it so the contribution is not duplicated across the frequency grid.
    if (m_T > 0 && idx_y == 0) res += int_element * x_weight * m_T * KERNEL::kernel(q, (ctype)0, k, t...);

    dest[idx] = res;
  }
45
  /**
   * @brief GPU integrator over d-dimensional spatial momentum and Matsubara frequencies at
   *        finite temperature, treating the q0 = 0 mode explicitly.
   *
   * @tparam d      spatial dimension of the momentum integral (must be >= 2)
   * @tparam NT     numerical type of the result
   * @tparam KERNEL class providing static kernel(q, q0, k, t...) and constant(k, t...)
   */
  template <int d, typename NT, typename KERNEL> class IntegratorFiniteTq0GPU
  {
    static_assert(d >= 2, "dimension must be at least 2");

  public:
    // Real (coordinate) scalar type underlying NT, e.g. double for complex<double>.
    using ctype = typename get_type::ctype<NT>;
52
54 const ctype x_extent, const JSONValue &json)
55 : IntegratorFiniteTq0GPU(quadrature_provider, grid_sizes, x_extent, json.get_double("/physical/T"),
56 json.get_uint("/integration/cudathreadsperblock"))
57 {
58 }
59
    /**
     * @brief Construct the integrator.
     *
     * grid_sizes[1] (the Matsubara direction) is initialized to 0 and filled in by set_T(),
     * which also builds the Matsubara quadrature and calls reinit(). device_data_size starts
     * as a guess of 32 frequency nodes per momentum node; the pool size is rounded up to a
     * multiple of 256 bytes.
     *
     * @param quadrature_provider provider of quadrature points and weights
     * @param _grid_sizes         number of momentum quadrature points
     * @param x_extent            extent of the x = p^2/k^2 integration domain
     * @param T                   temperature
     * @param max_block_size      maximum number of CUDA threads per block (default 256)
     */
    IntegratorFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 1> _grid_sizes,
                           const ctype x_extent, const ctype T, const uint max_block_size = 256)
        : quadrature_provider(quadrature_provider), grid_sizes{{_grid_sizes[0], 0}},
          device_data_size(grid_sizes[0] * 32), x_extent(x_extent),
          pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256), manual_E(false),
          max_block_size(max_block_size)
    {
      set_T(T);
    }
69
    /**
     * @brief Recompute the CUDA launch configuration after the grid sizes changed.
     *
     * Picks block sizes that (a) multiply to at most max_block_size, (b) do not exceed the
     * grid sizes, and (c) divide the grid sizes exactly — so the kernel needs no bounds
     * checks. Also refreshes the cached device pointers to the momentum quadrature data.
     */
    void reinit()
    {
      device_data_size = grid_sizes[0] * grid_sizes[1];

      ptr_x_quadrature_p = quadrature_provider.get_device_points<ctype>(grid_sizes[0]);
      ptr_x_quadrature_w = quadrature_provider.get_device_weights<ctype>(grid_sizes[0]);

      block_sizes = {max_block_size, max_block_size};
      // choose block sizes such that the size is both as close to max_block_size as possible and the individual sizes
      // are as close to each other as possible
      uint optimize_dim = 1;
      while (block_sizes[0] * block_sizes[1] > max_block_size || block_sizes[0] > grid_sizes[0] ||
             block_sizes[1] > grid_sizes[1]) {
        if (block_sizes[optimize_dim] > 1) block_sizes[optimize_dim]--;
        // shrink further until this dimension's block size divides the grid size evenly
        while (grid_sizes[optimize_dim] % block_sizes[optimize_dim] != 0)
          block_sizes[optimize_dim]--;
        optimize_dim = (optimize_dim + 1) % 2; // alternate which dimension is shrunk
      }

      // Exact division is guaranteed by the loop above.
      uint blocks1 = grid_sizes[0] / block_sizes[0];
      uint threads1 = block_sizes[0];
      uint blocks2 = grid_sizes[1] / block_sizes[1];
      uint threads2 = block_sizes[1];

      num_blocks = dim3(blocks1, blocks2);
      threads_per_block = dim3(threads1, threads2);
    }
97
    /**
     * @brief Set the temperature and typical energy scale of the integrator and recompute
     *        the Matsubara quadrature rule.
     *
     * @param T temperature
     * @param E typical energy scale; if 0 (the default), the scale is derived as 10 * T
     *          and the integrator keeps auto-tracking the RG scale k (see get()/request()).
     */
    void set_T(const ctype T, const ctype E = 0)
    {
      // Skip the rebuild when T is unchanged and the requested E is within 2.5% of the
      // current scale. NOTE(review): a call with E == 0 and unchanged T still falls through
      // and resets m_E to 10*T, discarding a previously manual E — confirm this is intended.
      if (is_close(T, m_T) && E != 0 && (std::abs(E - m_E) / std::max(E, m_E) < 2.5e-2)) return;

      m_T = T;
      // the default typical energy scale will default the matsubara size to 11.
      m_E = is_close(E, 0.) ? 10 * m_T : E;
      manual_E = !is_close(E, 0.); // nonzero E counts as a manual (sticky) energy scale

      // Number of Matsubara quadrature nodes depends on both T and the energy scale.
      grid_sizes[1] = quadrature_provider.get_matsubara_points<ctype>(m_T, m_E).size();

      ptr_matsubara_quadrature_p = quadrature_provider.get_device_matsubara_points<ctype>(m_T, m_E);
      ptr_matsubara_quadrature_w = quadrature_provider.get_device_matsubara_weights<ctype>(m_T, m_E);
      reinit();
    }
120
    /**
     * @brief Set the typical energy scale of the integrator and recompute the Matsubara
     *        quadrature rule, keeping the current temperature.
     *
     * @param E typical energy scale
     */
    void set_E(const ctype E) { set_T(m_T, E); }
127
129 : quadrature_provider(other.quadrature_provider), grid_sizes(other.grid_sizes),
130 device_data_size(other.device_data_size), ptr_x_quadrature_p(other.ptr_x_quadrature_p),
131 ptr_x_quadrature_w(other.ptr_x_quadrature_w), ptr_matsubara_quadrature_p(other.ptr_matsubara_quadrature_p),
132 ptr_matsubara_quadrature_w(other.ptr_matsubara_quadrature_w), x_extent(other.x_extent), m_T(other.m_T),
133 m_E(other.m_E), manual_E(other.manual_E),
134 pool(rmm::mr::get_current_device_resource(), (device_data_size / 256 + 1) * 256),
135 max_block_size(other.max_block_size)
136 {
137 block_sizes = other.block_sizes;
138 num_blocks = other.num_blocks;
139 threads_per_block = other.threads_per_block;
140 }
141
    /**
     * @brief Perform the integral at RG scale k and return the result synchronously.
     *
     * @param k RG scale
     * @param t additional arguments forwarded to KERNEL::kernel and KERNEL::constant
     * @return  KERNEL::constant(k, t...) plus the quadrature sum over the full grid
     */
    template <typename... T> NT get(const ctype k, const T &...t)
    {
      // Auto-track the energy scale: if no manual E was set, rebuild the Matsubara
      // quadrature whenever k drifts more than 2.5% away from the current scale.
      if (!manual_E && (std::abs(k - m_E) / std::max(k, m_E) > 2.5e-2)) {
        set_T(m_T, k);
        manual_E = false; // set_T flags a nonzero E as manual; undo that so tracking continues
      }

      const auto cuda_stream = cuda_stream_pool.get_stream();
      // Per-call scratch buffer from the pool resource, bound to this stream.
      rmm::device_uvector<NT> device_data(device_data_size, cuda_stream, &pool);
      gridreduce_1d_finiteTq0<ctype, d, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
          device_data.data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_matsubara_quadrature_p,
          ptr_matsubara_quadrature_w, x_extent, m_T, k, t...);
      check_cuda();
      // The reduction runs on the same stream as the kernel, so it sees the finished data.
      return KERNEL::constant(k, t...) + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), device_data.begin(),
                                                        device_data.end(), NT(0.), thrust::plus<NT>());
    }
158
    /**
     * @brief Launch the integral at RG scale k and return a deferred future for the result.
     *
     * The kernel is launched immediately on a pooled stream; the final device reduction is
     * performed when the future is evaluated (std::launch::deferred). The device buffer is
     * kept alive by a shared_ptr captured by value in the deferred task.
     * NOTE(review): the task also captures the stream handle by value — assumes the pooled
     * stream remains valid (and its reuse benign) until the future is consumed; confirm.
     *
     * @param k RG scale
     * @param t additional arguments forwarded to KERNEL::kernel and KERNEL::constant
     * @return  deferred future yielding KERNEL::constant(k, t...) plus the quadrature sum
     */
    template <typename... T> std::future<NT> request(const ctype k, const T &...t)
    {
      // Same auto-tracking of the energy scale as in get().
      if (!manual_E && (std::abs(k - m_E) / std::max(k, m_E) > 2.5e-2)) {
        set_T(m_T, k);
        manual_E = false;
      }

      const auto cuda_stream = cuda_stream_pool.get_stream();
      std::shared_ptr<rmm::device_uvector<NT>> device_data =
          std::make_shared<rmm::device_uvector<NT>>(device_data_size, cuda_stream, &pool);
      gridreduce_1d_finiteTq0<ctype, d, NT, KERNEL><<<num_blocks, threads_per_block, 0, cuda_stream.value()>>>(
          (*device_data).data(), ptr_x_quadrature_p, ptr_x_quadrature_w, ptr_matsubara_quadrature_p,
          ptr_matsubara_quadrature_w, x_extent, m_T, k, t...);
      check_cuda();
      // Evaluate the constant part eagerly, with the current arguments.
      const NT constant = KERNEL::constant(k, t...);

      return std::async(std::launch::deferred, [=, this]() {
        return constant + thrust::reduce(thrust::cuda::par.on(cuda_stream.value()), (*device_data).begin(),
                                         (*device_data).end(), NT(0.), thrust::plus<NT>());
      });
    }
180
181 private:
183
184 std::array<uint, 2> grid_sizes;
185 std::array<uint, 2> block_sizes;
186
188
193
195 ctype m_T, m_E;
197
201
202 using PoolMR = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
203 mutable PoolMR pool;
204 const rmm::cuda_stream_pool cuda_stream_pool;
205 };
206} // namespace DiFfRG
207
208#else
209
210#ifdef USE_CUDA
211
namespace DiFfRG
{
  // Host pass of a CUDA-enabled build (__CUDACC__ not defined but USE_CUDA set): only
  // forward-declare the GPU integrator; the definition comes from the device pass.
  template <int d, typename NT, typename KERNEL> class IntegratorFiniteTq0GPU;
}
216
217#else
218
220
namespace DiFfRG
{
  /**
   * @brief CPU fallback: when built without CUDA, IntegratorFiniteTq0GPU forwards to the
   *        TBB implementation so callers keep the same type name and interface.
   *
   * NOTE(review): this wrapper takes std::array<uint, 2> while the GPU version takes
   * std::array<uint, 1> — confirm the differing signature is intended.
   */
  template <int d, typename NT, typename KERNEL>
  class IntegratorFiniteTq0GPU : public IntegratorFiniteTq0TBB<d, NT, KERNEL>
  {
  public:
    // Real (coordinate) scalar type underlying NT.
    using ctype = typename get_type::ctype<NT>;

    IntegratorFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array<uint, 2> _grid_sizes,
                           const ctype x_extent, const ctype T, const uint max_block_size = 256)
        : IntegratorFiniteTq0TBB<d, NT, KERNEL>(quadrature_provider, _grid_sizes, x_extent, T, max_block_size)
    {
    }
  };
} // namespace DiFfRG
236
237#endif
238
239#endif
Definition integrator_finiteTq0_gpu.hh:47
const ctype * ptr_matsubara_quadrature_p
Definition integrator_finiteTq0_gpu.hh:191
const ctype * ptr_x_quadrature_p
Definition integrator_finiteTq0_gpu.hh:189
const rmm::cuda_stream_pool cuda_stream_pool
Definition integrator_finiteTq0_gpu.hh:204
void set_E(const ctype E)
Set the typical energy scale of the integrator and recompute the Matsubara quadrature rule.
Definition integrator_finiteTq0_gpu.hh:126
ctype m_E
Definition integrator_finiteTq0_gpu.hh:195
dim3 threads_per_block
Definition integrator_finiteTq0_gpu.hh:200
rmm::mr::pool_memory_resource< rmm::mr::device_memory_resource > PoolMR
Definition integrator_finiteTq0_gpu.hh:202
void set_T(const ctype T, const ctype E=0)
Set the temperature and typical energy scale of the integrator and recompute the Matsubara quadrature...
Definition integrator_finiteTq0_gpu.hh:105
std::array< uint, 2 > block_sizes
Definition integrator_finiteTq0_gpu.hh:185
const ctype * ptr_x_quadrature_w
Definition integrator_finiteTq0_gpu.hh:190
uint device_data_size
Definition integrator_finiteTq0_gpu.hh:187
PoolMR pool
Definition integrator_finiteTq0_gpu.hh:203
typename get_type::ctype< NT > ctype
Definition integrator_finiteTq0_gpu.hh:51
void reinit()
Definition integrator_finiteTq0_gpu.hh:70
bool manual_E
Definition integrator_finiteTq0_gpu.hh:196
const ctype x_extent
Definition integrator_finiteTq0_gpu.hh:194
dim3 num_blocks
Definition integrator_finiteTq0_gpu.hh:199
IntegratorFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 1 > grid_sizes, const ctype x_extent, const JSONValue &json)
Definition integrator_finiteTq0_gpu.hh:53
const uint max_block_size
Definition integrator_finiteTq0_gpu.hh:198
IntegratorFiniteTq0GPU(const IntegratorFiniteTq0GPU &other)
Definition integrator_finiteTq0_gpu.hh:128
std::array< uint, 2 > grid_sizes
Definition integrator_finiteTq0_gpu.hh:184
const ctype * ptr_matsubara_quadrature_w
Definition integrator_finiteTq0_gpu.hh:192
QuadratureProvider & quadrature_provider
Definition integrator_finiteTq0_gpu.hh:182
NT get(const ctype k, const T &...t)
Definition integrator_finiteTq0_gpu.hh:142
std::future< NT > request(const ctype k, const T &...t)
Definition integrator_finiteTq0_gpu.hh:159
IntegratorFiniteTq0GPU(QuadratureProvider &quadrature_provider, const std::array< uint, 1 > _grid_sizes, const ctype x_extent, const ctype T, const uint max_block_size=256)
Definition integrator_finiteTq0_gpu.hh:60
A wrapper around the boost json value class.
Definition json.hh:19
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:139
typename internal::_ctype< CT >::value ctype
Definition types.hh:106
Definition complex_math.hh:14
__global__ void gridreduce_1d_finiteTq0(NT *dest, const ctype *x_quadrature_p, const ctype *x_quadrature_w, const ctype *matsubara_quadrature_p, const ctype *matsubara_quadrature_w, const ctype x_extent, const ctype m_T, const ctype k, T... t)
Definition integrator_finiteTq0_gpu.hh:21
constexpr __forceinline__ __host__ __device__ NumberType powr(const NumberType x)
A compile-time evaluatable power function for whole number exponents.
Definition math.hh:45
bool __forceinline__ __host__ __device__ is_close(T1 a, T2 b, T3 eps_)
Function to evaluate whether two floats are equal to numerical precision. Tests for both relative and...
Definition math.hh:160
consteval NT S_d_prec(uint d)
Surface of a d-dimensional sphere (precompiled)
Definition math.hh:104
void check_cuda(std::string prefix="")
Check if a CUDA error occurred and print an error message if it did.
unsigned int uint
Definition utils.hh:22