/home/runner/work/DiFfRG_current/DiFfRG_current/DiFfRG/include/DiFfRG/physics/integration/quadrature_integrator.hh Source File#

DiFfRG: /home/runner/work/DiFfRG_current/DiFfRG_current/DiFfRG/include/DiFfRG/physics/integration/quadrature_integrator.hh Source File
DiFfRG
quadrature_integrator.hh
Go to the documentation of this file.
1#pragma once
2
3// DiFfRG
13
14namespace DiFfRG
15{
24 template <int dim, typename NT, typename KERNEL, typename ExecutionSpace>
25 requires(dim > 0)
27 {
28 public:
32 using ctype = typename get_type::ctype<NT>;
36 using execution_space = ExecutionSpace;
37
38 QuadratureIntegrator(QuadratureProvider &quadrature_provider, const std::array<size_t, dim> &_grid_size,
39 const std::array<ctype, dim> &grid_min, const std::array<ctype, dim> &grid_max,
40 const std::array<QuadratureType, dim> &quadrature_type)
41 : quadrature_provider(quadrature_provider)
42 {
43 for (size_t i = 0; i < dim; ++i) {
44 grid_size[i] = _grid_size[i];
45
46 nodes[i] = quadrature_provider.template nodes<ctype, typename ExecutionSpace::memory_space>(grid_size[i],
47 quadrature_type[i]);
48 weights[i] = quadrature_provider.template weights<ctype, typename ExecutionSpace::memory_space>(
49 grid_size[i], quadrature_type[i]);
50 }
51 set_grid_extents(grid_min, grid_max);
52 }
53
54 void set_grid_extents(const std::array<ctype, dim> &grid_min, const std::array<ctype, dim> &grid_max)
55 {
56 for (size_t i = 0; i < dim; ++i) {
57 grid_extents[0][i] = grid_min[i];
58 grid_extents[1][i] = grid_max[i];
59
60 grid_start[i] = grid_extents[0][i];
61 grid_scale[i] = (grid_extents[1][i] - grid_extents[0][i]);
62 }
63 }
64
65 template <typename... T>
66 requires is_valid_kernel<NT, KERNEL, ctype, dim, T...>
67 void get(NT &dest, const T &...t) const
68 {
69 // create an execution space
70 ExecutionSpace space;
71
72 if (!m_result_views_initialized) {
73 m_result_view = Kokkos::View<NT, typename ExecutionSpace::memory_space>("result");
74 m_result_host = Kokkos::create_mirror_view(m_result_view);
75 m_result_views_initialized = true;
76 }
77 get(space, m_result_view, t...);
78 Kokkos::deep_copy(space, m_result_host, m_result_view);
79 space.fence();
80 dest = m_result_host();
81 }
82
83 template <typename OT, typename... T>
84 requires(!std::is_same_v<OT, NT> && is_valid_kernel<NT, KERNEL, ctype, dim, T...>)
85 void get(OT &dest, const T &...t) const
86 {
87 ExecutionSpace space;
88 get(space, dest, t...);
89 }
90
91 template <typename OT, typename... T>
92 requires(!std::is_same_v<OT, NT> && is_valid_kernel<NT, KERNEL, ctype, dim, T...>)
93 void get(ExecutionSpace &space, OT &dest, const T &...t) const
94 {
95 const auto args = device::make_tuple(t...);
96
97 const auto &n = nodes;
98 const auto &w = weights;
99 const auto &start = grid_start;
100 const auto &scale = grid_scale;
101
102 auto functor = KOKKOS_LAMBDA(const device::array<size_t, dim> &idx, NT &update)
103 {
105 ctype weight = 1;
106 bool is_first = true;
107 for (size_t i = 0; i < dim; ++i) {
108 x[i] = Kokkos::fma(scale[i], n[i][idx[i]], start[i]);
109 weight *= w[i][idx[i]] * scale[i];
110 is_first &= idx[i] == 0;
111 }
112 device::apply([&](const auto &...iargs) { update += weight * KERNEL::kernel(iargs...); },
113 device::tuple_cat(x, args));
114 device::apply([&](const auto &...iargs) { update += is_first ? KERNEL::constant(iargs...) : NT(0); }, args);
115 };
116
117 Kokkos::parallel_reduce("QuadratureIntegral_" + std::to_string(dim) + "D", // name of the kernel
118 make_kokkos_nd_range<dim, ExecutionSpace>(space, {0}, grid_size),
120 }
121
122 template <typename view_type, typename Coordinates, typename... Args>
123 void map(ExecutionSpace &space, const view_type integral_view, const Coordinates &coordinates, const Args &...args)
124 {
126 extents[0] = integral_view.size();
127 for (int i = 0; i < dim; ++i)
128 extents[1 + i] = grid_size[i];
129
130 // Reuse cached view if large enough, otherwise reallocate (grow-only)
131 {
132 bool needs_realloc = false;
133 for (size_t i = 0; i < 1 + dim; ++i)
134 needs_realloc |= (extents[i] > m_cache_extents[i]);
135 if (needs_realloc) {
136 for (size_t i = 0; i < 1 + dim; ++i)
137 m_cache_extents[i] = std::max(m_cache_extents[i], extents[i]);
138 m_cache = make_kokkos_nd_view<1 + dim, NT, ExecutionSpace>("cache", m_cache_extents);
139 }
140 }
141 // Create a Restrict-tagged alias of the cache for no-alias optimization
142 const auto cache = KokkosNDViewRestrict<1 + dim, NT, ExecutionSpace>(m_cache);
143
144 const auto m_args = device::make_tuple(args...);
145
146 const auto &n = nodes;
147 const auto &w = weights;
148 const auto &start = grid_start;
149 const auto &scale = grid_scale;
150
151 auto functor = KOKKOS_LAMBDA(const device::array<size_t, 1 + dim> &idx)
152 {
153 // make subview
154 auto subview = device::apply([&](const auto &...i) { return Kokkos::subview(cache, i...); }, idx);
155
156 // get the position for the current index
157 const auto idx_v = coordinates.from_linear_index(idx[0]);
158 const auto pos = coordinates.forward(idx_v);
159
161 ctype weight = 1;
162 for (int i = 0; i < dim; ++i) {
163 x[i] = Kokkos::fma(scale[i], n[i][idx[1 + i]], start[i]);
164 weight *= w[i][idx[1 + i]] * scale[i];
165 }
166
167 // make a tuple of all arguments
168 const auto full_args = device::tuple_cat(x, pos, m_args);
169
170 device::apply([&](const auto &...iargs) { subview() = weight * KERNEL::kernel(iargs...); }, full_args);
171 };
172
173 Kokkos::parallel_for(make_kokkos_nd_range<1 + dim, ExecutionSpace>(space, {0}, extents),
175
176 using TeamType = Kokkos::TeamPolicy<ExecutionSpace>::member_type;
177 // reduction with vector lanes for warp-level parallelism
178 constexpr int vector_width = 32;
179 Kokkos::parallel_for(
180 Kokkos::TeamPolicy(space, integral_view.size(), Kokkos::AUTO, vector_width),
181 KOKKOS_CLASS_LAMBDA(const TeamType &team) {
182 // get the current (continuous) index
183 const uint k = team.league_rank();
184
185 if (k > integral_view.size()) return;
186
187 // no-ops to capture
188 (void)cache;
189 (void)grid_size;
190
191 // Flatten grid_size into total element count for thread+vector splitting
192 size_t total_elements = 1;
193 for (int d = 0; d < dim; ++d)
194 total_elements *= grid_size[d];
195
196 // Pre-compute stride array for index decomposition (avoids division/modulo in inner loop)
198 strides[dim - 1] = 1;
199 for (int d = dim - 2; d >= 0; --d)
200 strides[d] = strides[d + 1] * grid_size[d + 1];
201
202 NT res{};
203 Kokkos::parallel_reduce(
204 Kokkos::TeamThreadRange(team, (total_elements + vector_width - 1) / vector_width),
205 [&](const size_t outer, NT &team_update) {
206 NT vec_sum{};
207 Kokkos::parallel_reduce(
208 Kokkos::ThreadVectorRange(team, vector_width),
209 [&](const size_t inner, NT &vec_update) {
210 const size_t flat = outer * vector_width + inner;
211 if (flat < total_elements) {
212 // Convert flat index back to multi-dimensional using pre-computed strides
214 size_t remainder = flat;
215 for (int d = 0; d < dim; ++d) {
216 ridx[d] = remainder / strides[d];
217 remainder -= ridx[d] * strides[d];
218 }
219 device::apply([&](const auto &...iargs) { vec_update += cache(k, iargs...); }, ridx);
220 }
221 },
222 vec_sum);
223 team_update += vec_sum;
224 },
225 res);
226
227 // add the constant value (skip coordinate computation if kernel has no constant)
228 Kokkos::single(Kokkos::PerTeam(team), [&]() {
229 const auto idx = coordinates.from_linear_index(k);
230 const auto pos = coordinates.forward(idx);
231 const auto full_args = device::tuple_cat(pos, m_args);
232 integral_view(k) =
233 res + device::apply([&](const auto &...iargs) { return KERNEL::constant(iargs...); }, full_args);
234 });
235 });
236 }
237
238 template <typename Coordinates, typename... Args>
239 auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
240 {
241 // Take care of MPI distribution
242 const auto &node_distribution = AbstractIntegrator::node_distribution;
243 if (node_distribution.mpi_comm != MPI_COMM_NULL && node_distribution.total_size > 0) {
244 auto mpi_comm = node_distribution.mpi_comm;
245 const auto &nodes = node_distribution.nodes;
246 const auto &sizes = node_distribution.sizes;
247
248 // Check if the rank is contained in nodes
249 const size_t m_rank = DiFfRG::MPI::rank(mpi_comm);
250 // If not, return an empty execution space
251 if (std::find(nodes.begin(), nodes.end(), m_rank) == nodes.end()) return ExecutionSpace();
252
253 // Get the size of the current rank
254 const size_t rank_size = sizes[m_rank];
255 // Offset is the sum of all previous ranks
256 const size_t offset = std::accumulate(sizes.begin(), sizes.begin() + m_rank, 0);
257
258 // Create a SubCoordinates object
259 const auto sub_coordinates = SubCoordinates(coordinates, offset, rank_size);
260 // Offset the destination pointer
261 NT *dest_offset = dest + offset;
262
263 return map_dist(dest_offset, sub_coordinates, args...);
264 }
265
266 return map_dist(dest, coordinates, args...);
267 }
268
269 template <typename Coordinates, typename... Args>
270 auto map_dist(NT *dest, const Coordinates &coordinates, const Args &...args)
271 {
272 // create unmanaged host view for dest
273 auto dest_view = Kokkos::View<NT *, CPU_memory, Kokkos::MemoryUnmanaged>(dest, coordinates.size());
274
275 // Reuse cached device view if large enough, otherwise reallocate (grow-only)
276 if (m_dest_device_size < coordinates.size()) {
277 m_dest_device = Kokkos::View<NT *, ExecutionSpace>(Kokkos::view_alloc(space, "MapIntegrators_device_view"),
278 coordinates.size());
279 m_dest_device_size = coordinates.size();
280 }
281 auto dest_device_view =
282 Kokkos::View<NT *, ExecutionSpace>(m_dest_device, Kokkos::make_pair(size_t(0), coordinates.size()));
283
284 // run the map function
285 map(space, dest_device_view, coordinates, args...);
286
287 // copy the result from device to the unmanaged host view
288 Kokkos::deep_copy(space, dest_view, dest_device_view);
289
290 return space;
291 }
292
293 protected:
295
296 ExecutionSpace space;
301
304
305 // Persistent view caches to avoid per-call GPU memory allocation
307 mutable device::array<size_t, 1 + dim> m_cache_extents{};
308 mutable Kokkos::View<NT *, ExecutionSpace> m_dest_device;
309 mutable size_t m_dest_device_size = 0;
310 mutable Kokkos::View<NT, typename ExecutionSpace::memory_space> m_result_view;
311 mutable typename Kokkos::View<NT, typename ExecutionSpace::memory_space>::host_mirror_type m_result_host;
312 mutable bool m_result_views_initialized = false;
313 };
314
315 template <int dim, typename NT, typename KERNEL>
316 class QuadratureIntegrator<dim, NT, KERNEL, TBB_exec> : public QuadratureIntegrator<dim, NT, KERNEL, Threads_exec>
317 {
319
320 public:
325 using ctype = typename get_type::ctype<NT>;
327
328 QuadratureIntegrator(QuadratureProvider &quadrature_provider, const std::array<size_t, dim> _grid_size,
329 std::array<ctype, dim> grid_min, std::array<ctype, dim> grid_max,
330 const std::array<QuadratureType, dim> quadrature_type)
331 : Base(quadrature_provider, _grid_size, grid_min, grid_max, quadrature_type)
332 {
333 }
334
335 template <typename... T>
336 requires is_valid_kernel<NT, KERNEL, ctype, dim, T...>
337 void get(NT &dest, const T &...t) const
338 {
339 const auto args = device::tie(t...);
340
341 const auto &n = nodes;
342 const auto &w = weights;
343 const auto &start = grid_start;
344 const auto &scale = grid_scale;
345
346 auto functor = [&](const device::array<size_t, dim> &idx) {
348 ctype weight = 1;
349 bool is_first = true;
350 for (size_t i = 0; i < dim; ++i) {
351 x[i] = Kokkos::fma(scale[i], n[i][idx[i]], start[i]);
352 weight *= w[i][idx[i]] * scale[i];
353 is_first &= idx[i] == 0;
354 }
355 return device::apply([&](const auto &...iargs) { return weight * KERNEL::kernel(iargs...); },
356 device::tuple_cat(x, args));
357 };
358
359 dest = KERNEL::constant(t...) + TBBReduction<dim, NT, decltype(functor)>(grid_size, functor);
360 }
361
362 template <typename Coordinates, typename... Args>
363 void map(execution_space &, NT *dest, const Coordinates &coordinates, const Args &...args)
364 {
365 const auto m_args = device::tie(args...);
366
367 tbb::parallel_for(tbb::blocked_range<uint>(0, coordinates.size()), [&](const tbb::blocked_range<uint> &r) {
368 for (uint idx = r.begin(); idx != r.end(); ++idx) {
369 const auto dis_idx = coordinates.from_linear_index(idx);
370 const auto pos = coordinates.forward(dis_idx);
371 // make a tuple of all arguments
372 const auto full_args = device::tuple_cat(pos, m_args);
373 device::apply([&](const auto &...iargs) { get(dest[idx], iargs...); }, full_args);
374 }
375 });
376 }
377
378 template <typename Coordinates, typename... Args>
379 auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
380 {
381 auto space = execution_space();
382
383 // Take care of MPI distribution
384 const auto &node_distribution = AbstractIntegrator::node_distribution;
385 if (node_distribution.mpi_comm != MPI_COMM_NULL && node_distribution.total_size > 0) {
386 auto mpi_comm = node_distribution.mpi_comm;
387 const auto &nodes = node_distribution.nodes;
388 const auto &sizes = node_distribution.sizes;
389
390 // Check if the rank is contained in nodes
391 const size_t m_rank = DiFfRG::MPI::rank(mpi_comm);
392 // If not, return an empty execution space
393 if (std::find(nodes.begin(), nodes.end(), m_rank) == nodes.end()) return execution_space();
394
395 // Get the size of the current rank
396 const size_t rank_size = sizes[m_rank];
397 // Offset is the sum of all previous ranks
398 const size_t offset = std::accumulate(sizes.begin(), sizes.begin() + m_rank, 0);
399
400 // Create a SubCoordinates object
401 const auto sub_coordinates = SubCoordinates(coordinates, offset, rank_size);
402 // Offset the destination pointer
403 NT *dest_offset = dest + offset;
404
405 map(space, dest_offset, sub_coordinates, args...);
406 } else
407 map(space, dest, coordinates, args...);
408 return space;
409 }
410
411 protected:
412 using Base::grid_extents;
413 using Base::grid_scale;
414 using Base::grid_size;
415 using Base::grid_start;
416 using Base::quadrature_provider;
417
418 using Base::nodes;
419 using Base::weights;
420 };
421} // namespace DiFfRG
Definition abstract_integrator.hh:64
NodeDistribution node_distribution
Definition abstract_integrator.hh:73
auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
Definition quadrature_integrator.hh:379
void get(NT &dest, const T &...t) const
Definition quadrature_integrator.hh:337
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition quadrature_integrator.hh:325
void map(execution_space &, NT *dest, const Coordinates &coordinates, const Args &...args)
Definition quadrature_integrator.hh:363
QuadratureIntegrator(QuadratureProvider &quadrature_provider, const std::array< size_t, dim > _grid_size, std::array< ctype, dim > grid_min, std::array< ctype, dim > grid_max, const std::array< QuadratureType, dim > quadrature_type)
Definition quadrature_integrator.hh:328
This class performs numerical integration over a d-dimensional hypercube using quadrature rules.
Definition quadrature_integrator.hh:27
Kokkos::View< NT, typename ExecutionSpace::memory_space > m_result_view
Definition quadrature_integrator.hh:310
ExecutionSpace execution_space
Execution space to be used for the integration, e.g. GPU_exec, TBB_exec.
Definition quadrature_integrator.hh:36
Kokkos::View< NT *, ExecutionSpace > m_dest_device
Definition quadrature_integrator.hh:308
void get(NT &dest, const T &...t) const
Definition quadrature_integrator.hh:67
void set_grid_extents(const std::array< ctype, dim > &grid_min, const std::array< ctype, dim > &grid_max)
Definition quadrature_integrator.hh:54
device::array< device::array< ctype, dim >, 2 > grid_extents
Definition quadrature_integrator.hh:298
QuadratureIntegrator(QuadratureProvider &quadrature_provider, const std::array< size_t, dim > &_grid_size, const std::array< ctype, dim > &grid_min, const std::array< ctype, dim > &grid_max, const std::array< QuadratureType, dim > &quadrature_type)
Definition quadrature_integrator.hh:38
Kokkos::View< NT, typenameExecutionSpace::memory_space >::host_mirror_type m_result_host
Definition quadrature_integrator.hh:311
ExecutionSpace space
Definition quadrature_integrator.hh:296
void map(ExecutionSpace &space, const view_type integral_view, const Coordinates &coordinates, const Args &...args)
Definition quadrature_integrator.hh:123
typename get_type::ctype< NT > ctype
Numerical type to be used for integration tasks e.g. the argument or possible jacobians.
Definition quadrature_integrator.hh:32
KokkosNDView< 1+dim, NT, ExecutionSpace > m_cache
Definition quadrature_integrator.hh:306
device::array< Kokkos::View< const ctype *, typename ExecutionSpace::memory_space >, dim > nodes
Definition quadrature_integrator.hh:302
device::array< Kokkos::View< const ctype *, typename ExecutionSpace::memory_space >, dim > weights
Definition quadrature_integrator.hh:303
device::array< ctype, dim > grid_scale
Definition quadrature_integrator.hh:300
QuadratureProvider & quadrature_provider
Definition quadrature_integrator.hh:297
device::array< ctype, dim > grid_start
Definition quadrature_integrator.hh:299
device::array< size_t, dim > grid_size
Definition quadrature_integrator.hh:294
auto map_dist(NT *dest, const Coordinates &coordinates, const Args &...args)
Definition quadrature_integrator.hh:270
auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
Definition quadrature_integrator.hh:239
void get(OT &dest, const T &...t) const
Definition quadrature_integrator.hh:85
void get(ExecutionSpace &space, OT &dest, const T &...t) const
Definition quadrature_integrator.hh:93
A class that provides quadrature points and weights, in host and device memory. The quadrature points...
Definition quadrature_provider.hh:137
Definition coordinates.hh:149
Definition abstract_integrator.hh:51
uint rank(MPI_Comm comm)
std::array< T, N > array
Definition kokkos.hh:133
typename internal::_ctype< CT >::value ctype
Definition types.hh:134
Definition complex_math.hh:10
Kokkos::View< typename GetKokkosNDStarType< dim, T >::type, ExecutionSpace > KokkosNDView
Definition kokkos.hh:160
auto make_kokkos_nd_range(ExecutionSpace &space, const device::array< size_t, dim > start, const device::array< size_t, dim > end)
Definition kokkos.hh:227
Kokkos::View< typename GetKokkosNDStarType< dim, T >::type, ExecutionSpace, Kokkos::MemoryTraits< Kokkos::Restrict > > KokkosNDViewRestrict
Definition kokkos.hh:165
constexpr auto & get(named_tuple< tuple_type, strSet > &ob)
get a reference to the element with the given name
Definition tuples.hh:111
unsigned int uint
Definition utils.hh:24
NT TBBReduction(const device::array< size_t, dim > &grid_size, const FUN &functor)
Definition tbb.hh:91
auto make_kokkos_nd_view(const std::string &label, const device::array< size_t, dim > &extents)
Definition kokkos.hh:181
ExecutionSpaces::TBB_exec_space TBB_exec
Definition kokkos.hh:49
This is a functor which wraps a lambda for reduction. Basically, this is necessary when one wants to ...
Definition kokkos.hh:314
This is a functor which wraps a lambda. Basically, this is necessary when one wants to call a variadi...
Definition kokkos.hh:288
This execution space is optimal when used in conjunction with the FE discretizations.
Definition kokkos.hh:23