quadrature_integrator_fT.hh Source File#

DiFfRG: /home/runner/work/DiFfRG_current/DiFfRG_current/DiFfRG/include/DiFfRG/physics/integration/finiteT/quadrature_integrator_fT.hh Source File
DiFfRG
Go to the documentation of this file.
#pragma once
 
// DiFfRG
#include <DiFfRG/common/kokkos.hh>
#include <DiFfRG/common/mpi.hh>
#include <DiFfRG/common/quadrature/quadrature_provider.hh>
#include <DiFfRG/common/tbb.hh>
#include <DiFfRG/common/tuples.hh>
#include <DiFfRG/common/types.hh>
#include <DiFfRG/common/utils.hh>
#include <DiFfRG/discretization/coordinates/coordinates.hh>
#include <DiFfRG/physics/integration/abstract_integrator.hh>
 
namespace DiFfRG
{
  template <int dim, typename NT, typename KERNEL, typename ExecutionSpace>
    requires(dim > 0)
  class QuadratureIntegrator_fT : public AbstractIntegrator
  {
  public:
    using ctype = typename get_type::ctype<NT>;
    using execution_space = ExecutionSpace;
 
    static constexpr int sdim = dim - 1;
 
    /**
     * @brief Set up the spatial quadrature grid and the Matsubara frequency grid.
     *
     * @param quadrature_provider Source of all quadrature nodes/weights; kept by reference.
     * @param _grid_size Number of quadrature points in each of the sdim spatial directions.
     * @param grid_min Lower integration bound per spatial direction.
     * @param grid_max Upper integration bound per spatial direction.
     * @param quadrature_type Quadrature rule requested from the provider per spatial direction.
     * @param T Passed to the provider's matsubara_* queries.
     * @param typical_E Passed to the provider's matsubara_* queries.
     */
    QuadratureIntegrator_fT(QuadratureProvider &quadrature_provider, const std::array<size_t, sdim> _grid_size,
                            std::array<ctype, sdim> grid_min, std::array<ctype, sdim> grid_max,
                            const std::array<QuadratureType, sdim> quadrature_type, const ctype T = 1,
                            const ctype typical_E = 1)
        : quadrature_provider(quadrature_provider), T(T), typical_E(typical_E)
    {
      using memory_space = typename ExecutionSpace::memory_space;

      // Matsubara direction: nodes, weights and the prefactor used for the zero mode.
      matsubara_nodes = quadrature_provider.template matsubara_nodes<ctype, memory_space>(T, typical_E);
      matsubara_weights = quadrature_provider.template matsubara_weights<ctype, memory_space>(T, typical_E);
      matsubara_sum_T = quadrature_provider.template matsubara_T<ctype>(T, typical_E);

      // Spatial directions: remember the point count and fetch the matching rule.
      for (int d = 0; d < sdim; ++d) {
        grid_size[d] = _grid_size[d];
        nodes[d] = quadrature_provider.template nodes<ctype, memory_space>(grid_size[d], quadrature_type[d]);
        weights[d] = quadrature_provider.template weights<ctype, memory_space>(grid_size[d], quadrature_type[d]);
      }

      set_grid_extents(grid_min, grid_max);

      // The last grid direction enumerates the Matsubara nodes.
      grid_size[dim - 1] = matsubara_nodes.size();
    }
 
    void set_grid_extents(const std::array<ctype, sdim> grid_min, const std::array<ctype, sdim> grid_max)
    {
      for (int d = 0; d < sdim; ++d) {
        grid_extents[0][d] = grid_min[d];
        grid_extents[1][d] = grid_max[d];
      }
      for (int i = 0; i < sdim; ++i) {
        grid_start[i] = grid_extents[0][i];
        grid_scale[i] = (grid_extents[1][i] - grid_extents[0][i]);
      }
    }
 
    void set_T(const ctype T)
    {
      this->T = T;
      matsubara_nodes =
          quadrature_provider.template matsubara_nodes<ctype, typename ExecutionSpace::memory_space>(T, typical_E);
      matsubara_weights =
          quadrature_provider.template matsubara_weights<ctype, typename ExecutionSpace::memory_space>(T, typical_E);
      matsubara_sum_T = quadrature_provider.template matsubara_T<ctype>(T, typical_E);
      grid_size[dim - 1] = matsubara_nodes.size();
    }
 
    void set_typical_E(const ctype typical_E)
    {
      if (is_close(this->typical_E, typical_E, 1e-4 * T + std::numeric_limits<ctype>::epsilon() * 10)) return;
 
      this->typical_E = typical_E;
      matsubara_nodes =
          quadrature_provider.template matsubara_nodes<ctype, typename ExecutionSpace::memory_space>(T, typical_E);
      matsubara_weights =
          quadrature_provider.template matsubara_weights<ctype, typename ExecutionSpace::memory_space>(T, typical_E);
      matsubara_sum_T = quadrature_provider.template matsubara_T<ctype>(T, typical_E);
      grid_size[dim - 1] = matsubara_nodes.size();
    }
 
    /**
     * @brief Compute the integral and write the value into a host-side NT.
     *
     * The reduction is run into a rank-0 view, copied into its host mirror, and read out after
     * fencing the execution space. Both views are created on the first call and reused afterwards.
     *
     * @param dest Receives the value of the integral.
     * @param t Additional arguments forwarded to the kernel.
     */
    template <typename... T> void get(NT &dest, const T &...t) const
    {
      ExecutionSpace exec_space;

      // First call only: allocate the scalar result view and its host mirror.
      if (!m_result_views_initialized) {
        m_result_view = decltype(m_result_view)("result");
        m_result_host = Kokkos::create_mirror_view(m_result_view);
        m_result_views_initialized = true;
      }

      // Reduce into the view, bring the value to the host, and wait for both to finish.
      get(exec_space, m_result_view, t...);
      Kokkos::deep_copy(exec_space, m_result_host, m_result_view);
      exec_space.fence();

      dest = m_result_host();
    }
 
    /**
     * @brief Compute the integral into a destination whose type is not NT.
     *
     * Default-constructs an ExecutionSpace and forwards to the overload that takes an execution
     * space explicitly. No fence is issued in this function.
     * NOTE(review): presumably dest is a Kokkos view and the caller synchronises before reading it — confirm.
     *
     * @param dest Reduction target handed on unchanged.
     * @param t Additional arguments forwarded to the kernel.
     */
    template <typename OT, typename... T>
      requires(!std::is_same_v<OT, NT>)
    void get(OT &dest, const T &...t) const
    {
      ExecutionSpace space;
      get(space, dest, t...);
    }
 
    /**
     * @brief Launch the reduction of the integrand over the full dim-dimensional grid.
     *
     * For every multi-index the first sdim entries select the spatial quadrature points and the
     * last entry selects a Matsubara node. Each grid point contributes
     *   weight * ( wt * (kernel(x..., +xt, args...) + kernel(x..., -xt, args...)) + zero mode ),
     * where the zero-mode term m_T * kernel(x..., 0, args...) is only added for points whose last
     * index is 0. KERNEL::constant(args...) is added exactly once, at the all-zero multi-index.
     *
     * No fence is issued in this function.
     *
     * @param space Execution space the reduction is dispatched on.
     * @param dest Reduction target passed to Kokkos::parallel_reduce (any type other than NT).
     * @param t Additional arguments forwarded to KERNEL::kernel and KERNEL::constant.
     */
    template <typename OT, typename... Args>
      requires(!std::is_same_v<OT, NT>)
    void get(ExecutionSpace &space, OT &dest, const Args &...t) const
    {
      // Pack the extra kernel arguments into a tuple used inside the lambda.
      const auto args = device::make_tuple(t...);
 
      // Local names for the members used in the lambda.
      // NOTE(review): presumably so the lambda captures these objects rather than `this` — confirm.
      const auto &n = nodes;
      const auto &w = weights;
      const auto &m_n = matsubara_nodes;
      const auto &m_w = matsubara_weights;
      const auto &start = grid_start;
      const auto &scale = grid_scale;
 
      const auto &m_T = matsubara_sum_T;
 
      auto functor = KOKKOS_LAMBDA(const device::array<size_t, dim> &idx, NT &update)
      {
        device::array<ctype, sdim> x;
        ctype weight = 1;
        // is_first ends up true only if every entry of idx is zero
        bool is_first = true;
        for (int i = 0; i < sdim; ++i) {
          // map the quadrature node to the integration interval: x = start + scale * node
          x[i] = Kokkos::fma(scale[i], n[i][idx[i]], start[i]);
          weight *= w[i][idx[i]] * scale[i];
          is_first &= idx[i] == 0;
        }
        is_first &= idx[dim - 1] == 0;
        // Matsubara node and weight selected by the last index
        const ctype xt = m_n[idx[dim - 1]];
        const ctype wt = m_w[idx[dim - 1]];
        device::apply(
            [&](const auto &...iargs) {
              device::apply(
                  [&](const auto &...posargs) {
                    update +=
                        weight *
                        (
                            // positive and negative Matsubara frequencies
                            wt * (KERNEL::kernel(posargs..., xt, iargs...) + KERNEL::kernel(posargs..., -xt, iargs...))
                            // The zero mode (once per matsubara sum)
                            + (idx[dim - 1] != 0 ? NT{} : m_T * KERNEL::kernel(posargs..., (ctype)0, iargs...)));
                  },
                  x);
            },
            args);
        // the constant part of the kernel is added once for the whole grid
        device::apply([&](const auto &...iargs) { update += is_first ? KERNEL::constant(iargs...) : NT(0); }, args);
      };
 
      Kokkos::parallel_reduce("QuadratureIntegral_fT_" + std::to_string(dim) + "D", // name of the kernel
                              make_kokkos_nd_range<dim, ExecutionSpace>(space, {0}, grid_size),
                              KokkosNDLambdaWrapperReduction<dim, decltype(functor)>(functor), dest);
    }
 
    /**
     * @brief Evaluate the integral for every point of `coordinates` and store the results in a view.
     *
     * Works in two kernels dispatched on `space`:
     *  1. every (coordinate point, grid point) pair writes its weighted integrand value into a
     *     cache view of extents (integral_view.size(), grid_size...);
     *  2. one team per coordinate point sums its slice of the cache and adds KERNEL::constant.
     *
     * The cache is kept between calls and only reallocated when a larger extent is needed.
     * No fence is issued in this function.
     *
     * @param space Execution space both kernels are dispatched on.
     * @param integral_view Output view; entry k receives the integral for coordinate point k.
     * @param coordinates Provides from_linear_index() and forward() to obtain the position of point k.
     * @param args Additional arguments passed to the kernel after the position.
     */
    template <typename view_type, typename Coordinates, typename... Args>
    void map(ExecutionSpace &space, const view_type integral_view, const Coordinates &coordinates, const Args &...args)
    {
      // Extents of the work range: coordinate points first, then the integration grid.
      device::array<size_t, 1 + dim> extents;
      extents[0] = integral_view.size();
      for (int i = 0; i < dim; ++i)
        extents[1 + i] = grid_size[i];

      // Reuse cached view if large enough, otherwise reallocate (grow-only)
      {
        bool needs_realloc = false;
        for (size_t i = 0; i < 1 + dim; ++i)
          needs_realloc |= (extents[i] > m_cache_extents[i]);
        if (needs_realloc) {
          for (size_t i = 0; i < 1 + dim; ++i)
            m_cache_extents[i] = std::max(m_cache_extents[i], extents[i]);
          m_cache = make_kokkos_nd_view<1 + dim, NT, ExecutionSpace>("cache", m_cache_extents);
        }
      }
      // Create a Restrict-tagged alias of the cache for no-alias optimization
      const auto cache = KokkosNDViewRestrict<1 + dim, NT, ExecutionSpace>(m_cache);

      const auto m_args = device::make_tuple(args...);

      const auto &n = nodes;
      const auto &w = weights;
      const auto &m_n = matsubara_nodes;
      const auto &m_w = matsubara_weights;
      const auto &start = grid_start;
      const auto &scale = grid_scale;

      const auto &m_T = matsubara_sum_T;

      // Kernel 1: idx[0] is the coordinate point, idx[1..dim] the grid multi-index.
      auto functor = KOKKOS_LAMBDA(const device::array<size_t, 1 + dim> &idx)
      {
        // make subview
        auto subview = device::apply([&](const auto &...i) { return Kokkos::subview(cache, i...); }, idx);

        // get the position for the current index
        const auto idx_v = coordinates.from_linear_index(idx[0]);
        const auto pos = coordinates.forward(idx_v);
        // make a tuple of all arguments
        const auto full_args = device::tuple_cat(pos, m_args);

        device::array<ctype, sdim> x;
        ctype weight = 1;
        for (int i = 0; i < sdim; ++i) {
          x[i] = Kokkos::fma(scale[i], n[i][idx[1 + i]], start[i]);
          weight *= w[i][idx[1 + i]] * scale[i];
        }
        const ctype xt = m_n[idx[1 + dim - 1]];
        const ctype wt = m_w[idx[1 + dim - 1]];
        device::apply(
            [&](const auto &...iargs) {
              device::apply(
                  [&](const auto &...posargs) {
                    subview() =
                        weight *
                        (
                            // positive and negative Matsubara frequencies
                            wt * (KERNEL::kernel(posargs..., xt, iargs...) + KERNEL::kernel(posargs..., -xt, iargs...))
                            // The zero mode (once per matsubara sum)
                            + (idx[1 + dim - 1] != 0 ? NT{} : m_T * KERNEL::kernel(posargs..., (ctype)0, iargs...)));
                  },
                  x);
            },
            full_args);
      };

      Kokkos::parallel_for(make_kokkos_nd_range<1 + dim, ExecutionSpace>(space, {0}, extents),
                           KokkosNDLambdaWrapper<1 + dim, decltype(functor)>(functor));

      // `typename` is spelled out for the dependent member type.
      using TeamType = typename Kokkos::TeamPolicy<ExecutionSpace>::member_type;
      // Kernel 2: reduction with vector lanes for warp-level parallelism
      Kokkos::parallel_for(
          Kokkos::TeamPolicy(space, integral_view.size(), Kokkos::AUTO, 32), KOKKOS_CLASS_LAMBDA(const TeamType &team) {
            // one team (league rank) per coordinate point
            const size_t k = team.league_rank();

            // valid indices are 0 .. size() - 1, so size() itself must be rejected as well
            if (k >= integral_view.size()) return;

            // no-ops to capture
            (void)cache;
            (void)grid_size;

            // Flatten grid_size into total element count for thread+vector splitting
            size_t total_elements = 1;
            for (int d = 0; d < dim; ++d)
              total_elements *= grid_size[d];

            NT res{};
            Kokkos::parallel_reduce(
                // chunks of 32 flat indices per team thread, rounded up
                Kokkos::TeamThreadRange(team, (total_elements + 31) / 32),
                [&](const size_t outer, NT &team_update) {
                  NT vec_sum{};
                  Kokkos::parallel_reduce(
                      Kokkos::ThreadVectorRange(team, 32),
                      [&](const size_t inner, NT &vec_update) {
                        const size_t flat = outer * 32 + inner;
                        // the last chunk may be only partially filled
                        if (flat < total_elements) {
                          // Convert flat index back to multi-dimensional
                          device::array<size_t, dim> ridx;
                          size_t remainder = flat;
                          for (int d = dim - 1; d >= 0; --d) {
                            ridx[d] = remainder % grid_size[d];
                            remainder /= grid_size[d];
                          }
                          device::apply([&](const auto &...iargs) { vec_update += cache(k, iargs...); }, ridx);
                        }
                      },
                      vec_sum);
                  team_update += vec_sum;
                },
                res);

            // add the constant part of the kernel and store the result (once per team)
            Kokkos::single(Kokkos::PerTeam(team), [&]() {
              const auto idx = coordinates.from_linear_index(k);
              const auto pos = coordinates.forward(idx);
              const auto full_args = device::tuple_cat(pos, m_args);
              integral_view(k) =
                  res + device::apply([&](const auto &...iargs) { return KERNEL::constant(iargs...); }, full_args);
            });
          });
    }
 
    /**
     * @brief Evaluate the integral for all coordinate points into host memory, honouring the MPI distribution.
     *
     * If the integrator's node distribution has a valid communicator and a non-zero total size,
     * only the slice of `coordinates` assigned to the calling rank is computed and written at the
     * matching offset of `dest`. Ranks that are not listed in the distribution do no work.
     * Otherwise the full coordinate range is computed.
     *
     * @param dest Host array receiving the results.
     * @param coordinates Coordinate system whose points are mapped.
     * @param args Additional arguments passed to the kernel after the position.
     * @return The execution space the work was dispatched on (a default-constructed one if this
     *         rank has nothing to do).
     */
    template <typename Coordinates, typename... Args>
    auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
    {
      // Take care of MPI distribution
      const auto &node_distribution = AbstractIntegrator::node_distribution;
      if (node_distribution.mpi_comm != MPI_COMM_NULL && node_distribution.total_size > 0) {
        auto mpi_comm = node_distribution.mpi_comm;
        const auto &nodes = node_distribution.nodes;
        const auto &sizes = node_distribution.sizes;

        // Check if the rank is contained in nodes
        const size_t m_rank = DiFfRG::MPI::rank(mpi_comm);
        // If not, return an empty execution space
        if (std::find(nodes.begin(), nodes.end(), m_rank) == nodes.end()) return ExecutionSpace();

        // Get the size of the current rank
        const size_t rank_size = sizes[m_rank];
        // Offset is the sum of all previous ranks. The initial value fixes the accumulator type,
        // so it has to be a size_t: a plain 0 would sum in int.
        const size_t offset = std::accumulate(sizes.begin(), sizes.begin() + m_rank, size_t{0});

        // Create a SubCoordinates object
        const auto sub_coordinates = SubCoordinates(coordinates, offset, rank_size);
        // Offset the destination pointer
        NT *dest_offset = dest + offset;

        return map_dist(dest_offset, sub_coordinates, args...);
      }

      return map_dist(dest, coordinates, args...);
    }
 
    /**
     * @brief Run the view-based map on the member execution space and copy the result to host memory.
     *
     * A device buffer is kept between calls and only reallocated when more points are requested
     * than it currently holds. No fence is issued in this function.
     *
     * @param dest Host array with room for coordinates.size() entries.
     * @param coordinates Coordinate system whose points are mapped.
     * @param args Additional arguments passed to the kernel after the position.
     * @return The member execution space the kernels and the copy were enqueued on.
     */
    template <typename Coordinates, typename... Args>
    auto map_dist(NT *dest, const Coordinates &coordinates, const Args &...args)
    {
      const auto n_points = coordinates.size();

      // Wrap the caller's memory; the view does not own it.
      Kokkos::View<NT *, CPU_memory, Kokkos::MemoryUnmanaged> host_result(dest, n_points);

      // Grow the persistent device buffer only if it is too small.
      if (m_dest_device_size < n_points) {
        m_dest_device =
            Kokkos::View<NT *, ExecutionSpace>(Kokkos::view_alloc(space, "MapIntegrators_device_view"), n_points);
        m_dest_device_size = n_points;
      }

      // Restrict the (possibly larger) buffer to the number of points of this call.
      Kokkos::View<NT *, ExecutionSpace> device_result(m_dest_device, Kokkos::make_pair(size_t(0), n_points));

      // Compute on the device, then transfer into the caller's memory.
      map(space, device_result, coordinates, args...);
      Kokkos::deep_copy(space, host_result, device_result);

      return space;
    }
 
  protected:
    // Execution space instance used by map_dist for kernels and copies.
    ExecutionSpace space;
    // Source of all quadrature nodes and weights.
    QuadratureProvider &quadrature_provider;
    // Integration bounds of the spatial directions: [0] holds the lower, [1] the upper bounds.
    device::array<device::array<ctype, sdim>, 2> grid_extents;
    // Per spatial direction: lower bound, added to the scaled quadrature node.
    device::array<ctype, sdim> grid_start;
    // Per spatial direction: upper minus lower bound, multiplies the quadrature node and weight.
    device::array<ctype, sdim> grid_scale;
 
    // Number of grid points per direction; the last entry is the number of Matsubara nodes.
    device::array<size_t, dim> grid_size;
 
    // Quadrature nodes and weights of the spatial directions, as obtained from the provider.
    device::array<Kokkos::View<const ctype *, typename ExecutionSpace::memory_space>, sdim> nodes;
    device::array<Kokkos::View<const ctype *, typename ExecutionSpace::memory_space>, sdim> weights;
 
    // Parameters passed to the provider's matsubara_* queries.
    ctype T, typical_E;
    // Value returned by the provider's matsubara_T; prefactor of the zero-mode term.
    ctype matsubara_sum_T;
 
    // Matsubara nodes and weights, as obtained from the provider.
    Kokkos::View<const ctype *, typename ExecutionSpace::memory_space> matsubara_nodes;
    Kokkos::View<const ctype *, typename ExecutionSpace::memory_space> matsubara_weights;
 
    // Persistent view caches to avoid per-call GPU memory allocation
    // m_cache: integrand values per (coordinate point, grid point) used by map; m_cache_extents: its allocated extents.
    mutable KokkosNDView<1 + dim, NT, ExecutionSpace> m_cache;
    mutable device::array<size_t, 1 + dim> m_cache_extents{};
    // Device-side result buffer of map_dist and the number of entries it holds.
    mutable Kokkos::View<NT *, ExecutionSpace> m_dest_device;
    mutable size_t m_dest_device_size = 0;
    // Rank-0 result view and host mirror used by get(NT&, ...); created on first use.
    mutable Kokkos::View<NT, typename ExecutionSpace::memory_space> m_result_view;
    mutable typename Kokkos::View<NT, typename ExecutionSpace::memory_space>::host_mirror_type m_result_host;
    mutable bool m_result_views_initialized = false;
  };
 
  template <int dim, typename NT, typename KERNEL>
  class QuadratureIntegrator_fT<dim, NT, KERNEL, TBB_exec>
      : public QuadratureIntegrator_fT<dim, NT, KERNEL, Threads_exec>
  {
    using Base = QuadratureIntegrator_fT<dim, NT, KERNEL, Threads_exec>;
 
  public:
    using ctype = typename get_type::ctype<NT>;
    using execution_space = TBB_exec;
 
    static constexpr int sdim = dim - 1; // spatial dimension
 
    /**
     * @brief Construct the TBB integrator; all arguments are forwarded unchanged to the
     *        Threads_exec base class, which sets up the quadrature data.
     *
     * @param quadrature_provider Source of all quadrature nodes/weights.
     * @param _grid_size Number of quadrature points in each of the sdim spatial directions.
     * @param grid_min Lower integration bound per spatial direction.
     * @param grid_max Upper integration bound per spatial direction.
     * @param quadrature_type Quadrature rule per spatial direction.
     * @param T Forwarded to the base class (default 1).
     * @param typical_E Forwarded to the base class (default 1).
     */
    QuadratureIntegrator_fT(QuadratureProvider &quadrature_provider, const std::array<size_t, sdim> _grid_size,
                            std::array<ctype, sdim> grid_min, std::array<ctype, sdim> grid_max,
                            const std::array<QuadratureType, sdim> quadrature_type, const ctype T = 1,
                            const ctype typical_E = 1)
        : Base(quadrature_provider, _grid_size, grid_min, grid_max, quadrature_type, T, typical_E)
    {
    }
 
    /**
     * @brief Compute the integral with a TBB reduction and write it into dest.
     *
     * The per-grid-point functor returns
     *   weight * ( wt * (kernel(x..., +xt, args...) + kernel(x..., -xt, args...)) + zero mode ),
     * where the zero-mode term m_T * kernel(x..., 0, args...) is only included for points whose
     * last (Matsubara) index is 0. The result is KERNEL::constant(t...) plus the reduction of the
     * functor over grid_size.
     *
     * @param dest Receives the value of the integral.
     * @param t Additional arguments forwarded to KERNEL::kernel and KERNEL::constant.
     */
    template <typename... Args>
      requires is_valid_kernel<NT, KERNEL, ctype, dim, Args...>
    void get(NT &dest, const Args &...t) const
    {
      // tuple of the extra kernel arguments, built with device::tie
      const auto args = device::tie(t...);
 
      const auto &n = nodes;
      const auto &w = weights;
      const auto &m_n = matsubara_nodes;
      const auto &m_w = matsubara_weights;
      const auto &start = grid_start;
      const auto &scale = grid_scale;
 
      const auto &m_T = matsubara_sum_T;
 
      auto functor = [&](const device::array<size_t, dim> &idx) {
        device::array<ctype, sdim> x;
        ctype weight = 1;
        for (int i = 0; i < sdim; ++i) {
          // map the quadrature node to the integration interval: x = start + scale * node
          x[i] = Kokkos::fma(scale[i], n[i][idx[i]], start[i]);
          weight *= w[i][idx[i]] * scale[i];
        }
        // Matsubara node and weight selected by the last index
        const ctype xt = m_n[idx[dim - 1]];
        const ctype wt = m_w[idx[dim - 1]];
        NT update{};
        device::apply(
            [&](const auto &...iargs) {
              device::apply(
                  [&](const auto &...posargs) {
                    update +=
                        weight *
                        (
                            // positive and negative Matsubara frequencies
                            wt * (KERNEL::kernel(posargs..., xt, iargs...) + KERNEL::kernel(posargs..., -xt, iargs...))
                            // The zero mode (once per matsubara sum)
                            + (idx[dim - 1] != 0 ? NT{} : m_T * KERNEL::kernel(posargs..., (ctype)0, iargs...)));
                  },
                  x);
            },
            args);
        return update;
      };
 
      // the constant part of the kernel is added once, outside the reduction
      dest = KERNEL::constant(t...) + TBBReduction<dim, NT, decltype(functor)>(grid_size, functor);
    }
 
    /**
     * @brief Evaluate the integral for every point of `coordinates` in parallel with TBB.
     *
     * For each linear index the position is obtained from the coordinates, prepended to the extra
     * arguments, and get() is called to fill dest[idx].
     *
     * @param dest Host array with room for coordinates.size() entries.
     * @param coordinates Provides size(), from_linear_index() and forward().
     * @param args Additional arguments passed to the kernel after the position.
     */
    template <typename Coordinates, typename... Args>
    void map(execution_space &, NT *dest, const Coordinates &coordinates, const Args &...args)
    {
      const auto m_args = device::tie(args...);

      // size_t rather than the non-standard `uint`: it does not narrow coordinates.size() and
      // matches the index type the Kokkos implementation passes to from_linear_index().
      tbb::parallel_for(tbb::blocked_range<size_t>(0, coordinates.size()), [&](const tbb::blocked_range<size_t> &r) {
        for (size_t idx = r.begin(); idx != r.end(); ++idx) {
          const auto dis_idx = coordinates.from_linear_index(idx);
          const auto pos = coordinates.forward(dis_idx);
          // make a tuple of all arguments
          const auto full_args = device::tuple_cat(pos, m_args);
          device::apply([&](const auto &...iargs) { get(dest[idx], iargs...); }, full_args);
        }
      });
    }
 
    /**
     * @brief Evaluate the integral for all coordinate points into host memory, honouring the MPI distribution.
     *
     * If the integrator's node distribution has a valid communicator and a non-zero total size,
     * only the slice of `coordinates` assigned to the calling rank is computed and written at the
     * matching offset of `dest`. Ranks that are not listed in the distribution do no work.
     * Otherwise the full coordinate range is computed.
     *
     * @param dest Host array receiving the results.
     * @param coordinates Coordinate system whose points are mapped.
     * @param args Additional arguments passed to the kernel after the position.
     * @return An execution_space object.
     */
    template <typename Coordinates, typename... Args>
    auto map(NT *dest, const Coordinates &coordinates, const Args &...args)
    {
      auto space = execution_space();

      // Take care of MPI distribution
      const auto &node_distribution = AbstractIntegrator::node_distribution;
      if (node_distribution.mpi_comm != MPI_COMM_NULL && node_distribution.total_size > 0) {
        auto mpi_comm = node_distribution.mpi_comm;
        const auto &nodes = node_distribution.nodes;
        const auto &sizes = node_distribution.sizes;

        // Check if the rank is contained in nodes
        const size_t m_rank = DiFfRG::MPI::rank(mpi_comm);
        // If not, return an empty execution space
        if (std::find(nodes.begin(), nodes.end(), m_rank) == nodes.end()) return execution_space();

        // Get the size of the current rank
        const size_t rank_size = sizes[m_rank];
        // Offset is the sum of all previous ranks. The initial value fixes the accumulator type,
        // so it has to be a size_t: a plain 0 would sum in int.
        const size_t offset = std::accumulate(sizes.begin(), sizes.begin() + m_rank, size_t{0});

        // Create a SubCoordinates object
        const auto sub_coordinates = SubCoordinates(coordinates, offset, rank_size);
        // Offset the destination pointer
        NT *dest_offset = dest + offset;

        map(space, dest_offset, sub_coordinates, args...);
      } else
        map(space, dest, coordinates, args...);
      return space;
    }
 
  protected:
    // Bring the base-class members into scope; as members of a dependent base they are not
    // found by unqualified lookup otherwise.

    // Grid geometry and the quadrature provider.
    using Base::grid_extents;
    using Base::grid_scale;
    using Base::grid_size;
    using Base::grid_start;
    using Base::quadrature_provider;
 
    // Quadrature nodes and weights (Matsubara and spatial).
    using Base::matsubara_nodes;
    using Base::matsubara_weights;
    using Base::nodes;
    using Base::weights;
 
    // Matsubara prefactor and the parameters it was requested with.
    using Base::matsubara_sum_T;
    using Base::T;
    using Base::typical_E;
  };
 
} // namespace DiFfRG