
/*** DO NOT EDIT THIS FILE DIRECTLY (use 'headers' to generate) ***/

#ifndef hypre_UTILITIES_HPP
#define hypre_UTILITIES_HPP

#include <HYPRE_config.h>

#ifdef HYPRE_MIXED_PRECISION
#include "_hypre_utilities_mup_def.h"
#endif

#ifdef __cplusplus
extern "C++" {
#endif

/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_FUNCTORS_H
#define HYPRE_FUNCTORS_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/*--------------------------------------------------------------------------
 * hypreFunctor_DenseMatrixIdentity
 *
 * Functor for generating a dense identity matrix.
 * This assumes that the input array "a" is zeros everywhere
 *--------------------------------------------------------------------------*/

struct hypreFunctor_DenseMatrixIdentity
{
   HYPRE_Int   n_;   /* row length used to compute the flat offset of a diagonal entry */
   HYPRE_Real *a_;   /* array that receives the diagonal ones */

   /* Store the dimension "n" and the target array "a" */
   hypreFunctor_DenseMatrixIdentity(HYPRE_Int n, HYPRE_Real *a) : n_(n), a_(a) {}

   /* Write 1.0 into the entry at flat offset (i * n_ + i); no other entry is touched */
   __host__ __device__ void operator()(HYPRE_Int i)
   {
      HYPRE_Real *diag_entry = a_ + (i * n_ + i);

      *diag_entry = 1.0;
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_ArrayStridedAccess
 *
 * Functor for performing strided data access on a templated array.
 *
 * The stride interval "s_" is used to access every "s_"-th element
 * from the source array "a_".
 *
 * It is templated to support various data types for the array.
 *--------------------------------------------------------------------------*/

template <typename T>
struct hypreFunctor_ArrayStridedAccess
{
   HYPRE_Int  s_;   /* stride: distance (in elements) between two accessed entries */
   T         *a_;   /* source array that is read, never written, by operator() */

   /* Store the stride "s" and the source array "a" */
   hypreFunctor_ArrayStridedAccess(HYPRE_Int s, T *a) : s_(s), a_(a) {}

   /* Return a copy of the element at offset (i * s_) of the source array.
    *
    * The call operator is const-qualified, like the other index functors in
    * this header, because it only reads the functor state. This lets the
    * functor be invoked through const objects/references as well. */
   __host__ __device__ T operator()(HYPRE_Int i) const
   {
      return a_[i * s_];
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_IndexStrided
 *
 * This functor multiplies a given index "i" by a specified stride "s_".
 *
 * It is templated to support various data types for the index and stride.
 *--------------------------------------------------------------------------*/

template <typename T>
struct hypreFunctor_IndexStrided
{
   T s_;   /* stride factor applied to every incoming index */

   /* Store the stride factor */
   hypreFunctor_IndexStrided(T stride) : s_(stride) {}

   /* Return the product of the index "i" and the stored stride */
   __host__ __device__ T operator()(const T i) const
   {
      const T strided_index = i * s_;

      return strided_index;
   }
};

/*--------------------------------------------------------------------------
 * hypreFunctor_IndexCycle
 *--------------------------------------------------------------------------*/

struct hypreFunctor_IndexCycle
{
   HYPRE_Int cycle_length;   /* divisor used to wrap incoming indices */

   /* Store the cycle length */
   hypreFunctor_IndexCycle(HYPRE_Int _cycle_length)
      : cycle_length(_cycle_length)
   {
   }

   /* Return the remainder of "i" divided by the cycle length (C++ "%" semantics) */
   __host__ __device__ HYPRE_Int operator()(HYPRE_Int i) const
   {
      const HYPRE_Int wrapped = i % cycle_length;

      return wrapped;
   }
};

/*--------------------------------------------------------------------------
 * Functor to check: |x| > tol
 *--------------------------------------------------------------------------*/

struct hypreFunctor_NonzeroAboveTol
{
   HYPRE_Real tol;   /* threshold compared against hypre_cabs(x) */

   /* Store the tolerance */
   hypreFunctor_NonzeroAboveTol(HYPRE_Real tol_)
      : tol(tol_)
   {
   }

   /* Return true when hypre_cabs(x) is strictly greater than the tolerance */
   __host__ __device__
   bool operator()(const HYPRE_Complex& x) const
   {
      return tol < hypre_cabs(x);
   }
};

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
#endif /* ifndef HYPRE_FUNCTORS_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_PREDICATES_H
#define HYPRE_PREDICATES_H

/******************************************************************************
 *
 * Header file defining predicates for thrust used throughout hypre
 *
 *****************************************************************************/

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/*--------------------------------------------------------------------------
 * hyprePred_StridedAccess
 *
 * This struct defines a predicate for strided access in array-like data.
 *
 * It is used to determine if an element at a given index should be processed
 * or not, based on a specified stride. The operator() returns true when the
 * index is a multiple of the stride, indicating the element at that index
 * is part of the strided subset.
 *--------------------------------------------------------------------------*/

struct hyprePred_StridedAccess
{
   HYPRE_Int  s_;   /* stride against which indices are tested */

   /* Store the stride */
   hyprePred_StridedAccess(HYPRE_Int s)
      : s_(s)
   {
   }

   /* Return 1 when "i" is a multiple of the stride and 0 otherwise */
   __host__ __device__ HYPRE_Int operator()(const HYPRE_Int i) const
   {
      const HYPRE_Int remainder = i % s_;

      return (remainder == 0);
   }
};

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */
#endif /* ifndef HYPRE_PREDICATES_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef DEVICE_ALLOCATOR_H
#define DEVICE_ALLOCATOR_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* Byte-oriented C++ allocator object that obtains and releases memory through
 * hypre_TAlloc / hypre_TFree with the HYPRE_MEMORY_DEVICE location */
struct hypre_device_allocator
{
   using value_type = char;

   /* Nothing to set up: the allocator holds no state */
   hypre_device_allocator() {}

   /* Nothing to tear down */
   ~hypre_device_allocator() {}

   /* Request "num_bytes" chars from hypre_TAlloc and hand back the result */
   char *allocate(std::ptrdiff_t num_bytes)
   {
      char *buffer = hypre_TAlloc(char, num_bytes, HYPRE_MEMORY_DEVICE);

      return buffer;
   }

   /* Release "ptr" via hypre_TFree; the size "n" is not needed and is ignored */
   void deallocate(char *ptr, size_t n)
   {
      HYPRE_UNUSED_VAR(n);
      hypre_TFree(ptr, HYPRE_MEMORY_DEVICE);
   }
};

#endif /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

#endif
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

#ifndef HYPRE_DEVICE_UTILS_H
#define HYPRE_DEVICE_UTILS_H

#if defined(HYPRE_USING_GPU)

/* Backend-dependent mask type and its matching "one" literal:
 * unsigned int for CUDA/SYCL, unsigned long long for HIP */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_SYCL)
using hypre_mask = hypre_uint;
#define hypre_mask_one      1U

#elif defined(HYPRE_USING_HIP)
using hypre_mask = hypre_ulonglongint;
#define hypre_mask_one      1ULL

#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          cuda includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA)
/* CUDA builds use an opaque pointer as the device-item type; the CUDA/HIP
 * launch macro in this header passes a NULL item as the first kernel argument,
 * whereas the SYCL build aliases this name to sycl::nd_item<3>. */
using hypre_DeviceItem = void*;
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_profiler_api.h>

#if defined(HYPRE_USING_CURAND)
#include <curand.h>
#endif

#if defined(HYPRE_USING_CUBLAS)
#include <cublas_v2.h>
#endif

#if defined(HYPRE_USING_CUSPARSE)
/* Note (VPM) : As of cuSPARSE 12, ILU functionalities have been marked as deprecated.
   The following definition avoids compilation warnings regarding the use of ILU routines */
#define DISABLE_CUSPARSE_DEPRECATED
#include <cusparse.h>
#endif

#if defined(HYPRE_USING_CUSOLVER)
#include <cusolverDn.h>
#endif

/* Fail early if the CUDA headers did not provide the runtime version macro */
#ifndef CUDART_VERSION
#error CUDART_VERSION Undefined!
#endif

/* Fail early if the CUDA headers did not provide the toolkit version macro */
#ifndef CUDA_VERSION
#error CUDA_VERSION Undefined!
#endif

/* For CUDA 11.0 and newer, define the Thrust macros whose names indicate that
 * C++ dialect deprecation diagnostics should be ignored. These must be defined
 * before the Thrust headers are included further below. */
#if CUDA_VERSION >= 11000
#define THRUST_IGNORE_DEPRECATED_CPP11
#define THRUST_IGNORE_DEPRECATED_CPP_DIALECT
#endif

/* Provide CUSPARSE_VERSION when the cuSPARSE header did not define it:
 * build it from the major/minor/patch macros if all three exist,
 * otherwise fall back to the CUDA toolkit version. */
#ifndef CUSPARSE_VERSION
#if defined(CUSPARSE_VER_MAJOR) && defined(CUSPARSE_VER_MINOR) && defined(CUSPARSE_VER_PATCH)
#define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + CUSPARSE_VER_MINOR *  100 + CUSPARSE_VER_PATCH)
#else
#define CUSPARSE_VERSION CUDA_VERSION
#endif
#endif

/* Version thresholds used for feature selection.
 * CUDA_MALLOCASYNC_VERSION and CUDA_THRUST_NOSYNC_VERSION are compared against
 * CUDA_VERSION below (CUDA 11.2 and CUDA 12, per the accompanying #error texts).
 * NOTE(review): CUSPARSE_NEWAPI_VERSION and CUSPARSE_NEWSPMM_VERSION are not used
 * in this part of the header; presumably they gate the generic cuSPARSE API and
 * the newer SpMM path elsewhere -- confirm against their users. */
#define CUSPARSE_NEWAPI_VERSION 11000
#define CUSPARSE_NEWSPMM_VERSION 11401
#define CUDA_MALLOCASYNC_VERSION 11020
#define CUDA_THRUST_NOSYNC_VERSION 12000

/* Sparse triangular solve (vector): map the hypre descriptor names to the
 * cusparseSpSV names when CUSPARSE_VERSION >= 11600, otherwise to the
 * csrsv2 info type and its create/destroy routines. */
#define CUSPARSE_SPSV_VERSION 11600
#if CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION
#define hypre_cusparseSpSVDescr         cusparseSpSVDescr_t
#define hypre_cusparseSpSV_createDescr  cusparseSpSV_createDescr
#define hypre_cusparseSpSV_destroyDescr cusparseSpSV_destroyDescr
#else
#define hypre_cusparseSpSVDescr         csrsv2Info_t
#define hypre_cusparseSpSV_createDescr  cusparseCreateCsrsv2Info
#define hypre_cusparseSpSV_destroyDescr cusparseDestroyCsrsv2Info
#endif

/* Sparse triangular solve (multiple right-hand sides): same selection scheme,
 * choosing between the cusparseSpSM names and the csrsm2 info names. */
#define CUSPARSE_SPSM_VERSION 11600
#if CUSPARSE_VERSION >= CUSPARSE_SPSM_VERSION
#define hypre_cusparseSpSMDescr         cusparseSpSMDescr_t
#define hypre_cusparseSpSM_createDescr  cusparseSpSM_createDescr
#define hypre_cusparseSpSM_destroyDescr cusparseSpSM_destroyDescr
#else
#define hypre_cusparseSpSMDescr         csrsm2Info_t
#define hypre_cusparseSpSM_createDescr  cusparseCreateCsrsm2Info
#define hypre_cusparseSpSM_destroyDescr cusparseDestroyCsrsm2Info
#endif

/* Reject configurations that request the async device allocator on a CUDA
 * toolkit older than CUDA_MALLOCASYNC_VERSION */
#if defined(HYPRE_USING_DEVICE_MALLOC_ASYNC)
#if CUDA_VERSION < CUDA_MALLOCASYNC_VERSION
#error cudaMalloc/FreeAsync needs CUDA 11.2
#endif
#endif

/* Select the Thrust execution policy used by hypre on CUDA:
 * thrust::cuda::par_nosync when HYPRE_USING_THRUST_NOSYNC is requested
 * (rejected on toolkits older than CUDA_THRUST_NOSYNC_VERSION),
 * thrust::cuda::par otherwise. */
#if defined(HYPRE_USING_THRUST_NOSYNC)
#if CUDA_VERSION < CUDA_THRUST_NOSYNC_VERSION
#error thrust::cuda::par_nosync needs CUDA 12
#endif
#define HYPRE_THRUST_EXECUTION thrust::cuda::par_nosync
#else
#define HYPRE_THRUST_EXECUTION thrust::cuda::par
#endif

/* Prefetch helpers wrapping cudaMemPrefetchAsync.
 *
 * Both macros take a pointer, a size and a stream, and only issue the prefetch
 * when hypre_HandleDeviceUVM(hypre_handle()) evaluates to true; otherwise they
 * expand to a no-op. Errors are reported through HYPRE_CUDA_CALL.
 *
 * Two variants exist because the call is spelled differently depending on
 * CUDART_VERSION: for 13000 and above the destination is described by a
 * cudaMemLocation and an additional 0 argument is passed (presumably the flags
 * parameter -- confirm against the CUDA runtime documentation); older runtimes
 * take the destination device id directly. */
#if CUDART_VERSION >= 13000
// Macro for device memory prefetching (CUDART 13.0+)
#define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \
   do { \
      if (hypre_HandleDeviceUVM(hypre_handle())) \
      { \
         cudaMemLocation loc = {cudaMemLocationTypeDevice, hypre_HandleDevice(hypre_handle())}; \
         HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \
      } \
   } while (0)

// Macro for host memory prefetching (CUDART 13.0+)
#define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \
   do { \
      if (hypre_HandleDeviceUVM(hypre_handle())) \
      { \
         cudaMemLocation loc = {cudaMemLocationTypeHost, cudaCpuDeviceId}; \
         HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, loc, 0, stream)); \
      } \
   } while (0)

#else
// Macro for device memory prefetching (< CUDART 13.0)
#define HYPRE_MEM_PREFETCH_DEVICE(ptr, size, stream) \
   do { \
      if (hypre_HandleDeviceUVM(hypre_handle())) \
      { \
         HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, hypre_HandleDevice(hypre_handle()), stream)); \
      } \
   } while (0)

// Macro for host memory prefetching (< CUDART 13.0)
#define HYPRE_MEM_PREFETCH_HOST(ptr, size, stream) \
   do { \
      if (hypre_HandleDeviceUVM(hypre_handle())) \
      { \
         HYPRE_CUDA_CALL(cudaMemPrefetchAsync(ptr, size, cudaCpuDeviceId, stream)); \
      } \
   } while (0)
#endif

#endif /* defined(HYPRE_USING_CUDA) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          hip includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_HIP)

/* HIP builds use an opaque pointer as the device-item type; the CUDA/HIP
 * launch macro in this header passes a NULL item as the first kernel argument,
 * whereas the SYCL build aliases this name to sycl::nd_item<3>. */
using hypre_DeviceItem = void*;
#include <hip/hip_runtime.h>

#if defined(HYPRE_USING_ROCBLAS)
#include <rocblas/rocblas.h>
#endif

#if defined(HYPRE_USING_ROCSPARSE)
#include <rocsparse/rocsparse.h>
/* Provide a single comparable ROCSPARSE_VERSION number when the rocSPARSE
 * header did not define one: major * 100000 + minor * 100 + patch */
#if !defined(ROCSPARSE_VERSION)
#define ROCSPARSE_VERSION (ROCSPARSE_VERSION_MAJOR * 100000 + ROCSPARSE_VERSION_MINOR * 100 + ROCSPARSE_VERSION_PATCH)
#endif
#endif

#if defined(HYPRE_USING_ROCSOLVER)
#include <rocsolver/rocsolver.h>
#endif

#if defined(HYPRE_USING_ROCRAND)
#include <rocrand/rocrand.h>
#endif

#endif /* defined(HYPRE_USING_HIP) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          thrust includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

#include <thrust/execution_policy.h>
#if defined(HYPRE_USING_CUDA)
#include <thrust/system/cuda/execution_policy.h>
#elif defined(HYPRE_USING_HIP)
#include <thrust/system/hip/execution_policy.h>
#endif
#include <thrust/count.h>
#include <thrust/device_ptr.h>
#include <thrust/unique.h>
#include <thrust/sort.h>
#include <thrust/binary_search.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/transform.h>
#include <thrust/functional.h>
#include <thrust/gather.h>
#include <thrust/scan.h>
#include <thrust/fill.h>
#include <thrust/adjacent_difference.h>
#include <thrust/inner_product.h>
#include <thrust/logical.h>
#include <thrust/replace.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <thrust/remove.h>
#include <thrust/version.h>

/* VPM: this is needed to support cuda 10. not_fn is the correct replacement going forward.
 *
 * HYPRE_THRUST_NOT(pred) negates a predicate: it expands to thrust::not1(pred)
 * when THRUST_VERSION is defined and below THRUST_VERSION_NOTFN, and to
 * thrust::not_fn(pred) in every other case. */
#define THRUST_VERSION_NOTFN 200600
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
#define HYPRE_THRUST_NOT(pred) thrust::not1(pred)
#else
#define HYPRE_THRUST_NOT(pred) thrust::not_fn(pred)
#endif

/* Resolve deprecated warnings about thrust::identity.
 *
 * HYPRE_THRUST_IDENTITY(type) expands to an identity function object:
 *   - thrust::identity<type>() for Thrust versions below 200802;
 *   - cuda::std::identity() for newer Thrust versions on CUDA builds;
 *   - thrust::identity<type>() as a fallback in every remaining case
 *     (e.g. HIP builds whose Thrust reports a version of 200802 or above),
 *     so that the macro is never left undefined.
 * NOTE(review): the fallback assumes thrust::identity is still available in
 * those builds -- confirm against the rocThrust versions that are supported. */
#if (defined(THRUST_VERSION) && THRUST_VERSION < 200802)
#define HYPRE_THRUST_IDENTITY(type) thrust::identity<type>()
#elif defined(HYPRE_USING_CUDA)
#define HYPRE_THRUST_IDENTITY(type) cuda::std::identity()
#else
#define HYPRE_THRUST_IDENTITY(type) thrust::identity<type>()
#endif

/* Makes the names in thrust::placeholders visible unqualified to every file
 * that includes this header.
 * NOTE(review): this is a namespace-scope using-directive in a header;
 * presumably kept so existing code can write placeholder expressions without
 * qualification -- removing it would break such users. */
using namespace thrust::placeholders;
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *                          sycl includes
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_SYCL)

#include <sycl/sycl.hpp>
#include <dpct/dpct.hpp>
#if defined(HYPRE_USING_ONEMKLSPARSE)
#include <oneapi/mkl/spblas.hpp>
#endif
#if defined(HYPRE_USING_ONEMKLBLAS)
#include <oneapi/mkl/blas.hpp>
#include <oneapi/mkl/lapack.hpp>
#endif
#if defined(HYPRE_USING_ONEMKLRAND)
#include <oneapi/mkl/rng.hpp>
#endif

/* The following definitions facilitate code reuse and limit
 * if/def-ing when unifying cuda/hip code with sycl code:
 *   - dim3 becomes a 3-dimensional sycl::range;
 *   - hypre_DeviceItem becomes the 3-dimensional nd_item received by kernels;
 *   - the CUDA function qualifiers expand to nothing;
 *   - __forceinline__ maps to an always_inline attribute. */
using dim3 = sycl::range<3>;
using hypre_DeviceItem = sycl::nd_item<3>;
#define __global__
#define __host__
#define __device__
#define __forceinline__ __inline__ __attribute__((always_inline))

#endif /* defined(HYPRE_USING_SYCL) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      device defined values
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* Warp-size dependent constants.
 * HYPRE_WARP_BITSHIFT is log2(HYPRE_WARP_SIZE): 6 for 64, 5 for 32.
 * HYPRE_WARP_FULL_MASK is an all-ones literal: 64 bits wide for warp size 64,
 * and for warp size 32 either 32 or 64 bits wide depending on the HIP check
 * below. Any other warp size is rejected at compile time. */
#if HYPRE_WARP_SIZE == 64
#define HYPRE_WARP_BITSHIFT      6
#define HYPRE_WARP_FULL_MASK     0xFFFFFFFFFFFFFFFF

#elif HYPRE_WARP_SIZE == 32
#define HYPRE_WARP_BITSHIFT      5

/* HIP 6.5 onwards requires masks to be 64-bit integers even when compiling for warp size 32 */
/* NOTE(review): the test below uses a strict ">" against 60500000 -- confirm
 * whether a HIP_VERSION of exactly 60500000 should also take the 64-bit branch */
#if defined(HYPRE_USING_HIP) && HIP_VERSION > 60500000 && !defined(HIP_DISABLE_WARP_SYNC_BUILTINS)
#define HYPRE_WARP_FULL_MASK     0xFFFFFFFFFFFFFFFF
#else
#define HYPRE_WARP_FULL_MASK     0xFFFFFFFF
#endif

#else
#error "Unsupported value for HYPRE_WARP_SIZE"
#endif

/* Fixed device-related limits and defaults.
 * NOTE(review): the meanings below are inferred from the names only; their
 * users are outside this part of the header -- confirm before relying on them.
 *   HYPRE_MAX_NTHREADS_BLOCK : presumably the maximum threads per block
 *   HYPRE_MAX_NUM_WARPS      : presumably an upper bound on the number of warps
 *   HYPRE_FLT_LARGE          : a large floating-point sentinel value
 *   HYPRE_1D_BLOCK_SIZE      : presumably the default block size for 1D launches
 *   HYPRE_MAX_NUM_STREAMS    : number of streams; the SYCL HYPRE_CPU_LAUNCH macro
 *                              selects stream index HYPRE_MAX_NUM_STREAMS - 1
 *   HYPRE_SPGEMM_MAX_NBIN    : presumably the maximum number of SpGEMM bins */
#define HYPRE_MAX_NTHREADS_BLOCK 1024
#define HYPRE_MAX_NUM_WARPS      (64 * 64 * 32)
#define HYPRE_FLT_LARGE          1e30
#define HYPRE_1D_BLOCK_SIZE      512
#define HYPRE_MAX_NUM_STREAMS    10
#define HYPRE_SPGEMM_MAX_NBIN    10

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *       macro for launching GPU kernels
 *       NOTE: IN HYPRE'S DEFAULT STREAM
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* GPU_LAUNCH_SYNC is placed right after a kernel launch by the CUDA/HIP launch
 * macro. In HYPRE_DEBUG builds it synchronizes the compute stream and then
 * calls hypre_GetDeviceLastError(); in all other builds it expands to nothing. */
#if defined(HYPRE_DEBUG)
#define GPU_LAUNCH_SYNC { hypre_SyncComputeStream(); hypre_GetDeviceLastError(); }
#else
#define GPU_LAUNCH_SYNC
#endif

/* cuda/hip version */
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
/* HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)
 *
 * Launches "kernel_name" on hypre's compute stream with the given grid size,
 * block size and dynamic shared-memory size. The kernel receives a NULL
 * hypre_DeviceItem as its first argument, followed by the variadic arguments.
 * The launch is silently skipped when any grid or block dimension is zero.
 * GPU_LAUNCH_SYNC follows the launch (a no-op unless HYPRE_DEBUG is defined).
 *
 * The macro arguments are parenthesized at each use so that expressions
 * (e.g. a conditional expression) can be passed safely. Note that "gridsize"
 * and "blocksize" are evaluated more than once. */
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)                                                        \
{                                                                                                                                   \
   if ( (gridsize).x  == 0 || (gridsize).y  == 0 || (gridsize).z  == 0 ||                                                           \
        (blocksize).x == 0 || (blocksize).y == 0 || (blocksize).z == 0 )                                                            \
   {                                                                                                                                \
      /* printf("Warning %s %d: Zero CUDA grid/block (%d %d %d) (%d %d %d)\n",                                                      \
                 __FILE__, __LINE__,                                                                                                \
                 gridsize.x, gridsize.y, gridsize.z, blocksize.x, blocksize.y, blocksize.z); */                                     \
   }                                                                                                                                \
   else                                                                                                                             \
   {                                                                                                                                \
      hypre_DeviceItem item = NULL;                                                                                                 \
      (kernel_name) <<< (gridsize), (blocksize), (shmem_size), hypre_HandleComputeStream(hypre_handle()) >>> (item, __VA_ARGS__);   \
      GPU_LAUNCH_SYNC;                                                                                                              \
   }                                                                                                                                \
}

/* Same as HYPRE_GPU_LAUNCH2 with no dynamic shared memory */
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...) HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, 0, __VA_ARGS__)
#endif /* defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

/* sycl version */
#if defined(HYPRE_USING_SYCL)

/* HYPRE_GPU_LAUNCH (SYCL): submits "kernel_name" to hypre's compute stream as a
 * parallel_for over nd_range<3>(gridsize*blocksize, blocksize), with the
 * required sub-group size set to HYPRE_WARP_SIZE. The kernel is called with the
 * nd_item followed by the variadic arguments. The submission is waited on
 * (wait_and_throw), so the macro returns only after the kernel has finished.
 * Nothing is submitted when gridsize[2] or blocksize[2] is zero; only that
 * dimension is checked. */
#define HYPRE_GPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                              \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, __VA_ARGS__);                                           \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

/* HYPRE_CPU_LAUNCH (SYCL): same submission as HYPRE_GPU_LAUNCH, but the compute
 * stream number stored in the hypre handle is temporarily switched to
 * HYPRE_MAX_NUM_STREAMS - 1 for the duration of the launch and restored
 * afterwards.
 * NOTE(review): presumably the last stream is bound to a CPU queue -- confirm
 * against the stream setup code. The previous stream number is not restored if
 * wait_and_throw() throws.
 *
 * Fixes relative to the previous definition, which could not compile when
 * expanded: the misspelled type "HPYRE_Int" is now HYPRE_Int, and
 * hypre_handle is now called (hypre_handle()) on the two lines that passed the
 * function name itself. */
#define HYPRE_CPU_LAUNCH(kernel_name, gridsize, blocksize, ...)                              \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      HYPRE_Int compute_stream = hypre_HandleComputeStreamNum(hypre_handle());               \
      hypre_HandleComputeStreamNum(hypre_handle()) = HYPRE_MAX_NUM_STREAMS - 1;              \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, __VA_ARGS__);                                           \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
      hypre_HandleComputeStreamNum(hypre_handle()) = compute_stream;                         \
   }                                                                                         \
}

/* HYPRE_GPU_LAUNCH2 (SYCL): like HYPRE_GPU_LAUNCH, but additionally creates a
 * local accessor of "shmem_size" chars inside the command group and passes its
 * raw pointer to the kernel as the second argument (after the nd_item and
 * before the variadic arguments). The submission is waited on before the macro
 * returns. Nothing is submitted when gridsize[2] or blocksize[2] is zero. */
#define HYPRE_GPU_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)                 \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         sycl::range<1> shmem_range(shmem_size);                                             \
         sycl::local_accessor<char, 1> shmem_accessor(shmem_range, cgh);                     \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item,                                                         \
                  shmem_accessor.get_multi_ptr<sycl::access::decorated::yes>().get(),        \
                  __VA_ARGS__); });                                                          \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

/* HYPRE_GPU_DEBUG_LAUNCH (SYCL): like HYPRE_GPU_LAUNCH, but creates a
 * sycl::stream (constructed with 4096 and 1024 as its two size arguments) in
 * the command group and passes it to the kernel as the second argument, after
 * the nd_item and before the variadic arguments. The submission is waited on
 * before the macro returns. Nothing is submitted when gridsize[2] or
 * blocksize[2] is zero. */
#define HYPRE_GPU_DEBUG_LAUNCH(kernel_name, gridsize, blocksize, ...)                        \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         auto debug_stream = sycl::stream(4096, 1024, cgh);                                  \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item, debug_stream, __VA_ARGS__);                             \
         });                                                                                 \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}

/* HYPRE_GPU_DEBUG_LAUNCH2 (SYCL): debug counterpart of HYPRE_GPU_LAUNCH2.
 * Creates a sycl::stream and a local accessor of "shmem_size" chars in the
 * command group, then calls the kernel with the nd_item, the raw pointer to
 * the local memory and the variadic arguments. The submission is waited on
 * before the macro returns. Nothing is submitted when gridsize[2] or
 * blocksize[2] is zero.
 *
 * The local memory is now declared with sycl::local_accessor, matching
 * HYPRE_GPU_LAUNCH2, instead of the deprecated
 * sycl::accessor<..., sycl::target::local> form; get_multi_ptr is part of the
 * local_accessor interface.
 *
 * NOTE(review): "debug_stream" is created but not forwarded to the kernel,
 * unlike HYPRE_GPU_DEBUG_LAUNCH. The kernel argument list is left unchanged
 * here because the expected kernel signature is not visible -- confirm whether
 * the stream should be passed. */
#define HYPRE_GPU_DEBUG_LAUNCH2(kernel_name, gridsize, blocksize, shmem_size, ...)           \
{                                                                                            \
   if ( gridsize[2] == 0 || blocksize[2] == 0 )                                              \
   {                                                                                         \
     /* hypre_printf("Warning %s %d: Zero SYCL 1D launch parameters grid/block (%d) (%d)\n", \
                  __FILE__, __LINE__,                                                        \
                  gridsize[0], blocksize[0]); */                                             \
   }                                                                                         \
   else                                                                                      \
   {                                                                                         \
      hypre_HandleComputeStream(hypre_handle())->submit([&] (sycl::handler& cgh) {           \
         auto debug_stream = sycl::stream(4096, 1024, cgh);                                  \
         sycl::range<1> shmem_range(shmem_size);                                             \
         sycl::local_accessor<char, 1> shmem_accessor(shmem_range, cgh);                     \
         cgh.parallel_for(sycl::nd_range<3>(gridsize*blocksize, blocksize),                  \
            [=] (hypre_DeviceItem item) [[intel::reqd_sub_group_size(HYPRE_WARP_SIZE)]]      \
               { (kernel_name)(item,                                                         \
                  shmem_accessor.get_multi_ptr<sycl::access::decorated::yes>().get(),        \
                  __VA_ARGS__); });                                                          \
      }).wait_and_throw();                                                                   \
   }                                                                                         \
}
#endif /* defined(HYPRE_USING_SYCL) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping cuda/hip/sycl calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA)
/* HYPRE_CUDA_CALL(call): evaluates a CUDA runtime call exactly once and, when
 * it does not return cudaSuccess, prints the numeric error code, the error
 * string and the source location, then triggers hypre_assert(0).
 *
 * The argument is parenthesized and the status is kept in a local with a
 * macro-specific name, so that a caller expression mentioning its own
 * variable called "err" is not captured by the macro's local. */
#define HYPRE_CUDA_CALL(call) do {                                                           \
   cudaError_t hypre_cuda_call_err_ = (call);                                                \
   if (cudaSuccess != hypre_cuda_call_err_) {                                                \
      printf("CUDA ERROR (code = %d, %s) at %s:%d\n", (int) hypre_cuda_call_err_,            \
                   cudaGetErrorString(hypre_cuda_call_err_), __FILE__, __LINE__);            \
      hypre_assert(0);                                                                       \
   } } while(0)

#elif defined(HYPRE_USING_HIP)
/* HYPRE_HIP_CALL(call): evaluates a HIP runtime call exactly once and, when it
 * does not return hipSuccess, prints the numeric error code, the error string
 * and the source location, then triggers hypre_assert(0).
 *
 * The argument is parenthesized and the status is kept in a local with a
 * macro-specific name, so that a caller expression mentioning its own
 * variable called "err" is not captured by the macro's local. */
#define HYPRE_HIP_CALL(call) do {                                                            \
   hipError_t hypre_hip_call_err_ = (call);                                                  \
   if (hipSuccess != hypre_hip_call_err_) {                                                  \
      printf("HIP ERROR (code = %d, %s) at %s:%d\n", (int) hypre_hip_call_err_,              \
                   hipGetErrorString(hypre_hip_call_err_), __FILE__, __LINE__);              \
      hypre_assert(0);                                                                       \
   } } while(0)

#elif defined(HYPRE_USING_SYCL)
/* HYPRE_SYCL_CALL(call): executes "call" inside a try block. A sycl::exception
 * or std::runtime_error thrown by it is reported with its what() text and the
 * source location via hypre_printf, followed by assert(0).
 * NOTE(review): unlike the CUDA/HIP wrappers this expands to a bare try/catch
 * statement (not a do { } while(0) block) and uses assert rather than
 * hypre_assert -- confirm whether that difference is intended. */
#define HYPRE_SYCL_CALL(call)                                                                \
   try                                                                                       \
   {                                                                                         \
      call;                                                                                  \
   }                                                                                         \
   catch (sycl::exception const &ex)                                                         \
   {                                                                                         \
      hypre_printf("SYCL ERROR (code = %s) at %s:%d\n", ex.what(),                           \
                     __FILE__, __LINE__);                                                    \
      assert(0);                                                                             \
   }                                                                                         \
   catch(std::runtime_error const& ex)                                                       \
   {                                                                                         \
      hypre_printf("STD ERROR (code = %s) at %s:%d\n", ex.what(),                            \
                   __FILE__, __LINE__);                                                      \
      assert(0);                                                                             \
   }
#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping vendor library calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_COMPLEX) /* Double Complex */
#error "GPU build does not support complex numbers!"

#elif defined(HYPRE_SINGLE) /* Single */
/* Single-precision aliases: every hypre_<library>_<routine> name below expands
 * to the vendor routine carrying the 'S'/'s' type letter. The CUDA branch maps
 * to cuBLAS/cuSPARSE/cuSOLVER names, the HIP branch to rocSPARSE/rocSOLVER. */
#if defined(HYPRE_USING_CUDA)
/* cuBLAS */
#define hypre_cublas_scal                      cublasSscal
#define hypre_cublas_axpy                      cublasSaxpy
#define hypre_cublas_dot                       cublasSdot
#define hypre_cublas_gemv                      cublasSgemv
#define hypre_cublas_getrfBatched              cublasSgetrfBatched
#define hypre_cublas_getriBatched              cublasSgetriBatched

/* cuSPARSE */
#define hypre_cusparse_csru2csr_bufferSizeExt  cusparseScsru2csr_bufferSizeExt
#define hypre_cusparse_csru2csr                cusparseScsru2csr
#define hypre_cusparse_csrsv2_bufferSize       cusparseScsrsv2_bufferSize
#define hypre_cusparse_csrsv2_analysis         cusparseScsrsv2_analysis
#define hypre_cusparse_csrsv2_solve            cusparseScsrsv2_solve
#define hypre_cusparse_csrmv                   cusparseScsrmv
#define hypre_cusparse_csrgemm                 cusparseScsrgemm
#define hypre_cusparse_csr2csc                 cusparseScsr2csc
#define hypre_cusparse_csrilu02_bufferSize     cusparseScsrilu02_bufferSize
#define hypre_cusparse_csrilu02_analysis       cusparseScsrilu02_analysis
#define hypre_cusparse_csrilu02                cusparseScsrilu02
#define hypre_cusparse_csrsm2_bufferSizeExt    cusparseScsrsm2_bufferSizeExt
#define hypre_cusparse_csrsm2_analysis         cusparseScsrsm2_analysis
#define hypre_cusparse_csrsm2_solve            cusparseScsrsm2_solve

/* cuSOLVER */
#define hypre_cusolver_dngetrf                 cusolverDnSgetrf
#define hypre_cusolver_dngetrf_bs              cusolverDnSgetrf_bufferSize
#define hypre_cusolver_dngetrs                 cusolverDnSgetrs
#define hypre_cusolver_dnpotrf_batched         cusolverDnSpotrfBatched
#define hypre_cusolver_dnpotrs_batched         cusolverDnSpotrsBatched

#elif defined(HYPRE_USING_HIP)
/* rocSPARSE */
#define hypre_rocsparse_csrsv_buffer_size      rocsparse_scsrsv_buffer_size
#define hypre_rocsparse_csrsv_analysis         rocsparse_scsrsv_analysis
#define hypre_rocsparse_csrsv_solve            rocsparse_scsrsv_solve
#define hypre_rocsparse_gthr                   rocsparse_sgthr
#define hypre_rocsparse_csrmv_analysis         rocsparse_scsrmv_analysis
#define hypre_rocsparse_csrmv                  rocsparse_scsrmv
#define hypre_rocsparse_csrgemm_buffer_size    rocsparse_scsrgemm_buffer_size
#define hypre_rocsparse_csrgemm                rocsparse_scsrgemm
#define hypre_rocsparse_csr2csc                rocsparse_scsr2csc
#define hypre_rocsparse_csrilu0_buffer_size    rocsparse_scsrilu0_buffer_size
#define hypre_rocsparse_csrilu0_analysis       rocsparse_scsrilu0_analysis
#define hypre_rocsparse_csrilu0                rocsparse_scsrilu0
#define hypre_rocsparse_csritilu0_history      rocsparse_scsritilu0_history
/* The "_ex" compute entry point is selected when ROCSPARSE_VERSION >= 300400 */
#if (ROCSPARSE_VERSION >= 300400)
#define hypre_rocsparse_csritilu0_compute      rocsparse_scsritilu0_compute_ex
#else
#define hypre_rocsparse_csritilu0_compute      rocsparse_scsritilu0_compute
#endif


/* rocSOLVER */
#define hypre_rocsolver_getrf_batched          rocsolver_sgetrf_batched
#define hypre_rocsolver_getri_batched          rocsolver_sgetri_batched
#define hypre_rocsolver_potrf_batched          rocsolver_spotrf_batched
#define hypre_rocsolver_potrs_batched          rocsolver_spotrs_batched

#endif /* if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP) */

#elif defined(HYPRE_LONG_DOUBLE) /* Long Double */
#error "GPU build does not support Long Double numbers!"

#else /* Double */
/* Double-precision aliases: every hypre_<library>_<routine> name below expands
 * to the vendor routine carrying the 'D'/'d' type letter. The set of aliases
 * mirrors the single-precision table so callers can stay precision-agnostic. */
#if defined(HYPRE_USING_CUDA)
/* cuBLAS */
#define hypre_cublas_scal                      cublasDscal
#define hypre_cublas_axpy                      cublasDaxpy
#define hypre_cublas_dot                       cublasDdot
#define hypre_cublas_gemv                      cublasDgemv
#define hypre_cublas_getrfBatched              cublasDgetrfBatched
#define hypre_cublas_getriBatched              cublasDgetriBatched

/* cuSPARSE */
#define hypre_cusparse_csru2csr_bufferSizeExt  cusparseDcsru2csr_bufferSizeExt
#define hypre_cusparse_csru2csr                cusparseDcsru2csr
#define hypre_cusparse_csrsv2_bufferSize       cusparseDcsrsv2_bufferSize
#define hypre_cusparse_csrsv2_analysis         cusparseDcsrsv2_analysis
#define hypre_cusparse_csrsv2_solve            cusparseDcsrsv2_solve
#define hypre_cusparse_csrmv                   cusparseDcsrmv
#define hypre_cusparse_csrgemm                 cusparseDcsrgemm
#define hypre_cusparse_csr2csc                 cusparseDcsr2csc
#define hypre_cusparse_csrilu02_bufferSize     cusparseDcsrilu02_bufferSize
#define hypre_cusparse_csrilu02_analysis       cusparseDcsrilu02_analysis
#define hypre_cusparse_csrilu02                cusparseDcsrilu02
#define hypre_cusparse_csrsm2_bufferSizeExt    cusparseDcsrsm2_bufferSizeExt
#define hypre_cusparse_csrsm2_analysis         cusparseDcsrsm2_analysis
#define hypre_cusparse_csrsm2_solve            cusparseDcsrsm2_solve

/* cuSOLVER */
#define hypre_cusolver_dngetrf                 cusolverDnDgetrf
#define hypre_cusolver_dngetrf_bs              cusolverDnDgetrf_bufferSize
#define hypre_cusolver_dngetrs                 cusolverDnDgetrs
#define hypre_cusolver_dnpotrf_batched         cusolverDnDpotrfBatched
#define hypre_cusolver_dnpotrs_batched         cusolverDnDpotrsBatched

#elif defined(HYPRE_USING_HIP)
/* rocSPARSE */
#define hypre_rocsparse_csrsv_buffer_size      rocsparse_dcsrsv_buffer_size
#define hypre_rocsparse_csrsv_analysis         rocsparse_dcsrsv_analysis
#define hypre_rocsparse_csrsv_solve            rocsparse_dcsrsv_solve
#define hypre_rocsparse_gthr                   rocsparse_dgthr
#define hypre_rocsparse_csrmv_analysis         rocsparse_dcsrmv_analysis
#define hypre_rocsparse_csrmv                  rocsparse_dcsrmv
#define hypre_rocsparse_csrgemm_buffer_size    rocsparse_dcsrgemm_buffer_size
#define hypre_rocsparse_csrgemm                rocsparse_dcsrgemm
#define hypre_rocsparse_csr2csc                rocsparse_dcsr2csc
#define hypre_rocsparse_csrilu0_buffer_size    rocsparse_dcsrilu0_buffer_size
#define hypre_rocsparse_csrilu0_analysis       rocsparse_dcsrilu0_analysis
#define hypre_rocsparse_csrilu0                rocsparse_dcsrilu0
#define hypre_rocsparse_csritilu0_history      rocsparse_dcsritilu0_history
/* The "_ex" compute entry point is selected when ROCSPARSE_VERSION >= 300400 */
#if (ROCSPARSE_VERSION >= 300400)
#define hypre_rocsparse_csritilu0_compute      rocsparse_dcsritilu0_compute_ex
#else
#define hypre_rocsparse_csritilu0_compute      rocsparse_dcsritilu0_compute
#endif


/* rocSOLVER */
#define hypre_rocsolver_getrf_batched          rocsolver_dgetrf_batched
#define hypre_rocsolver_getri_batched          rocsolver_dgetri_batched
#define hypre_rocsolver_potrf_batched          rocsolver_dpotrf_batched
#define hypre_rocsolver_potrs_batched          rocsolver_dpotrs_batched

#endif /* #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)*/
#endif /* #if defined(HYPRE_COMPLEX) || defined(HYPRE_SINGLE) || defined(HYPRE_LONG_DOUBLE) */

/* HYPRE_CUBLAS_CALL(call): evaluates "call" once and stores its cublasStatus_t.
 * On any status other than CUBLAS_STATUS_SUCCESS it prints the numeric code,
 * a 0/1 flag telling whether the code equals CUBLAS_STATUS_EXECUTION_FAILED,
 * and the file/line of the expansion, then runs hypre_assert(0) and exit(1).
 * The do { } while(0) wrapper makes the expansion a single statement. */
#define HYPRE_CUBLAS_CALL(call) do {                                                         \
   cublasStatus_t err = call;                                                                \
   if (CUBLAS_STATUS_SUCCESS != err) {                                                       \
      printf("CUBLAS ERROR (code = %d, %d) at %s:%d\n",                                      \
            err, err == CUBLAS_STATUS_EXECUTION_FAILED, __FILE__, __LINE__);                 \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

/* HYPRE_ROCBLAS_CALL(call): evaluates "call" once and stores its rocblas_status.
 * On any status other than rocblas_status_success it prints the numeric code,
 * the text from rocblas_status_to_string(), and the file/line of the expansion,
 * then runs hypre_assert(0) and exit(1). */
#define HYPRE_ROCBLAS_CALL(call) do {                                                        \
   rocblas_status err = call;                                                                \
   if (rocblas_status_success != err) {                                                      \
      printf("rocBLAS ERROR (code = %d, %s) at %s:%d\n",                                     \
             err, rocblas_status_to_string(err), __FILE__, __LINE__);                        \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

/* HYPRE_CUSPARSE_CALL(call): evaluates "call" once and stores its
 * cusparseStatus_t. On any status other than CUSPARSE_STATUS_SUCCESS it prints
 * a diagnostic with the file/line of the expansion, then runs hypre_assert(0)
 * and exit(1).
 *
 * Two variants exist: when CUSPARSE_VERSION >= 10300 the message also contains
 * the text from cusparseGetErrorString(); otherwise only the numeric code is
 * printed. */
#if CUSPARSE_VERSION >= 10300
#define HYPRE_CUSPARSE_CALL(call) do {                                                       \
   cusparseStatus_t err = call;                                                              \
   if (CUSPARSE_STATUS_SUCCESS != err) {                                                     \
      printf("CUSPARSE ERROR (code = %d, %s) at %s:%d\n",                                    \
            err, cusparseGetErrorString(err), __FILE__, __LINE__);                           \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)
#else
#define HYPRE_CUSPARSE_CALL(call) do {                                                       \
   cusparseStatus_t err = call;                                                              \
   if (CUSPARSE_STATUS_SUCCESS != err) {                                                     \
      printf("CUSPARSE ERROR (code = %d) at %s:%d\n",                                        \
            err, __FILE__, __LINE__);                                                        \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)
#endif

/* HYPRE_ROCSPARSE_CALL(call): evaluates "call" once and stores its
 * rocsparse_status. On any status other than rocsparse_status_success it prints
 * the numeric code and the file/line of the expansion, then runs assert(0) and
 * exit(1).
 * NOTE(review): this macro uses the plain assert() whereas the cuBLAS, rocBLAS,
 * cuSPARSE and RNG wrappers use hypre_assert() -- confirm whether the
 * difference is intentional. */
#define HYPRE_ROCSPARSE_CALL(call) do {                                                      \
   rocsparse_status err = call;                                                              \
   if (rocsparse_status_success != err) {                                                    \
      printf("rocSPARSE ERROR (code = %d) at %s:%d\n",                                       \
            err, __FILE__, __LINE__);                                                        \
      assert(0); exit(1);                                                                    \
   } } while(0)

/* HYPRE_CUSOLVER_CALL(call): evaluates "call" once and stores its
 * cusolverStatus_t. On any status other than CUSOLVER_STATUS_SUCCESS it prints
 * the numeric code and the file/line of the expansion, then runs
 * hypre_assert(0).
 * NOTE(review): unlike the other vendor wrappers there is no exit(1) here, so
 * execution continues past a failed call whenever hypre_assert() does not
 * terminate the program -- confirm that this is intended. */
#define HYPRE_CUSOLVER_CALL(call) do {                                                       \
   cusolverStatus_t err = call;                                                              \
   if (CUSOLVER_STATUS_SUCCESS != err) {                                                     \
      printf("cuSOLVER ERROR (code = %d) at %s:%d\n",                                        \
            err, __FILE__, __LINE__);                                                        \
      hypre_assert(0);                                                                       \
   } } while(0)

/* HYPRE_ROCSOLVER_CALL(call): evaluates "call" once and stores its
 * rocblas_status (the status type this wrapper expects from rocSOLVER calls).
 * On any status other than rocblas_status_success it prints the numeric code,
 * the text from rocblas_status_to_string(), and the file/line of the expansion.
 * NOTE(review): the failure is only reported; there is neither an assert nor
 * an exit(1), so execution always continues -- confirm that this is intended. */
#define HYPRE_ROCSOLVER_CALL(call) do {                                                      \
   rocblas_status err = call;                                                                \
   if (rocblas_status_success != err) {                                                      \
      printf("rocSOLVER ERROR (code = %d, %s) at %s:%d\n",                                   \
             err, rocblas_status_to_string(err), __FILE__, __LINE__);                        \
   } } while(0)

/* HYPRE_CURAND_CALL(call): evaluates "call" once and stores its curandStatus_t.
 * On any status other than CURAND_STATUS_SUCCESS it prints the numeric code and
 * the file/line of the expansion, then runs hypre_assert(0) and exit(1). */
#define HYPRE_CURAND_CALL(call) do {                                                         \
   curandStatus_t err = call;                                                                \
   if (CURAND_STATUS_SUCCESS != err) {                                                       \
      printf("CURAND ERROR (code = %d) at %s:%d\n", err, __FILE__, __LINE__);                \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

/* HYPRE_ROCRAND_CALL(call): evaluates "call" once and stores its rocrand_status.
 * On any status other than ROCRAND_STATUS_SUCCESS it prints the numeric code
 * and the file/line of the expansion, then runs hypre_assert(0) and exit(1). */
#define HYPRE_ROCRAND_CALL(call) do {                                                        \
   rocrand_status err = call;                                                                \
   if (ROCRAND_STATUS_SUCCESS != err) {                                                      \
      printf("ROCRAND ERROR (code = %d) at %s:%d\n", err, __FILE__, __LINE__);               \
      hypre_assert(0); exit(1);                                                              \
   } } while(0)

/* HYPRE_ONEMKL_CALL(call): runs "call" inside a try block. If it throws
 * oneapi::mkl::exception, the exception text (ex.what()) and the file/line of
 * the expansion are printed with hypre_printf, followed by assert(0) and
 * exit(1). Other exception types are not caught here.
 * NOTE(review): the expansion is a bare try/catch rather than a
 * do { } while(0) statement, so it cannot be used as the unbraced body of an
 * if/else. */
#define HYPRE_ONEMKL_CALL(call)                                                              \
   try                                                                                       \
   {                                                                                         \
      call;                                                                                  \
   }                                                                                         \
   catch (oneapi::mkl::exception const &ex)                                                  \
   {                                                                                         \
      hypre_printf("ONEMKL ERROR (code = %s) at %s:%d\n", ex.what(),                         \
                   __FILE__, __LINE__);                                                      \
      assert(0); exit(1);                                                                    \
   }

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      macros for wrapping thrust/oneDPL calls for error reporting
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* RL: TODO Want macro HYPRE_THRUST_CALL to return value but I don't know how to do it right
 * The following one works OK for now */

/* Wrappers that invoke a parallel-algorithm routine with hypre's execution
 * policy prepended to the caller's argument list.
 *
 * HYPRE_THRUST_CALL(func_name, ...) (CUDA and HIP): calls thrust::func_name
 * with a policy constructed from hypre_HandleDeviceAllocator(hypre_handle())
 * and attached, through .on(), to hypre_HandleComputeStream(hypre_handle()).
 *
 * HYPRE_ONEDPL_CALL(func_name, ...) (SYCL): calls func_name with a oneDPL
 * device policy made from the queue returned by hypre_HandleComputeStream().
 *
 * HYPRE_ONEDPL_CPU_CALL(func_name, ...) (SYCL): same, but the policy is made
 * from stream index HYPRE_MAX_NUM_STREAMS - 1 of the handle's device data
 * (presumably a queue reserved for host execution -- confirm).
 *
 * Every expansion already ends with ';'. */
#if defined(HYPRE_USING_CUDA)
#define HYPRE_THRUST_CALL(func_name, ...) \
   thrust::func_name(HYPRE_THRUST_EXECUTION(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);
#elif defined(HYPRE_USING_HIP)
#define HYPRE_THRUST_CALL(func_name, ...) \
   thrust::func_name(thrust::hip::par(hypre_HandleDeviceAllocator(hypre_handle())).on(hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);

#elif defined(HYPRE_USING_SYCL)

#define HYPRE_ONEDPL_CALL(func_name, ...)                                                    \
  func_name(oneapi::dpl::execution::make_device_policy(                                      \
           *hypre_HandleComputeStream(hypre_handle())), __VA_ARGS__);

#define HYPRE_ONEDPL_CPU_CALL(func_name, ...)                                                \
   func_name(oneapi::dpl::execution::make_device_policy(                                     \
             *hypre_DeviceDataStream(hypre_HandleDeviceData(hypre_handle()), HYPRE_MAX_NUM_STREAMS - 1)), __VA_ARGS__);

#endif

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      device info data structures
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* vendorSolverHandle_t: backend-neutral name for the dense-solver library
 * handle. It is cusolverDnHandle_t with cuSOLVER and rocblas_handle with
 * rocSOLVER; it is not defined when neither library is enabled. */
#if defined(HYPRE_USING_CUSOLVER)
typedef cusolverDnHandle_t vendorSolverHandle_t;
#elif defined(HYPRE_USING_ROCSOLVER)
typedef rocblas_handle     vendorSolverHandle_t;
#endif

/* hypre_DeviceData: per-handle collection of device state -- vendor library
 * handles, streams, the device allocator, the selected device, and run-time
 * options for SpGEMM, SpMV, transposition, Gauss-Seidel and random numbers.
 * Which members exist depends on the HYPRE_USING_* configuration macros. */
struct hypre_DeviceData
{
   /* Random-number generator; the member keeps the name "curand_generator"
    * for both the cuRAND and the rocRAND type */
#if defined(HYPRE_USING_CURAND)
   curandGenerator_t                 curand_generator;
#endif

#if defined(HYPRE_USING_ROCRAND)
   rocrand_generator                 curand_generator;
#endif

#if defined(HYPRE_USING_CUBLAS)
   cublasHandle_t                    cublas_handle;
#endif

   /* Sparse library handle; the member keeps the name "cusparse_handle"
    * for both the cuSPARSE and the rocSPARSE type */
#if defined(HYPRE_USING_CUSPARSE)
   cusparseHandle_t                  cusparse_handle;
#endif

#if defined(HYPRE_USING_ROCSPARSE)
   rocsparse_handle                  cusparse_handle;
#endif

#if defined(HYPRE_USING_CUSOLVER) || defined(HYPRE_USING_ROCSOLVER)
   vendorSolverHandle_t              vendor_solver_handle;
#endif

   /* Stream table; only present when HYPRE_USING_CUDA_STREAMS is defined.
    * The SYCL variant stores queue pointers, initialized to NULL. */
   /* TODO (VPM): Change to HYPRE_USING_GPU_STREAMS*/
#if defined(HYPRE_USING_CUDA_STREAMS)
#if defined(HYPRE_USING_CUDA)
   cudaStream_t                      streams[HYPRE_MAX_NUM_STREAMS];
#elif defined(HYPRE_USING_HIP)
   hipStream_t                       streams[HYPRE_MAX_NUM_STREAMS];
#elif defined(HYPRE_USING_SYCL)
   sycl::queue*                      streams[HYPRE_MAX_NUM_STREAMS] = {NULL};
#endif
#endif

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
   hypre_device_allocator            device_allocator;
#endif
   /* Selected device: a sycl::device pointer plus the maximum work-group size
    * for SYCL, otherwise an integer device field and a device_uvm field */
#if defined(HYPRE_USING_SYCL)
   sycl::device                     *device;
   HYPRE_Int                         device_max_work_group_size;
#else
   HYPRE_Int                         device;
   HYPRE_Int                         device_uvm;
#endif
   HYPRE_Int                         use_gpu_aware_mpi;
   HYPRE_Int                         gs_method; /* device G-S options */
   /* three-entry array; entry [2] is the one read by the
    * hypre_DeviceDataDeviceMaxShmemPerBlockInited accessor */
   hypre_int                         device_max_shmem_per_block[3];
   /* by default, hypre puts GPU computations in this stream
    * Do not be confused with the default (null) stream */
   HYPRE_Int                         compute_stream_num;
   /* work space for hypre's device reducer */
   void                             *reduce_buffer;
   /* device spgemm options */
   HYPRE_Int                         spgemm_algorithm;
   HYPRE_Int                         spgemm_binned;
   HYPRE_Int                         spgemm_num_bin;
   /* the highest bins for symbl [0] and numer [1]
    * which are not necessary to be `spgemm_num_bin' due to shmem limit on GPUs */
   HYPRE_Int                         spgemm_highest_bin[2];
   /* for bin i: ([0][i], [2][i]) = (max #block to launch, block dimension) for symbl
    *            ([1][i], [3][i]) = (max #block to launch, block dimension) for numer */
   HYPRE_Int                         spgemm_block_num_dim[4][HYPRE_SPGEMM_MAX_NBIN + 1];
   HYPRE_Int                         spgemm_rownnz_estimate_method;
   HYPRE_Int                         spgemm_rownnz_estimate_nsamples;
   float                             spgemm_rownnz_estimate_mult_factor;
   /* cusparse */
   /* flags for using the vendor library for SpMV, transposition and SpGEMM
    * (per the member names and the section label above -- exact value
    * semantics are not visible here) */
   HYPRE_Int                         spmv_use_vendor;
   HYPRE_Int                         sptrans_use_vendor;
   HYPRE_Int                         spgemm_use_vendor;
   /* PMIS RNG */
   HYPRE_Int                         use_gpu_rand;
};

/* Member accessors for hypre_DeviceData. Each macro takes a pointer to the
 * structure and expands to the named member as an lvalue. Accessors for
 * members that are compiled conditionally (device_uvm,
 * device_max_work_group_size, device_allocator) are only usable in the
 * configurations that declare those members.
 * hypre_DeviceDataDeviceMaxShmemPerBlockInited selects entry [2] of the
 * device_max_shmem_per_block array. */
#define hypre_DeviceDataDevice(data)                         ((data) -> device)
#define hypre_DeviceDataDeviceUVM(data)                      ((data) -> device_uvm)
#define hypre_DeviceDataDeviceMaxWorkGroupSize(data)         ((data) -> device_max_work_group_size)
#define hypre_DeviceDataUseGpuAwareMPI(data)                 ((data) -> use_gpu_aware_mpi)
#define hypre_DeviceDataGSMethod(data)                       ((data) -> gs_method)
#define hypre_DeviceDataDeviceMaxShmemPerBlock(data)         ((data) -> device_max_shmem_per_block)
#define hypre_DeviceDataDeviceMaxShmemPerBlockInited(data)  (((data) -> device_max_shmem_per_block)[2])
#define hypre_DeviceDataComputeStreamNum(data)               ((data) -> compute_stream_num)
#define hypre_DeviceDataReduceBuffer(data)                   ((data) -> reduce_buffer)
#define hypre_DeviceDataSpgemmUseVendor(data)                ((data) -> spgemm_use_vendor)
#define hypre_DeviceDataSpMVUseVendor(data)                  ((data) -> spmv_use_vendor)
#define hypre_DeviceDataSpTransUseVendor(data)               ((data) -> sptrans_use_vendor)
#define hypre_DeviceDataSpgemmAlgorithm(data)                ((data) -> spgemm_algorithm)
#define hypre_DeviceDataSpgemmBinned(data)                   ((data) -> spgemm_binned)
#define hypre_DeviceDataSpgemmNumBin(data)                   ((data) -> spgemm_num_bin)
#define hypre_DeviceDataSpgemmHighestBin(data)               ((data) -> spgemm_highest_bin)
#define hypre_DeviceDataSpgemmBlockNumDim(data)              ((data) -> spgemm_block_num_dim)
#define hypre_DeviceDataSpgemmRownnzEstimateMethod(data)     ((data) -> spgemm_rownnz_estimate_method)
#define hypre_DeviceDataSpgemmRownnzEstimateNsamples(data)   ((data) -> spgemm_rownnz_estimate_nsamples)
#define hypre_DeviceDataSpgemmRownnzEstimateMultFactor(data) ((data) -> spgemm_rownnz_estimate_mult_factor)
#define hypre_DeviceDataDeviceAllocator(data)                ((data) -> device_allocator)
#define hypre_DeviceDataUseGpuRand(data)                     ((data) -> use_gpu_rand)

/* Construction and destruction of a hypre_DeviceData object
 * (definitions live outside this header) */
hypre_DeviceData*     hypre_DeviceDataCreate();
void                  hypre_DeviceDataDestroy(hypre_DeviceData* data);

/* Getters for the vendor library handles held by hypre_DeviceData. These are
 * functions rather than member macros; the cuRAND/rocRAND and the
 * cuSPARSE/rocSPARSE pairs share one function name and differ only in the
 * returned handle type.
 * NOTE(review): presumably the handles are created on first use inside these
 * functions -- confirm against their definitions. */
#if defined(HYPRE_USING_CURAND)
curandGenerator_t     hypre_DeviceDataCurandGenerator(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_ROCRAND)
rocrand_generator     hypre_DeviceDataCurandGenerator(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUBLAS)
cublasHandle_t        hypre_DeviceDataCublasHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUSPARSE)
cusparseHandle_t      hypre_DeviceDataCusparseHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_ROCSPARSE)
rocsparse_handle      hypre_DeviceDataCusparseHandle(hypre_DeviceData *data);
#endif

#if defined(HYPRE_USING_CUSOLVER) || defined(HYPRE_USING_ROCSOLVER)
vendorSolverHandle_t  hypre_DeviceDataVendorSolverHandle(hypre_DeviceData *data);
#endif

/* Stream getters: hypre_DeviceDataStream takes a stream index i, and
 * hypre_DeviceDataComputeStream takes only the device data. The return type
 * is the backend's native stream type (a queue pointer for SYCL). */
/* TODO (VPM): Create a deviceStream_t to encapsulate all stream types below */
#if defined(HYPRE_USING_CUDA)
cudaStream_t          hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
cudaStream_t          hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#elif defined(HYPRE_USING_HIP)
hipStream_t           hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
hipStream_t           hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#elif defined(HYPRE_USING_SYCL)
sycl::queue*          hypre_DeviceDataStream(hypre_DeviceData *data, HYPRE_Int i);
sycl::queue*          hypre_DeviceDataComputeStream(hypre_DeviceData *data);
#endif

/* Data structure and accessor routines for Sparse Triangular Matrices.
 *
 * Holds the vendor descriptors, policies and work buffers used for the lower
 * (L) and upper (U) triangular solves. The member set depends on the backend:
 * cuSPARSE and rocSPARSE each provide their own info/policy types, and the
 * buffer layout switches between a pair of per-factor buffers and one shared
 * buffer depending on the cuSPARSE version. */
struct hypre_CsrsvData
{
   /* per-factor descriptors and analysis/solve policies, in vendor types */
#if defined(HYPRE_USING_CUSPARSE)
   hypre_cusparseSpSVDescr   info_L;
   hypre_cusparseSpSVDescr   info_U;
   cusparseSolvePolicy_t     analysis_policy;
   cusparseSolvePolicy_t     solve_policy;
#elif defined(HYPRE_USING_ROCSPARSE)
   rocsparse_mat_info        info_L;
   rocsparse_mat_info        info_U;
   rocsparse_analysis_policy analysis_policy;
   rocsparse_solve_policy    solve_policy;
#endif

   /* separate L/U buffers when cuSPARSE is at least CUSPARSE_SPSV_VERSION;
    * a single buffer in every other configuration */
#if defined(HYPRE_USING_CUSPARSE) && (CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION)
   size_t                    buffer_size_L;
   size_t                    buffer_size_U;
   char                     *buffer_L;
   char                     *buffer_U;
#else
   hypre_int                 buffer_size;
   char                     *buffer;
#endif

   /* Temporary array to save matrix values with modified diagonal */
   HYPRE_Complex            *mat_data;

   /* Flags for checking whether the analysis phase has been executed or not */
   HYPRE_Int                 analyzed_L;
   HYPRE_Int                 analyzed_U;
};

/* Member accessors for hypre_CsrsvData. Each macro takes a pointer to the
 * structure and expands to the named member. The buffer accessors follow the
 * same condition as the structure: the L/U pair when cuSPARSE is at least
 * CUSPARSE_SPSV_VERSION, the single-buffer pair otherwise. */
#define hypre_CsrsvDataInfoL(data)          ((data) -> info_L)
#define hypre_CsrsvDataInfoU(data)          ((data) -> info_U)
#define hypre_CsrsvDataAnalyzedL(data)      ((data) -> analyzed_L)
#define hypre_CsrsvDataAnalyzedU(data)      ((data) -> analyzed_U)
#define hypre_CsrsvDataSolvePolicy(data)    ((data) -> solve_policy)
#define hypre_CsrsvDataAnalysisPolicy(data) ((data) -> analysis_policy)
#if defined(HYPRE_USING_CUSPARSE) && (CUSPARSE_VERSION >= CUSPARSE_SPSV_VERSION)
#define hypre_CsrsvDataBufferSizeL(data)    ((data) -> buffer_size_L)
#define hypre_CsrsvDataBufferSizeU(data)    ((data) -> buffer_size_U)
#define hypre_CsrsvDataBufferL(data)        ((data) -> buffer_L)
#define hypre_CsrsvDataBufferU(data)        ((data) -> buffer_U)
#else
#define hypre_CsrsvDataBufferSize(data)     ((data) -> buffer_size)
#define hypre_CsrsvDataBuffer(data)         ((data) -> buffer)
#endif
#define hypre_CsrsvDataMatData(data)        ((data) -> mat_data)

/* hypre_GpuMatData: vendor-specific sparse-matrix objects. Exactly one member
 * group is compiled in, selected by the sparse library in use:
 *   cuSPARSE       - matrix descriptor and an SpMV work buffer
 *   rocSPARSE      - matrix descriptor and matrix info object
 *   oneMKL sparse  - matrix handle
 * The structure is empty when none of these libraries is enabled. */
struct hypre_GpuMatData
{
#if defined(HYPRE_USING_CUSPARSE)
   cusparseMatDescr_t                   mat_descr;
   char                                *spmv_buffer;

#elif defined(HYPRE_USING_ROCSPARSE)
   rocsparse_mat_descr                  mat_descr;
   rocsparse_mat_info                   mat_info;

#elif defined(HYPRE_USING_ONEMKLSPARSE)
   oneapi::mkl::sparse::matrix_handle_t mat_handle;
#endif
};

/* Member accessors for hypre_GpuMatData. All four macros are defined
 * unconditionally, but each one is only usable in a configuration whose
 * structure layout declares the member it names (mat_descr: cuSPARSE and
 * rocSPARSE; mat_info: rocSPARSE; mat_handle: oneMKL sparse;
 * spmv_buffer: cuSPARSE). */
#define hypre_GpuMatDataMatDescr(data)    ((data) -> mat_descr)
#define hypre_GpuMatDataMatInfo(data)     ((data) -> mat_info)
#define hypre_GpuMatDataMatHandle(data)   ((data) -> mat_handle)
#define hypre_GpuMatDataSpMVBuffer(data)  ((data) -> spmv_buffer)

#endif /* if defined(HYPRE_USING_GPU) */

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      generic device functions (cuda/hip/sycl)
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_GPU)
/* Return the value stored at ptr. When compiling device code for
 * __CUDA_ARCH__ >= 350 the load is issued through __ldg(); in every other
 * compilation pass it is an ordinary dereference. */
template <typename T>
static __device__ __forceinline__
T read_only_load( const T *ptr )
{
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 350
   return ptr[0];
#else
   return __ldg( ptr );
#endif
}

/* Round n up to a power of two.
 *   - n <= 0 yields 0
 *   - an exact power of two is returned unchanged
 *   - any other value yields the next larger power of two */
static __device__ __forceinline__
hypre_int next_power_of_2(hypre_int n)
{
   if (n <= 0)
   {
      return 0;
   }

   /* a power of two has a single bit set, so clearing the lowest set bit
    * leaves nothing behind */
   if ( !(n & (n - 1)) )
   {
      return n;
   }

   /* smear the highest set bit into every lower position */
   for (hypre_int shift = 1; shift <= 16; shift <<= 1)
   {
      n |= (n >> shift);
   }

   /* isolate the highest set bit, then double it */
   return (n ^ (n >> 1)) << 1;
}

/* Return a copy of bitmask whose n-th bit is inverted (a set bit is cleared,
 * a cleared bit is set); all other bits are left untouched */
static __device__ __forceinline__
hypre_mask hypre_mask_flip_at(hypre_mask bitmask, hypre_int n)
{
   const hypre_mask selected_bit = hypre_mask_one << n;

   return bitmask ^ selected_bit;
}

#endif // defined(HYPRE_USING_GPU)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      cuda/hip functions
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* Thread index within the block along one axis:
 * dim 0 -> x, dim 1 -> y, dim 2 -> z; any other dim gives -1.
 * The item argument is not used by the CUDA/HIP implementation. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_threadIdx(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   switch (dim)
   {
      case 0:
         return threadIdx.x;
      case 1:
         return threadIdx.y;
      case 2:
         return threadIdx.z;
      default:
         return -1;
   }
}

/* Block index within the grid along one axis:
 * dim 0 -> x, dim 1 -> y, dim 2 -> z; any other dim gives -1.
 * The item argument is not used by the CUDA/HIP implementation. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_blockIdx(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   switch (dim)
   {
      case 0:
         return blockIdx.x;
      case 1:
         return blockIdx.y;
      case 2:
         return blockIdx.z;
      default:
         return -1;
   }
}

/* Block extent (threads per block) along one axis:
 * dim 0 -> x, dim 1 -> y, dim 2 -> z; any other dim gives -1.
 * The item argument is not used by the CUDA/HIP implementation. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_blockDim(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   switch (dim)
   {
      case 0:
         return blockDim.x;
      case 1:
         return blockDim.y;
      case 2:
         return blockDim.z;
      default:
         return -1;
   }
}

/* Total thread count of a block that is treated as dim-dimensional
 * (dim = 1, 2 or 3): the product of the first dim block extents.
 * Any other dim gives -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_threads(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   if (dim == 1) { return (blockDim.x); }
   if (dim == 2) { return (blockDim.x * blockDim.y); }
   if (dim == 3) { return (blockDim.x * blockDim.y * blockDim.z); }

   return -1;
}

/* Linear index of the calling thread inside its block, for a block treated as
 * dim-dimensional (dim = 1, 2 or 3). x varies fastest, then y, then z.
 * Any other dim gives -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_thread_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   if (dim == 1)
   {
      return (threadIdx.x);
   }
   if (dim == 2)
   {
      return (threadIdx.y * blockDim.x + threadIdx.x);
   }
   if (dim == 3)
   {
      return (threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x +
              threadIdx.x);
   }

   return -1;
}

/* Warp count of a dim-dimensional block: the block's thread count shifted
 * right by HYPRE_WARP_BITSHIFT (any remainder is discarded by the shift) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_warps(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int threads_in_block = hypre_gpu_get_num_threads<dim>(item);

   return threads_in_block >> HYPRE_WARP_BITSHIFT;
}

/* Index of the calling thread's warp inside its block: the flattened thread
 * id shifted right by HYPRE_WARP_BITSHIFT */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_warp_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int flat_tid = hypre_gpu_get_thread_id<dim>(item);

   return flat_tid >> HYPRE_WARP_BITSHIFT;
}

/* Position of the calling thread inside its warp: the flattened thread id
 * masked with (HYPRE_WARP_SIZE - 1) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_lane_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int flat_tid = hypre_gpu_get_thread_id<dim>(item);

   return flat_tid & (HYPRE_WARP_SIZE - 1);
}

/* Total block count of a grid that is treated as dim-dimensional
 * (dim = 1, 2 or 3): the product of the first dim grid extents.
 * Any other dim gives -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_blocks()
{
   if (dim == 1) { return (gridDim.x); }
   if (dim == 2) { return (gridDim.x * gridDim.y); }
   if (dim == 3) { return (gridDim.x * gridDim.y * gridDim.z); }

   return -1;
}

/* Linear index of the calling thread's block inside the grid, for a grid
 * treated as dim-dimensional (dim = 1, 2 or 3). x varies fastest, then y,
 * then z. Any other dim gives -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_block_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   if (dim == 1)
   {
      return (blockIdx.x);
   }
   if (dim == 2)
   {
      return (blockIdx.y * gridDim.x + blockIdx.x);
   }
   if (dim == 3)
   {
      return (blockIdx.z * gridDim.x * gridDim.y + blockIdx.y * gridDim.x +
              blockIdx.x);
   }

   return -1;
}

/* Thread count of the whole grid: (blocks in a gdim-dimensional grid) times
 * (threads in a bdim-dimensional block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_threads(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int blocks_in_grid   = hypre_gpu_get_num_blocks<gdim>();
   const hypre_int threads_in_block = hypre_gpu_get_num_threads<bdim>(item);

   return blocks_in_grid * threads_in_block;
}

/* Linear index of the calling thread across the whole grid:
 * flattened block id * threads per block + flattened thread id in block */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_thread_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int flat_block       = hypre_gpu_get_block_id<gdim>(item);
   const hypre_int threads_in_block = hypre_gpu_get_num_threads<bdim>(item);
   const hypre_int flat_tid         = hypre_gpu_get_thread_id<bdim>(item);

   return flat_block * threads_in_block + flat_tid;
}

/* Warp count of the whole grid: (blocks in a gdim-dimensional grid) times
 * (warps in a bdim-dimensional block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_warps(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int blocks_in_grid = hypre_gpu_get_num_blocks<gdim>();
   const hypre_int warps_in_block = hypre_gpu_get_num_warps<bdim>(item);

   return blocks_in_grid * warps_in_block;
}

/* Linear index of the calling thread's warp across the whole grid:
 * flattened block id * warps per block + warp id in block */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_warp_id(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int flat_block     = hypre_gpu_get_block_id<gdim>(item);
   const hypre_int warps_in_block = hypre_gpu_get_num_warps<bdim>(item);
   const hypre_int warp_in_block  = hypre_gpu_get_warp_id<bdim>(item);

   return flat_block * warps_in_block + warp_in_block;
}

/* Software atomicAdd for hypre_double, compiled only for device passes with
 * __CUDA_ARCH__ < 600. It is built on atomicCAS over the 64-bit integer view
 * of the operand: the loop re-reads and retries until the compare-and-swap
 * observes the value it assumed.
 *
 * address - location to update
 * val     - amount to add
 * returns - the value that was stored at address before this add took effect */
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
static __device__ __forceinline__
hypre_double atomicAdd(hypre_double* address, hypre_double val)
{
   /* reinterpret the double as a 64-bit integer so atomicCAS can operate on it */
   hypre_ulonglongint* address_as_ull = (hypre_ulonglongint*) address;
   hypre_ulonglongint old = *address_as_ull, assumed;

   do
   {
      assumed = old;
      /* store (val + assumed) only if the location still holds "assumed";
       * atomicCAS returns what was actually there */
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val +
                                           __longlong_as_double(assumed)));

      // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
   }
   while (assumed != old);

   return __longlong_as_double(old);
}
#endif

/* Atomically add val to element pos of the array starting at address.
 * The previous value reported by atomicAdd is discarded. */
template <typename T>
static __device__ __forceinline__
void hypre_gpu_atomicAdd(hypre_int pos, T* address, T val)
{
   T *target = address + pos;

   atomicAdd(target, val);
}

// There are no *_sync functions in HIP < 6.5; the same fallbacks are used when
// HIP_DISABLE_WARP_SYNC_BUILTINS is defined.
//
// Each shim below has the signature of the corresponding *_sync intrinsic but
// ignores the participation mask and forwards to the mask-less HIP intrinsic.
//
// The whole condition is restricted to HIP builds: without the inner
// parentheses, "A && B || C" parses as "(A && B) || C", which would also emit
// these shims in a non-HIP build that happens to define
// HIP_DISABLE_WARP_SYNC_BUILTINS. The parenthesized form is the exact HIP-side
// complement of the condition under which the native *_sync intrinsics are used.
#if (defined(HYPRE_USING_HIP) && \
     (HIP_VERSION < 60500000 || defined(HIP_DISABLE_WARP_SYNC_BUILTINS)))

/* Read val from lane src_line (within groups of "width" lanes); mask is ignored */
template <typename T>
static __device__ __forceinline__
T __shfl_sync(hypre_mask mask, T val, hypre_int src_line, hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(mask);

   return __shfl(val, src_line, width);
}

/* Read val from the lane "delta" positions below the caller; mask is ignored */
template <typename T>
static __device__ __forceinline__
T __shfl_up_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(mask);

   return __shfl_up(val, delta, width);
}

/* Read val from the lane "delta" positions above the caller; mask is ignored */
template <typename T>
static __device__ __forceinline__
T __shfl_down_sync(hypre_mask mask, T val, hypre_uint delta, hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(mask);

   return __shfl_down(val, delta, width);
}

/* Read val from the lane whose id is (caller's lane id XOR lanemask); mask is ignored */
template <typename T>
static __device__ __forceinline__
T __shfl_xor_sync(hypre_mask mask, T val, hypre_int lanemask, hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(mask);

   return __shfl_xor(val, lanemask, width);
}

/* Warp barrier placeholder: intentionally empty in this fallback */
static __device__ __forceinline__
void __syncwarp()
{
}

/* Forward to __any(predicate); mask is ignored.
 * The mask parameter uses hypre_mask, like the other shims and like the
 * caller warp_any_sync, so a wide mask is not narrowed at the call site. */
static __device__ __forceinline__
hypre_int __any_sync(hypre_mask mask, hypre_int predicate)
{
   HYPRE_UNUSED_VAR(mask);

   return __any(predicate);
}

#endif // if (defined(HYPRE_USING_HIP) && (HIP_VERSION < 60500000 || defined(HIP_DISABLE_WARP_SYNC_BUILTINS)))

/* Warp vote: returns the ballot of "predicate" as a hypre_mask.
 * CUDA, and HIP >= 6.5 with the sync builtins enabled, use __ballot_sync with
 * the supplied mask; every other configuration uses the mask-less __ballot
 * and ignores the mask. The item argument is not used. */
static __device__ __forceinline__
hypre_mask hypre_ballot_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   HYPRE_UNUSED_VAR(item);

#if defined(HYPRE_USING_CUDA) ||\
  (defined(HYPRE_USING_HIP) && HIP_VERSION >= 60500000 && !defined(HIP_DISABLE_WARP_SYNC_BUILTINS))
   return __ballot_sync(mask, predicate);
#else
   HYPRE_UNUSED_VAR(mask);

   return __ballot(predicate);
#endif
}

/* Population count of mask. CUDA builds use __popc; all other builds use the
 * long-long variant __popcll. */
static __device__ __forceinline__
HYPRE_Int hypre_popc(hypre_mask mask)
{
#if !defined(HYPRE_USING_CUDA)
   return (HYPRE_Int) __popcll(mask);
#else
   return (HYPRE_Int) __popc(mask);
#endif
}

/* Find-first-set on mask, returning whatever the underlying intrinsic reports.
 * CUDA builds use __ffs; all other builds use the long-long variant __ffsll. */
static __device__ __forceinline__
HYPRE_Int hypre_ffs(hypre_mask mask)
{
#if !defined(HYPRE_USING_CUDA)
   return (HYPRE_Int) __ffsll(mask);
#else
   return (HYPRE_Int) __ffs(mask);
#endif
}

/* Block-wide barrier: forwards to __syncthreads().
 * The item argument is not used by the CUDA/HIP implementation. */
static __device__ __forceinline__
void block_sync(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);
   __syncthreads();
}

/* sync the warp
 *
 * Thin wrapper around __syncwarp() (which is the empty fallback defined above
 * on HIP builds lacking the builtin). "item" is unused here. */
static __device__ __forceinline__
void warp_sync(hypre_DeviceItem &item)
{
   HYPRE_UNUSED_VAR(item);

   __syncwarp();
}

/* exclusive prefix scan
 *
 * item    - device item (unused in the CUDA/HIP implementation)
 * lane_id - lane index of the calling thread, as supplied by the caller
 * in      - this lane's input value
 * all_sum - (output) value held by lane HYPRE_WARP_SIZE - 1 after the first
 *           (accumulation) phase, broadcast to every lane
 *
 * Returns this lane's scan result. All shuffles use HYPRE_WARP_FULL_MASK, so
 * the whole warp is expected to call this together. */
template <typename T>
static __device__ __forceinline__
T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
{
   HYPRE_UNUSED_VAR(item);

   /* Phase 1 (up-sweep): for d = 2, 4, ..., HYPRE_WARP_SIZE every lane fetches
    * the value of the lane d/2 below it; only lanes whose low bits (d - 1) are
    * all set accumulate it. */
#pragma unroll
   for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
   {
      T t = __shfl_up_sync(HYPRE_WARP_FULL_MASK, in, d >> 1);
      if ( (lane_id & (d - 1)) == (d - 1) )
      {
         in += t;
      }
   }

   /* broadcast the last lane's accumulated value to all lanes */
   all_sum = __shfl_sync(HYPRE_WARP_FULL_MASK, in, HYPRE_WARP_SIZE - 1);

   /* the last lane restarts from zero before the second phase */
   if (lane_id == HYPRE_WARP_SIZE - 1)
   {
      in = 0;
   }

   /* Phase 2 (down-sweep): for d = HYPRE_WARP_SIZE/2, ..., 1 each lane exchanges
    * with lane (lane_id ^ d). Among lanes whose low bits (d - 1) are all set,
    * the one that also has bit d set adds its partner's value, the other one
    * takes its partner's value. */
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      T t = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, d);

      if ( (lane_id & (d - 1)) == (d - 1))
      {
         if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
         {
            in += t;
         }
         else
         {
            in = t;
         }
      }
   }
   return in;
}

/* Warp vote: forwards "mask" and "predicate" to __any_sync ("item" is unused) */
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   HYPRE_UNUSED_VAR(item);

   const hypre_int vote = __any_sync(mask, predicate);

   return vote;
}

/* Warp shuffle: forwards to __shfl_sync(mask, val, src_line, width); "item" is unused */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
                    hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(item);

   const T fetched = __shfl_sync(mask, val, src_line, width);

   return fetched;
}

/* Warp shuffle-up: forwards to __shfl_up_sync(mask, val, delta, width); "item" is unused */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                       hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(item);

   const T fetched = __shfl_up_sync(mask, val, delta, width);

   return fetched;
}

/* Warp shuffle-down: forwards to __shfl_down_sync(mask, val, delta, width); "item" is unused */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                         hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(item);

   const T fetched = __shfl_down_sync(mask, val, delta, width);

   return fetched;
}

/* Warp shuffle-xor: forwards to __shfl_xor_sync(mask, val, lane_mask, width); "item" is unused */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
                        hypre_int width = HYPRE_WARP_SIZE)
{
   HYPRE_UNUSED_VAR(item);

   const T fetched = __shfl_xor_sync(mask, val, lane_mask, width);

   return fetched;
}

/* Warp sum via a shuffle-down tree over the full warp mask.
 * Each step adds the value held "offset" lanes above; offsets halve from
 * HYPRE_WARP_SIZE/2 down to 1. With this pattern the complete sum is expected
 * in lane 0 (use warp_allreduce_sum when every lane needs it). */
template <typename T>
static __device__ __forceinline__
T warp_reduce_sum(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_above = __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in += from_above;
   }

   return in;
}

/* Warp sum via an xor (butterfly) exchange over the full warp mask.
 * Each step adds the value of lane (lane ^ offset); offsets halve from
 * HYPRE_WARP_SIZE/2 down to 1. */
template <typename T>
static __device__ __forceinline__
T warp_allreduce_sum(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_partner = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in += from_partner;
   }

   return in;
}

/* Warp maximum via a shuffle-down tree over the full warp mask.
 * Each step keeps max(own value, value "offset" lanes above). With this
 * pattern the complete maximum is expected in lane 0. */
template <typename T>
static __device__ __forceinline__
T warp_reduce_max(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_above = __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in = max(in, from_above);
   }

   return in;
}

/* Warp maximum via an xor (butterfly) exchange over the full warp mask.
 * Each step keeps max(own value, value of lane (lane ^ offset)). */
template <typename T>
static __device__ __forceinline__
T warp_allreduce_max(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_partner = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in = max(in, from_partner);
   }

   return in;
}

/* Warp minimum via a shuffle-down tree over the full warp mask.
 * Each step keeps min(own value, value "offset" lanes above). With this
 * pattern the complete minimum is expected in lane 0. */
template <typename T>
static __device__ __forceinline__
T warp_reduce_min(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_above = __shfl_down_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in = min(in, from_above);
   }

   return in;
}

/* Warp minimum via an xor (butterfly) exchange over the full warp mask.
 * Each step keeps min(own value, value of lane (lane ^ offset)). */
template <typename T>
static __device__ __forceinline__
T warp_allreduce_min(hypre_DeviceItem &item, T in)
{
   HYPRE_UNUSED_VAR(item);

#pragma unroll
   for (hypre_int offset = HYPRE_WARP_SIZE >> 1; offset > 0; offset >>= 1)
   {
      const T from_partner = __shfl_xor_sync(HYPRE_WARP_FULL_MASK, in, offset);

      in = min(in, from_partner);
   }

   return in;
}

/* Unary functor converting a T1 value to T2 with a C-style cast.
 * Derives from thrust::unary_function only for Thrust versions older than
 * THRUST_VERSION_NOTFN. */
template<typename T1, typename T2>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct type_cast : public thrust::unary_function<T1, T2>
#else
struct type_cast
#endif
{
   __host__ __device__ T2 operator()(const T1 &x) const
   {
      const T2 converted = (T2) x;

      return converted;
   }
};

/* Unary functor returning -x when x < T(0) and x otherwise.
 * Derives from thrust::unary_function only for Thrust versions older than
 * THRUST_VERSION_NOTFN. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct absolute_value : public thrust::unary_function<T, T>
#else
struct absolute_value
#endif
{
   __host__ __device__ T operator()(const T &x) const
   {
      if (x < T(0))
      {
         return -x;
      }

      return x;
   }
};

/* Strict ordering on (T1, T2) tuples:
 *   - ascending in the first component;
 *   - for equal first components, descending in hypre_abs() of the second.
 * operator() is const-qualified (like type_cast / absolute_value) so the
 * comparator can be invoked through const objects. */
template<typename T1, typename T2>
struct TupleComp2
{
   typedef thrust::tuple<T1, T2> Tuple;

   __host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2) const
   {
      if (thrust::get<0>(t1) < thrust::get<0>(t2))
      {
         return true;
      }
      if (thrust::get<0>(t1) > thrust::get<0>(t2))
      {
         return false;
      }
      /* same first component: larger magnitude of the second one goes first */
      return hypre_abs(thrust::get<1>(t1)) > hypre_abs(thrust::get<1>(t2));
   }
};

/* Strict ordering on (T1, T2) tuples:
 *   - ascending in the first component;
 *   - for equal first components, a tuple whose two components are equal
 *     precedes the others, which are ordered ascending in the second component.
 * operator() is const-qualified (like type_cast / absolute_value) so the
 * comparator can be invoked through const objects. */
template<typename T1, typename T2>
struct TupleComp3
{
   typedef thrust::tuple<T1, T2> Tuple;

   __host__ __device__ bool operator()(const Tuple& t1, const Tuple& t2) const
   {
      if (thrust::get<0>(t1) < thrust::get<0>(t2))
      {
         return true;
      }
      if (thrust::get<0>(t1) > thrust::get<0>(t2))
      {
         return false;
      }
      /* same first component: nothing precedes a tuple with equal components */
      if (thrust::get<0>(t2) == thrust::get<1>(t2))
      {
         return false;
      }
      return thrust::get<0>(t1) == thrust::get<1>(t1) || thrust::get<1>(t1) < thrust::get<1>(t2);
   }
};

/* Unary predicate: true when x < 0.
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_negative : public thrust::unary_function<T, bool>
#else
struct is_negative
#endif
{
   __host__ __device__ bool operator()(const T &x) const
   {
      return (x < 0);
   }
};

/* Unary predicate: true when x > 0.
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_positive : public thrust::unary_function<T, bool>
#else
struct is_positive
#endif
{
   __host__ __device__ bool operator()(const T &x) const
   {
      return (x > 0);
   }
};

/* Unary predicate: true when x >= 0.
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct is_nonnegative : public thrust::unary_function<T, bool>
#else
struct is_nonnegative
#endif
{
   __host__ __device__ bool operator()(const T &x) const
   {
      return (x >= 0);
   }
};

/* Unary predicate: true when low <= x <= up (both bounds inclusive).
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct in_range : public thrust::unary_function<T, bool>
#else
struct in_range
#endif
{
   T low, up;   /* inclusive lower / upper bounds */

   in_range(T low_, T up_) : low(low_), up(up_) {}

   __host__ __device__ bool operator()(const T &x) const
   {
      return (x >= low && x <= up);
   }
};

/* Unary predicate: true when x < low or x > up (complement of in_range).
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct out_of_range : public thrust::unary_function<T, bool>
#else
struct out_of_range
#endif
{
   T low, up;   /* inclusive lower / upper bounds of the accepted interval */

   out_of_range(T low_, T up_) : low(low_), up(up_) {}

   __host__ __device__ bool operator()(const T &x) const
   {
      return (x < low || x > up);
   }
};

/* Unary predicate: true when x < val.
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct less_than : public thrust::unary_function<T, bool>
#else
struct less_than
#endif
{
   T val;   /* exclusive upper bound */

   less_than(T val_) : val(val_) {}

   __host__ __device__ bool operator()(const T &x) const
   {
      return (x < val);
   }
};

/* Unary functor returning x % val (val must be nonzero for integer T).
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the functor can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct modulo : public thrust::unary_function<T, T>
#else
struct modulo
#endif
{
   T val;   /* divisor */

   modulo(T val_) : val(val_) {}

   __host__ __device__ T operator()(const T &x) const
   {
      return (x % val);
   }
};

/* Unary predicate: true when x == val.
 * operator() is const-qualified (consistent with type_cast / absolute_value)
 * so the predicate can be invoked through const objects. */
template<typename T>
#if (defined(THRUST_VERSION) && THRUST_VERSION < THRUST_VERSION_NOTFN)
struct equal : public thrust::unary_function<T, bool>
#else
struct equal
#endif
{
   T val;   /* value compared against */

   equal(T val_) : val(val_) {}

   __host__ __device__ bool operator()(const T &x) const
   {
      return (x == val);
   }
};

/* Functor printing one HYPRE_Real per line with the "%f" format.
 * Callable from host and device code. */
struct print_functor
{
   __host__ __device__ void operator()(HYPRE_Real val)
   {
      printf("%f\n", val);
   }
};

#endif // defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      sycl functions
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

#if defined(HYPRE_USING_SYCL)

/* return the thread identifier in direction dim
 *
 * dim = 0, 1, 2 maps to local-id index 2, 1, 0 respectively;
 * any other dim yields -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_threadIdx(hypre_DeviceItem &item)
{
   if constexpr (dim >= 0 && dim <= 2)
   {
      return item.get_local_id(2 - dim);
   }
   else
   {
      return -1;
   }
}

/* return the block identifier in direction dim
 *
 * dim = 0, 1, 2 maps to group index 2, 1, 0 respectively;
 * any other dim yields -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_blockIdx(hypre_DeviceItem &item)
{
   if constexpr (dim >= 0 && dim <= 2)
   {
      return item.get_group(2 - dim);
   }
   else
   {
      return -1;
   }
}

/* return the block dimension in direction dim
 *
 * dim = 0, 1, 2 maps to local-range index 2, 1, 0 respectively;
 * any other dim yields -1. */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_blockDim(hypre_DeviceItem &item)
{
   if constexpr (dim >= 0 && dim <= 2)
   {
      return item.get_local_range(2 - dim);
   }
   else
   {
      return -1;
   }
}

/* return the number of threads in block
 * (total size of the local range; the template argument is not used) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_threads(hypre_DeviceItem &item)
{
   const auto local_size = item.get_local_range().size();

   return static_cast<hypre_int>(local_size);
}

/* return the flattened thread id in block
 * (local linear id; the template argument is not used) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_thread_id(hypre_DeviceItem &item)
{
   const auto linear_id = item.get_local_linear_id();

   return static_cast<hypre_int>(linear_id);
}

/* return the number of warps in block
 * (thread count shifted right by HYPRE_WARP_BITSHIFT) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_warps(hypre_DeviceItem &item)
{
   const hypre_int num_threads = hypre_gpu_get_num_threads<dim>(item);

   return num_threads >> HYPRE_WARP_BITSHIFT;
}

/* return the warp id in block
 * (flattened thread id shifted right by HYPRE_WARP_BITSHIFT) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_warp_id(hypre_DeviceItem &item)
{
   const hypre_int thread_id = hypre_gpu_get_thread_id<dim>(item);

   return thread_id >> HYPRE_WARP_BITSHIFT;
}

/* return the thread lane id in warp
 * (local linear id within the sub-group; the template argument is not used) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_lane_id(hypre_DeviceItem &item)
{
   const auto lane = item.get_sub_group().get_local_linear_id();

   return static_cast<hypre_int>(lane);
}

/* return the number of blocks in grid
 * (total size of the group range; the template argument is not used) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_num_blocks(hypre_DeviceItem &item)
{
   const auto num_groups = item.get_group_range().size();

   return static_cast<hypre_int>(num_groups);
}

/* return the flattened block id in grid
 * (group linear id, implicitly converted to hypre_int; the template argument is not used) */
template <hypre_int dim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_block_id(hypre_DeviceItem &item)
{
   const hypre_int group_id = item.get_group_linear_id();

   return group_id;
}

/* return the number of threads in grid
 * (number of blocks times number of threads per block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_threads(hypre_DeviceItem &item)
{
   const hypre_int num_blocks        = hypre_gpu_get_num_blocks<gdim>(item);
   const hypre_int threads_per_block = hypre_gpu_get_num_threads<bdim>(item);

   return num_blocks * threads_per_block;
}

/* return the flattened thread id in grid
 * (block id * threads per block + thread id within the block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_thread_id(hypre_DeviceItem &item)
{
   const hypre_int block_id          = hypre_gpu_get_block_id<gdim>(item);
   const hypre_int threads_per_block = hypre_gpu_get_num_threads<bdim>(item);
   const hypre_int thread_in_block   = hypre_gpu_get_thread_id<bdim>(item);

   return block_id * threads_per_block + thread_in_block;
}

/* return the number of warps in grid
 * (number of blocks times number of warps per block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_num_warps(hypre_DeviceItem &item)
{
   const hypre_int num_blocks      = hypre_gpu_get_num_blocks<gdim>(item);
   const hypre_int warps_per_block = hypre_gpu_get_num_warps<bdim>(item);

   return num_blocks * warps_per_block;
}

/* return the flattened warp id in nd_range
 * (block id * warps per block + warp id within the block) */
template <hypre_int bdim, hypre_int gdim>
static __device__ __forceinline__
hypre_int hypre_gpu_get_grid_warp_id(hypre_DeviceItem &item)
{
   const hypre_int block_id        = hypre_gpu_get_block_id<gdim>(item);
   const hypre_int warps_per_block = hypre_gpu_get_num_warps<bdim>(item);
   const hypre_int warp_in_block   = hypre_gpu_get_warp_id<bdim>(item);

   return block_id * warps_per_block + warp_in_block;
}

/* Perform atomic add operation
 *
 * Atomically adds "val" to address[pos].
 *
 * The atomic_ref is declared with relaxed ordering, device scope and the
 * *local* address space, so "address" is expected to point to local
 * (work-group) memory. The addition is done with a compare-exchange loop
 * rather than fetch_add. */
template <typename T>
static __device__ __forceinline__
void hypre_gpu_atomicAdd(hypre_int pos, T* address, T val)
{
   auto atomic_val =
      sycl::atomic_ref<T, sycl::memory_order::relaxed,
      sycl::memory_scope::device, sycl::access::address_space::local_space>
      (address[pos]);
   /* snapshot of the current value; a failed compare-exchange below refreshes
    * "curr" with the observed value, so the loop retries with up-to-date data */
   auto curr = atomic_val.load(sycl::memory_order::relaxed);
   while (!atomic_val.compare_exchange_strong(curr, curr + val, sycl::memory_order::relaxed)) {}
}

/* sync the thread block
 * (group barrier over the work-group of "item") */
static __device__ __forceinline__
void block_sync(hypre_DeviceItem &item)
{
   sycl::group_barrier(item.get_group());
}

/* sync the warp
 * (group barrier over the sub-group of "item") */
static __device__ __forceinline__
void warp_sync(hypre_DeviceItem &item)
{
   sycl::group_barrier(item.get_sub_group());
}

/* exclusive prefix scan
 *
 * item    - device item; its sub-group plays the role of the warp
 * lane_id - lane index of the calling work-item, as supplied by the caller
 * in      - this lane's input value
 * all_sum - (output) value held by lane HYPRE_WARP_SIZE - 1 after the first
 *           (accumulation) phase, broadcast to the sub-group
 *
 * Returns this lane's scan result. The loops are written for
 * HYPRE_WARP_SIZE lanes. */
template <typename T>
static __device__ __forceinline__
T warp_prefix_sum(hypre_DeviceItem &item, hypre_int lane_id, T in, T &all_sum)
{
   /* Phase 1 (up-sweep): for d = 2, 4, ..., HYPRE_WARP_SIZE every lane fetches
    * the value shifted right by d/2 lanes; only lanes whose low bits (d - 1)
    * are all set accumulate it. */
#pragma unroll
   for (hypre_int d = 2; d <= HYPRE_WARP_SIZE; d <<= 1)
   {
      T t = sycl::shift_group_right(item.get_sub_group(), in, d >> 1);
      if ( (lane_id & (d - 1)) == (d - 1) )
      {
         in += t;
      }
   }

   /* broadcast the last lane's accumulated value */
   all_sum = sycl::group_broadcast(item.get_sub_group(), in, HYPRE_WARP_SIZE - 1);

   /* the last lane restarts from zero before the second phase */
   if (lane_id == HYPRE_WARP_SIZE - 1)
   {
      in = 0;
   }

   /* Phase 2 (down-sweep): for d = HYPRE_WARP_SIZE/2, ..., 1 each lane exchanges
    * with lane (lane_id ^ d). Among lanes whose low bits (d - 1) are all set,
    * the one that also has bit d set adds its partner's value, the other one
    * takes its partner's value. */
#pragma unroll
   for (hypre_int d = HYPRE_WARP_SIZE >> 1; d > 0; d >>= 1)
   {
      T t = sycl::permute_group_by_xor(item.get_sub_group(), in, d);

      if ( (lane_id & (d - 1)) == (d - 1))
      {
         if ( (lane_id & ((d << 1) - 1)) == ((d << 1) - 1) )
         {
            in += t;
         }
         else
         {
            in = t;
         }
      }
   }
   return in;
}

/* Sub-group ballot.
 *
 * Every lane contributes its own bit (1 << lane id) when that bit is set in
 * "mask" and "predicate" is nonzero, and 0 otherwise; the contributions are
 * combined with a sub-group reduction. Since each lane owns a distinct bit,
 * the sum equals the bitwise OR of the contributions.
 *
 * The lane bit is built in hypre_mask arithmetic: the previous "0x1 << id"
 * was a signed int shift, which reaches the sign bit for lane 31 and made the
 * reduction operate on signed values. */
static __device__ __forceinline__
hypre_mask hypre_ballot_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   const hypre_mask lane_bit =
      static_cast<hypre_mask>(1) << item.get_sub_group().get_local_linear_id();
   const hypre_mask vote =
      ((mask & lane_bit) && predicate) ? lane_bit : static_cast<hypre_mask>(0);

   return sycl::reduce_over_group(item.get_sub_group(), vote, sycl::ext::oneapi::plus<>());
}

/* Population count of a mask via sycl::popcount */
static __device__ __forceinline__
HYPRE_Int hypre_popc(hypre_mask mask)
{
   const auto num_set_bits = sycl::popcount(mask);

   return (HYPRE_Int) num_set_bits;
}

/* Find-first-set on a mask via dpct::ffs<HYPRE_Int> */
static __device__ __forceinline__
HYPRE_Int hypre_ffs(hypre_mask mask)
{
   const auto first_set = dpct::ffs<HYPRE_Int>(mask);

   return (HYPRE_Int) first_set;
}

/* Sub-group vote via sycl::any_of_group on "predicate"; "mask" is ignored */
static __device__ __forceinline__
hypre_int warp_any_sync(hypre_DeviceItem &item, hypre_mask mask, hypre_int predicate)
{
   HYPRE_UNUSED_VAR(mask);

   const hypre_int vote = sycl::any_of_group(item.get_sub_group(), predicate);

   return vote;
}

/* Sub-group shuffle (full width): broadcasts "val" from lane "src_line"
 * with sycl::group_broadcast; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line)
{
   HYPRE_UNUSED_VAR(mask);

   const T fetched = sycl::group_broadcast(item.get_sub_group(), val, src_line);

   return fetched;
}

/* Sub-group shuffle restricted to segments of "width" consecutive lanes:
 * each lane reads from lane (segment start + src_line % width) of its own
 * segment; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int src_line,
                    hypre_int width)
{
   HYPRE_UNUSED_VAR(mask);

   const hypre_int lane          = hypre_gpu_get_lane_id<1>(item);
   const hypre_int segment_start = (lane / width) * width;
   const hypre_int source_lane   = segment_start + (src_line % width);

   return sycl::select_from_group(item.get_sub_group(), val, source_lane);
}

/* Sub-group shuffle-up (full width): forwards to sycl::shift_group_right
 * with shift "delta"; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
   HYPRE_UNUSED_VAR(mask);

   const T fetched = sycl::shift_group_right(item.get_sub_group(), val, delta);

   return fetched;
}

/* Sub-group shuffle-up restricted to segments of "width" consecutive lanes.
 *
 * Each lane reads "val" from the lane "delta" positions below it, provided
 * that lane belongs to the same segment; otherwise the lane reads its own
 * value. "mask" is ignored.
 *
 * The in-segment test compares "delta" against the lane's offset inside its
 * segment. The earlier form "lane_id - delta >= group_start" mixed a signed
 * lane id with the unsigned "delta": the subtraction wrapped around whenever
 * delta > lane_id, the test passed, and an out-of-range source lane was
 * handed to select_from_group. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_up_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                       hypre_int width)
{
   HYPRE_UNUSED_VAR(mask);

   const hypre_int  lane_id        = hypre_gpu_get_lane_id<1>(item);
   const hypre_int  group_start    = (lane_id / width) * width;
   /* offset of this lane inside its segment, always in [0, width) */
   const hypre_uint lane_in_group  = static_cast<hypre_uint>(lane_id - group_start);
   const hypre_int  src_in_warp    = (delta <= lane_in_group) ?
                                     lane_id - static_cast<hypre_int>(delta) : lane_id;

   return sycl::select_from_group(item.get_sub_group(), val, src_in_warp);
}

/* Sub-group shuffle-down (full width): forwards to sycl::shift_group_left
 * with shift "delta"; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta)
{
   HYPRE_UNUSED_VAR(mask);

   const T fetched = sycl::shift_group_left(item.get_sub_group(), val, delta);

   return fetched;
}

/* Sub-group shuffle-down restricted to segments of "width" consecutive lanes:
 * each lane reads from lane (lane + delta) when that lane does not lie past
 * the end of its own segment, and from itself otherwise; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_down_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_uint delta,
                         hypre_int width)
{
   HYPRE_UNUSED_VAR(mask);

   const hypre_int lane        = hypre_gpu_get_lane_id<1>(item);
   const hypre_int segment_end = ((lane / width) + 1) * width - 1;
   hypre_int       source_lane = lane;

   if (lane + delta <= segment_end)
   {
      source_lane = lane + delta;
   }

   return sycl::select_from_group(item.get_sub_group(), val, source_lane);
}

/* Sub-group shuffle-xor (full width): forwards to sycl::permute_group_by_xor
 * with "lane_mask"; "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask)
{
   HYPRE_UNUSED_VAR(mask);

   const T fetched = sycl::permute_group_by_xor(item.get_sub_group(), val, lane_mask);

   return fetched;
}

/* Sub-group shuffle-xor restricted to segments of "width" consecutive lanes:
 * each lane reads from lane (lane ^ lane_mask) unless that lane lies past the
 * end of its own segment, in which case it reads its own value.
 * "mask" is ignored. */
template <typename T>
static __device__ __forceinline__
T warp_shuffle_xor_sync(hypre_DeviceItem &item, hypre_mask mask, T val, hypre_int lane_mask,
                        hypre_int width)
{
   HYPRE_UNUSED_VAR(mask);

   const hypre_int lane        = hypre_gpu_get_lane_id<1>(item);
   const hypre_int segment_end = ((lane / width) + 1) * width - 1;
   const hypre_int partner     = lane ^ lane_mask;
   hypre_int       source_lane = partner;

   if (partner > segment_end)
   {
      source_lane = lane;
   }

   return sycl::select_from_group(item.get_sub_group(), val, source_lane);
}

/* Sub-group sum via a shift-left tree: each step adds the value shifted in
 * from "offset" lanes above; offsets halve from HYPRE_WARP_SIZE/2 down to 1. */
template <typename T>
static __forceinline__
T warp_reduce_sum(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in += sycl::shift_group_left(item.get_sub_group(), in, offset);
      offset >>= 1;
   }

   return in;
}

/* Sub-group sum via an xor (butterfly) exchange: each step adds the value of
 * lane (lane ^ offset); offsets halve from HYPRE_WARP_SIZE/2 down to 1. */
template <typename T>
static __forceinline__
T warp_allreduce_sum(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in += sycl::permute_group_by_xor(item.get_sub_group(), in, offset);
      offset >>= 1;
   }

   return in;
}

/* Sub-group maximum via a shift-left tree: each step keeps
 * std::max(own value, value shifted in from "offset" lanes above). */
template <typename T>
static __forceinline__
T warp_reduce_max(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in = std::max(in, sycl::shift_group_left(item.get_sub_group(), in, offset));
      offset >>= 1;
   }

   return in;
}

/* Sub-group maximum via an xor (butterfly) exchange: each step keeps
 * std::max(own value, value of lane (lane ^ offset)). */
template <typename T>
static __forceinline__
T warp_allreduce_max(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in = std::max(in, sycl::permute_group_by_xor(item.get_sub_group(), in, offset));
      offset >>= 1;
   }

   return in;
}

/* Sub-group minimum via a shift-left tree: each step keeps
 * std::min(own value, value shifted in from "offset" lanes above). */
template <typename T>
static __forceinline__
T warp_reduce_min(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in = std::min(in, sycl::shift_group_left(item.get_sub_group(), in, offset));
      offset >>= 1;
   }

   return in;
}

/* Sub-group minimum via an xor (butterfly) exchange: each step keeps
 * std::min(own value, value of lane (lane ^ offset)). */
template <typename T>
static __forceinline__
T warp_allreduce_min(hypre_DeviceItem &item, T in)
{
   hypre_int offset = HYPRE_WARP_SIZE >> 1;

   while (offset > 0)
   {
      in = std::min(in, sycl::permute_group_by_xor(item.get_sub_group(), in, offset));
      offset >>= 1;
   }

   return in;
}

/* Unary predicate: true when x < 0 (x defaults to T()) */
template<typename T>
struct is_negative
{
   is_negative() {}

   constexpr bool operator()(const T &x = T()) const
   {
      return (x < 0);
   }
};

/* Unary predicate: true when x > 0 (x defaults to T()) */
template<typename T>
struct is_positive
{
   is_positive() {}

   constexpr bool operator()(const T &x = T()) const
   {
      return (x > 0);
   }
};

/* Unary predicate: true when x >= 0 (x defaults to T()) */
template<typename T>
struct is_nonnegative
{
   is_nonnegative() {}

   constexpr bool operator()(const T &x = T()) const
   {
      return (x >= 0);
   }
};

/* Unary predicate: true when low <= x <= high (both bounds inclusive).
 * Both bounds default to T(). */
template<typename T>
struct in_range
{
   T low, high;

   in_range(T low_ = T(), T high_ = T()) : low(low_), high(high_) {}

   constexpr bool operator()(const T &x) const
   {
      return (x >= low && x <= high);
   }
};

/* Unary predicate: true when x < low or x > high (complement of in_range).
 * Both bounds default to T(). */
template<typename T>
struct out_of_range
{
   T low, high;

   out_of_range(T low_ = T(), T high_ = T()) : low(low_), high(high_) {}

   constexpr bool operator()(const T &x) const
   {
      return (x < low || x > high);
   }
};

/* Unary predicate: true when x < val (val defaults to T()) */
template<typename T>
struct less_than
{
   T val;

   less_than(T val_ = T()) : val(val_) {}

   constexpr bool operator()(const T &x) const
   {
      return (x < val);
   }
};

/* Unary functor returning x % val.
 * Note that val defaults to T(), which must not be used as a divisor. */
template<typename T>
struct modulo
{
   T val;

   modulo(T val_ = T()) : val(val_) {}

   constexpr T operator()(const T &x) const
   {
      return (x % val);
   }
};

/* Unary predicate: true when x == val (val defaults to T()) */
template<typename T>
struct equal
{
   T val;

   equal(T val_ = T()) : val(val_) {}

   constexpr bool operator()(const T &x) const
   {
      return (x == val);
   }
};

/* Unary functor converting a T1 value to T2 with a C-style cast
 * (x defaults to T1()) */
template<typename T1, typename T2>
struct type_cast
{
   constexpr T2 operator()(const T1 &x = T1()) const
   {
      return (T2) x;
   }
};

/* Unary functor returning -x when x < T(0) and x otherwise */
template<typename T>
struct absolute_value
{
   constexpr T operator()(const T &x) const
   {
      return x < T(0) ? -x : x;
   }
};

/* Strict ordering on std::tuple<T...>:
 *   - ascending in element 0;
 *   - for equal element 0, descending in hypre_abs() of element 1. */
template<typename... T>
struct TupleComp2
{
   typedef std::tuple<T...> Tuple;
   bool operator()(const Tuple& t1, const Tuple& t2)
   {
      const auto &key1 = std::get<0>(t1);
      const auto &key2 = std::get<0>(t2);

      if (key1 < key2)
      {
         return true;
      }
      if (key1 > key2)
      {
         return false;
      }

      /* same leading element: larger magnitude of element 1 goes first */
      return hypre_abs(std::get<1>(t1)) > hypre_abs(std::get<1>(t2));
   }
};

/* Strict ordering on std::tuple<T...>:
 *   - ascending in element 0;
 *   - for equal element 0, a tuple whose elements 0 and 1 are equal precedes
 *     the others, which are ordered ascending in element 1. */
template<typename... T>
struct TupleComp3
{
   typedef std::tuple<T...> Tuple;
   bool operator()(const Tuple& t1, const Tuple& t2)
   {
      const auto &a0 = std::get<0>(t1);
      const auto &a1 = std::get<1>(t1);
      const auto &b0 = std::get<0>(t2);
      const auto &b1 = std::get<1>(t2);

      if (a0 < b0)
      {
         return true;
      }
      if (a0 > b0)
      {
         return false;
      }
      /* same leading element: nothing precedes a tuple with equal elements */
      if (b0 == b1)
      {
         return false;
      }
      if (a0 == a1)
      {
         return true;
      }
      return a1 < b1;
   }
};

#endif // #if defined(HYPRE_USING_SYCL)

/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 *      end of functions defined here
 * - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */

/* device_utils.c */
#if defined(HYPRE_USING_GPU)
/* Prototypes only; per the section comment above, the definitions live in
 * device_utils.c. The remarks below are inferred from names and signatures
 * and should be confirmed against the definitions. */

/* presumably the default block (work-group) dimensions for kernel launches */
dim3 hypre_GetDefaultDeviceBlockDimension();

/* presumably the grid dimensions needed to cover n items with blocks of size
 * bDim, "granularity" selecting the unit of work per item -- TODO confirm the
 * accepted granularity strings */
dim3 hypre_GetDefaultDeviceGridDimension( HYPRE_Int n, const char *granularity, dim3 bDim );

/* overloads building a dim3 from one, two or three HYPRE_Int extents */
dim3 hypre_dim3(HYPRE_Int x);
dim3 hypre_dim3(HYPRE_Int x, HYPRE_Int y);
dim3 hypre_dim3(HYPRE_Int x, HYPRE_Int y, HYPRE_Int z);

/* N entries, two key arrays and one value array; "opt" presumably selects the
 * ordering -- TODO confirm */
template <typename T1, typename T2, typename T3>
HYPRE_Int hypreDevice_StableSortByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2,
                                           T3 *vals, HYPRE_Int opt);

/* same as above with two value arrays */
template <typename T1, typename T2, typename T3, typename T4>
HYPRE_Int hypreDevice_StableSortTupleByTupleKey(HYPRE_Int N, T1 *keys1, T2 *keys2,
                                                T3 *vals1, T4 *vals2, HYPRE_Int opt);

/* N input entries keyed by (keys1_in, keys2_in); results go to the *_out
 * arrays -- NOTE(review): meaning of the return value not visible here */
template <typename T1, typename T2, typename T3>
HYPRE_Int hypreDevice_ReduceByTupleKey(HYPRE_Int N, T1 *keys1_in, T2 *keys2_in,
                                       T3 *vals_in, T1 *keys1_out, T2 *keys2_out,
                                       T3 *vals_out);

/* presumably writes the constant v to x at the n positions listed in map */
template <typename T>
HYPRE_Int hypreDevice_ScatterConstant(T *x, HYPRE_Int n, HYPRE_Int *map, T v);

/* presumably accumulates the ny entries of y into x at the positions listed
 * in map; "work" is a caller-supplied work buffer -- TODO confirm size rules */
HYPRE_Int hypreDevice_GenScatterAdd(HYPRE_Real *x, HYPRE_Int ny, HYPRE_Int *map,
                                    HYPRE_Real *y, char *work);

/* presumably expands CSR row pointers (nrows rows, nnz entries) into per-entry
 * row indices taken from d_row_num and written to d_row_ind -- TODO confirm */
template <typename T>
HYPRE_Int hypreDevice_CsrRowPtrsToIndicesWithRowNum(HYPRE_Int nrows, HYPRE_Int nnz,
                                                    HYPRE_Int *d_row_ptr, T *d_row_num, T *d_row_ind);

#endif

#if defined(HYPRE_USING_CUSPARSE)

cudaDataType hypre_HYPREComplexToCudaDataType();

#if CUSPARSE_VERSION >= CUSPARSE_NEWAPI_VERSION
cusparseIndexType_t hypre_HYPREIntToCusparseIndexType();
#endif

#endif // #if defined(HYPRE_USING_CUSPARSE)

#endif /* #ifndef HYPRE_CUDA_UTILS_H */
/******************************************************************************
 * Copyright (c) 1998 Lawrence Livermore National Security, LLC and other
 * HYPRE Project Developers. See the top-level COPYRIGHT file for details.
 *
 * SPDX-License-Identifier: (Apache-2.0 OR MIT)
 ******************************************************************************/

/* CUDA reducer class */

#ifndef HYPRE_CUDA_REDUCER_H
#define HYPRE_CUDA_REDUCER_H

#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#if !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS)

template<typename T> void OneBlockReduce(T *d_arr, HYPRE_Int N, T *h_out);

/* Pair of HYPRE_Real values (x, y) usable as a reduction value type:
 * supports assignment from a scalar and component-wise "+=". */
struct HYPRE_Real2
{
   HYPRE_Real x, y;

   /* leaves both components uninitialized */
   __host__ __device__
   HYPRE_Real2() {}

   __host__ __device__
   HYPRE_Real2(HYPRE_Real x1, HYPRE_Real x2)
   {
      y = x2;
      x = x1;
   }

   /* set every component to val */
   __host__ __device__
   void operator=(HYPRE_Real val)
   {
      x = val;
      y = val;
   }

   /* component-wise accumulation */
   __host__ __device__
   void operator+=(HYPRE_Real2 rhs)
   {
      y += rhs.y;
      x += rhs.x;
   }

};

/* Quadruple of HYPRE_Real values (u, v, w, x) usable as a reduction value
 * type: supports assignment from a scalar and component-wise "+=". */
struct HYPRE_Real4
{
   HYPRE_Real u, v, w, x;

   /* leaves all components uninitialized */
   __host__ __device__
   HYPRE_Real4() {}

   __host__ __device__
   HYPRE_Real4(HYPRE_Real x1, HYPRE_Real x2, HYPRE_Real x3, HYPRE_Real x4)
   {
      x = x4;
      w = x3;
      v = x2;
      u = x1;
   }

   /* set every component to val */
   __host__ __device__
   void operator=(HYPRE_Real val)
   {
      u = val;
      v = val;
      w = val;
      x = val;
   }

   /* component-wise accumulation */
   __host__ __device__
   void operator+=(HYPRE_Real4 rhs)
   {
      x += rhs.x;
      w += rhs.w;
      v += rhs.v;
      u += rhs.u;
   }

};

/* Sextuple of HYPRE_Real values (u, v, w, x, y, z) usable as a reduction
 * value type: supports assignment from a scalar and component-wise "+=". */
struct HYPRE_Real6
{
   HYPRE_Real u, v, w, x, y, z;

   /* leaves all components uninitialized */
   __host__ __device__
   HYPRE_Real6() {}

   __host__ __device__
   HYPRE_Real6(HYPRE_Real x1, HYPRE_Real x2, HYPRE_Real x3,
               HYPRE_Real x4, HYPRE_Real x5, HYPRE_Real x6)
   {
      z = x6;
      y = x5;
      x = x4;
      w = x3;
      v = x2;
      u = x1;
   }

   /* set every component to val */
   __host__ __device__
   void operator=(HYPRE_Real val)
   {
      u = val;
      v = val;
      w = val;
      x = val;
      y = val;
      z = val;
   }

   /* component-wise accumulation */
   __host__ __device__
   void operator+=(HYPRE_Real6 rhs)
   {
      z += rhs.z;
      y += rhs.y;
      x += rhs.x;
      w += rhs.w;
      v += rhs.v;
      u += rhs.u;
   }

};

/* reduction within a warp (scalar)
 *
 * Device code: shuffle-down tree over the full warp mask, with offsets
 * warpSize/2, ..., 1. Host code: returns val unchanged. */
__inline__ __host__ __device__
HYPRE_Real warpReduceSum(HYPRE_Real val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   HYPRE_Int offset = warpSize / 2;

   while (offset > 0)
   {
      val += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val, offset);
      offset /= 2;
   }
#endif
   return val;
}

/* reduction within a warp (HYPRE_Real2, component-wise)
 *
 * Device code: shuffle-down tree over the full warp mask, with offsets
 * warpSize/2, ..., 1. Host code: returns val unchanged. */
__inline__ __host__ __device__
HYPRE_Real2 warpReduceSum(HYPRE_Real2 val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   HYPRE_Int offset = warpSize / 2;

   while (offset > 0)
   {
      val.x += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.x, offset);
      val.y += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.y, offset);
      offset /= 2;
   }
#endif
   return val;
}

/* reduction within a warp (HYPRE_Real4, component-wise)
 *
 * Device code: shuffle-down tree over the full warp mask, with offsets
 * warpSize/2, ..., 1. Host code: returns val unchanged. */
__inline__ __host__ __device__
HYPRE_Real4 warpReduceSum(HYPRE_Real4 val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   HYPRE_Int offset = warpSize / 2;

   while (offset > 0)
   {
      val.u += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.u, offset);
      val.v += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.v, offset);
      val.w += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.w, offset);
      val.x += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.x, offset);
      offset /= 2;
   }
#endif
   return val;
}

/* reduction within a warp (HYPRE_Real6, component-wise)
 *
 * Device code: shuffle-down tree over the full warp mask, with offsets
 * warpSize/2, ..., 1. Host code: returns val unchanged. */
__inline__ __host__ __device__
HYPRE_Real6 warpReduceSum(HYPRE_Real6 val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   HYPRE_Int offset = warpSize / 2;

   while (offset > 0)
   {
      val.u += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.u, offset);
      val.v += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.v, offset);
      val.w += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.w, offset);
      val.x += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.x, offset);
      val.y += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.y, offset);
      val.z += __shfl_down_sync(HYPRE_WARP_FULL_MASK, val.z, offset);
      offset /= 2;
   }
#endif
   return val;
}

/* reduction within a block
 *
 * Two-stage sum of "val" over the calling thread block:
 *   1. every warp reduces its values with warpReduceSum and lane 0 stores the
 *      partial sum in shared memory (one slot per warp, HYPRE_WARP_SIZE slots);
 *   2. after a block barrier, the first warp reduces the stored partial sums.
 *
 * Only threadIdx.x / blockDim.x are used for indexing. The callers in this
 * file read the result from the thread with threadIdx.x == 0.
 * The number of partial sums read back is blockDim.x >> HYPRE_WARP_BITSHIFT
 * (rounded down) -- presumably blockDim.x is expected to be a multiple of the
 * warp size; verify against the launch configurations.
 * Contains __syncthreads(), so every thread of the block must call it.
 * In host compilation the function returns "val" unchanged. */
template <typename T>
__inline__ __host__ __device__
T blockReduceSum(T val)
{
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
   // Shared mem for HYPRE_WARP_SIZE partial sums
   __shared__ T shared[HYPRE_WARP_SIZE];

   // lane index within the warp and warp index within the block (x-direction only)
   HYPRE_Int lane = threadIdx.x & (warpSize - 1);
   HYPRE_Int wid  = threadIdx.x >> HYPRE_WARP_BITSHIFT;

   // Each warp performs partial reduction
   val = warpReduceSum(val);

   // Write reduced value to shared memory
   if (lane == 0)
   {
      shared[wid] = val;
   }

   // Wait for all partial reductions
   __syncthreads();

   // read from shared memory only if that warp existed
   if (threadIdx.x < (blockDim.x >> HYPRE_WARP_BITSHIFT))
   {
      val = shared[lane];
   }
   else
   {
      // T must be assignable from a floating-point scalar (see HYPRE_Real2/4/6::operator=)
      val = 0.0;
   }

   // Final reduce within first warp
   if (wid == 0)
   {
      val = warpReduceSum(val);
   }

#endif
   return val;
}

/* Single-block reduction kernel: sums arr[0 .. N-1] and stores the result in
 * arr[0].
 *
 * Each thread contributes arr[threadIdx.x] when threadIdx.x < N and zero
 * otherwise, so N must not exceed the number of threads in the block; the
 * caller in this file launches it with one block. "item" is unused. */
template<typename T>
__global__ void
OneBlockReduceKernel(hypre_DeviceItem &item,
                     T                *arr,
                     HYPRE_Int         N)
{
#if !defined(HYPRE_USING_SYCL)
   HYPRE_UNUSED_VAR(item);
#endif

   const unsigned int tid = threadIdx.x;

   /* T only needs to be assignable from a floating-point scalar */
   T partial;
   partial = 0.0;

   if (tid < (hypre_uint) N)
   {
      partial = arr[tid];
   }

   partial = blockReduceSum(partial);

   if (tid == 0)
   {
      arr[0] = partial;
   }
}

/* Reducer class
 *
 * Two-round sum reduction:
 *   round 1 - device threads accumulate into their private copy through
 *             operator+=, then BlockReduce() stores one partial sum per block
 *             in d_buf[blockIdx.x];
 *   round 2 - converting the reducer to T launches OneBlockReduceKernel on the
 *             partial sums and adds the initial value.
 *
 * T must be assignable from a floating-point scalar and support "+=". */
template <typename T>
struct ReduceSum
{
   using value_type = T;

   T init;                    /* initial value passed in */
   mutable T __thread_sum;    /* place to hold local sum of a thread,
                                 and partial sum of a block */
   T *d_buf;                  /* place to store partial sum within blocks
                                 in the 1st round, used in the 2nd round */
   HYPRE_Int nblocks;         /* number of blocks used in the 1st round */

   /* constructor
    * val is the initial value (added to the reduced sum) */
   __host__
   ReduceSum(T val)
   {
      init = val;
      __thread_sum = 0.0;
      /* d_buf is read by the copy constructor / assignment below, so it must
       * not be left indeterminate; Allocate2ndPhaseBuffer() sets the real buffer */
      d_buf = NULL;
      nblocks = -1;
   }

   /* copy constructor */
   __host__ __device__
   ReduceSum(const ReduceSum<T>& other)
   {
      *this = other;
   }

   /* copy assignment operator */
   __host__ __device__
   ReduceSum<T>& operator=(const ReduceSum<T>& other)
   {
      if (this != &other)
      {
         init = other.init;
         __thread_sum = other.__thread_sum;
         d_buf = other.d_buf;
         nblocks = other.nblocks;
      }
      return *this;
   }

   /* Point d_buf at the reduce buffer stored in the hypre handle, allocating
    * that buffer on first use */
   __host__ void
   Allocate2ndPhaseBuffer()
   {
      if (hypre_HandleReduceBuffer(hypre_handle()) == NULL)
      {
         /* allocate for the max size for reducing HYPRE_Real6 type */
         hypre_HandleReduceBuffer(hypre_handle()) =
            hypre_TAlloc(HYPRE_Real6, HYPRE_MAX_NTHREADS_BLOCK, HYPRE_MEMORY_DEVICE);
      }

      d_buf = (T*) hypre_HandleReduceBuffer(hypre_handle());
   }

   /* reduction within blocks
    * (device code only; a no-op when compiled for the host) */
   __host__ __device__
   void BlockReduce() const
   {
#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__)
      __thread_sum = blockReduceSum(__thread_sum);
      if (threadIdx.x == 0)
      {
         d_buf[blockIdx.x] = __thread_sum;
      }
#endif
   }

   /* accumulate into the calling thread's private sum; const because the
    * reducer is used through const copies and __thread_sum is mutable */
   __host__ __device__
   void operator+=(T val) const
   {
      __thread_sum += val;
   }

   /* invoke the 2nd reduction at the time want the sum from the reducer */
   __host__
   operator T()
   {
      T val;

      const HYPRE_MemoryLocation memory_location = hypre_HandleMemoryLocation(hypre_handle());
      const HYPRE_ExecutionPolicy exec_policy = hypre_GetExecPolicy1(memory_location);

      if (exec_policy == HYPRE_EXEC_HOST)
      {
         val = __thread_sum;
         val += init;
      }
      else
      {
         /* 2nd reduction with only *one* block */
         hypre_assert(nblocks >= 0 && nblocks <= HYPRE_MAX_NTHREADS_BLOCK);
         const dim3 gDim(1), bDim(HYPRE_MAX_NTHREADS_BLOCK);
         HYPRE_GPU_LAUNCH( OneBlockReduceKernel, gDim, bDim, d_buf, nblocks );
         hypre_TMemcpy(&val, d_buf, T, 1, HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);
         val += init;
      }

      return val;
   }

   /* destructor
    * (declared with the injected class name: "~ReduceSum<T>()" is not valid
    * in C++20; d_buf is owned by the hypre handle and is not freed here) */
   __host__ __device__
   ~ReduceSum()
   {
   }
};

#endif /* #if !defined(HYPRE_USING_RAJA) && !defined(HYPRE_USING_KOKKOS) */
#endif /* #if defined(HYPRE_USING_CUDA)  || defined(HYPRE_USING_HIP) */
#endif /* #ifndef HYPRE_CUDA_REDUCER_H */

#ifdef __cplusplus
}
#endif

#ifdef HYPRE_MIXED_PRECISION
/* The following is for user compiles and the order is important.  The first
 * header ensures that we do not change prototype names in user files or in the
 * second header file.  The second header contains all the prototypes needed by
 * users for mixed precision. */
#ifndef hypre_MP_BUILD
#include "_hypre_utilities_mup_undef.h"
#include "_hypre_utilities_mup.h"
#endif
#endif

#endif

