Template Numerical Library version\ main:16b9213d5
Loading...
Searching...
No Matches
TNL::Backend Namespace Reference

Internal namespace for CUDA/HIP backend support. More...

Classes

struct  LaunchConfiguration
 Holds the parameters necessary to launch a CUDA or HIP kernel (i.e. schedule it for execution on some stream of some device). More...
struct  SharedMemory
struct  SharedMemory< T, 16 >
struct  SharedMemory< T, 32 >
struct  SharedMemory< T, 64 >
struct  SharedMemory< T, 8 >
class  Stream
class  StreamPool

Typedefs

using error_t = cudaError_t
using stream_t = cudaStream_t

Enumerations

enum  { StreamDefault = cudaStreamDefault , StreamNonBlocking = cudaStreamNonBlocking }
enum  FuncCache { FuncCachePreferNone = cudaFuncCachePreferNone , FuncCachePreferShared = cudaFuncCachePreferShared , FuncCachePreferL1 = cudaFuncCachePreferL1 , FuncCachePreferEqual = cudaFuncCachePreferEqual }
enum  MemcpyKind {
  MemcpyHostToHost = cudaMemcpyHostToHost , MemcpyHostToDevice = cudaMemcpyHostToDevice , MemcpyDeviceToHost = cudaMemcpyDeviceToHost , MemcpyDeviceToDevice = cudaMemcpyDeviceToDevice ,
  MemcpyDefault = cudaMemcpyDefault
}

Functions

template<typename Element, typename FillBuffer, typename PushBuffer>
void bufferedTransfer (std::size_t size, FillBuffer &fill, PushBuffer &push)
template<typename Element, typename FillBuffer>
void bufferedTransferToDevice (Element *destination, std::size_t size, FillBuffer &fill)
template<typename Element, typename PushBuffer>
void bufferedTransferToHost (const Element *source, std::size_t size, PushBuffer &push)
void checkErrorCode (const char *file_name, int line, error_t error)
void deviceSynchronize ()
template<class T>
void funcSetCacheConfig (T *func, enum FuncCache cacheConfig)
int getArchitectureMajor (int deviceNum)
int getArchitectureMinor (int deviceNum)
int getDevice ()
 Returns the ID of the active device.
int getDeviceCores (int deviceNum)
int getDeviceCoresPerMultiprocessors (int deviceNum)
int getDeviceCount ()
 Returns the number of devices available in the system.
int getDeviceMultiprocessors (int deviceNum)
std::string getDeviceName (int deviceNum)
bool getECCEnabled (int deviceNum)
std::size_t getFreeGlobalMemory ()
__device__ std::size_t getGlobalBlockIdx_x (const dim3 &gridIdx)
__device__ std::size_t getGlobalBlockIdx_y (const dim3 &gridIdx)
__device__ std::size_t getGlobalBlockIdx_z (const dim3 &gridIdx)
std::size_t getGlobalMemorySize (int deviceNum)
__device__ std::size_t getGlobalThreadIdx_x (const dim3 &gridIdx)
__device__ std::size_t getGlobalThreadIdx_y (const dim3 &gridIdx)
__device__ std::size_t getGlobalThreadIdx_z (const dim3 &gridIdx)
template<typename Index>
__device__ Index getInterleaving (const Index index)
constexpr int getMaxBlockXSize ()
constexpr int getMaxBlockYSize ()
constexpr int getMaxBlockZSize ()
constexpr std::size_t getMaxGridXSize ()
constexpr std::size_t getMaxGridYSize ()
constexpr std::size_t getMaxGridZSize ()
constexpr int getMaxThreadsPerBlock ()
constexpr std::size_t getMaxThreadsPerGrid ()
constexpr int getMaxWarpSize ()
 Returns the maximum possible warp/wavefront size for the current build.
constexpr int getMinWarpSize ()
 Returns the minimum possible warp/wavefront size across all platforms.
template<typename Index>
Index getNumberOfBlocks (const Index threads, const unsigned int blockSize)
template<typename Index>
Index getNumberOfGrids (const Index blocks, const unsigned int gridSize)
constexpr int getNumberOfSharedMemoryBanks ()
int getRegistersPerMultiprocessor (int deviceNum)
template<typename T>
__device__ T * getSharedMemory ()
std::size_t getSharedMemoryPerBlock (int deviceNum)
constexpr std::size_t getTransferBufferSize ()
constexpr auto getWarpFullMask ()
 Returns the full mask for warp shuffle operations.
constexpr int getWarpSize ()
int getWarpSize (int deviceId)
 Runtime query for the warp/wavefront size of a specific device.
template<typename RawKernel, typename... KernelParameters>
void launchKernel (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters)
template<typename RawKernel, typename... KernelParameters>
void launchKernelAsync (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters)
template<typename RawKernel, typename... KernelParameters>
void launchKernelSync (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters)
template<class T>
__device__ T ldg (const T &value)
 Loads data from a global memory using the __ldg() intrinsic.
void memcpy (void *dst, const void *src, std::size_t sizeBytes, MemcpyKind kind)
std::ostreamoperator<< (std::ostream &str, const dim3 &d)
void printThreadsSetup (const dim3 &blockSize, const dim3 &blocksCount, const dim3 &gridSize, const dim3 &gridsCount, std::ostream &str=std::cout)
void setDevice (int device)
 Sets the active device.
void setupGrid (const dim3 &blocksCount, const dim3 &gridsCount, const dim3 &gridIdx, dim3 &gridSize)
void setupThreads (const dim3 &blockSize, dim3 &blocksCount, dim3 &gridsCount, long long int xThreads, long long int yThreads=0, long long int zThreads=0)
stream_t streamCreateWithPriority (unsigned int flags, int priority)
void streamDestroy (stream_t stream)
void streamSynchronize (stream_t stream)

Detailed Description

Internal namespace for CUDA/HIP backend support.

Function Documentation

◆ bufferedTransferToDevice()

template<typename Element, typename FillBuffer>
void TNL::Backend::bufferedTransferToDevice ( Element * destination,
std::size_t size,
FillBuffer & fill )

This function creates a buffer on the host, the fill handler fills it with data and this function transfers data from the buffer to the destination, which is a pointer to device memory.

◆ bufferedTransferToHost()

template<typename Element, typename PushBuffer>
void TNL::Backend::bufferedTransferToHost ( const Element * source,
std::size_t size,
PushBuffer & push )

This function creates a buffer on the host, fills it with data transferred from source, which is a pointer to device memory, and the push handler processes the data in the buffer.

◆ getMaxWarpSize()

int TNL::Backend::getMaxWarpSize ( )
constexpr

Returns the maximum possible warp/wavefront size for the current build.

Unlike getWarpSize(), this is consistent between host and device compilation:

  • CUDA: always 32
  • HIP: always 64 (the maximum across all AMD architectures)
  • Host-only: 32

Use this for compile-time guards that must be consistent on the host side, e.g. deciding whether to instantiate TPS=64 kernel variants.

◆ getMinWarpSize()

int TNL::Backend::getMinWarpSize ( )
constexpr

Returns the minimum possible warp/wavefront size across all platforms.

Always returns 32. Use this for compile-time checks that must allow configs valid on any architecture, e.g. column-major SlicedEllpack launch configs where TPS * SliceSize >= getMinWarpSize() ensures the group covers at least one warp on the smallest-warp-size device.

◆ getWarpFullMask()

auto TNL::Backend::getWarpFullMask ( )
nodiscardconstexpr

Returns the full mask for warp shuffle operations.

HIP shfl intrinsics require a 64-bit mask regardless of the wavefront size (unused upper bits are zero for wave32). CUDA uses a 32-bit mask.

Warning: this function relies on __GFX8__/__GFX9__ arch macros which are defined only during device compilation. On the host side, it falls through to the wavefront=32 mask. Do not call from host code — use only in __device__ functions or inside #if defined(__CUDACC__) || defined(__HIP__) guards.

◆ getWarpSize() [1/2]

int TNL::Backend::getWarpSize ( )
constexpr

The warpSize variable is of type int and contains the warp size (in threads) for the target device. This should be used only from device code in order to develop portable wave-aware code.

Note that NVIDIA devices return 32; AMD devices return 64 for gfx8/gfx9 and 32 for gfx10 and above.

Warning: the returned value may be inconsistent when used from a host-code context on HIP, because __GFX8__/__GFX9__ are defined only during device compilation. On the host side, the function falls through to return 32. Use getMaxWarpSize() for compile-time host-side guards, or the runtime overload getWarpSize(int deviceId) for host-side dispatch decisions. See https://clang.llvm.org/docs/HIPSupport.html#predefined-macros for details.

https://rocm.docs.amd.com/projects/HIP/en/latest/reference/kernel_language.html#warpsize

◆ getWarpSize() [2/2]

int TNL::Backend::getWarpSize ( int deviceId)
inlinenodiscard

Runtime query for the warp/wavefront size of a specific device.

Use this for host-side launch configuration decisions (TPS selection, etc.). For CUDA, always returns 32. For HIP, queries the runtime API.

Parameters
deviceIdThe device ID to query (default: current device).