|
Template Numerical Library version\ main:16b9213d5
|
Internal namespace for CUDA/HIP backend support. More...
Classes | |
| struct | LaunchConfiguration |
| Holds the parameters necessary to launch a CUDA or HIP kernel (i.e. schedule it for execution on some stream of some device). More... | |
| struct | SharedMemory |
| struct | SharedMemory< T, 16 > |
| struct | SharedMemory< T, 32 > |
| struct | SharedMemory< T, 64 > |
| struct | SharedMemory< T, 8 > |
| class | Stream |
| class | StreamPool |
Typedefs | |
| using | error_t = cudaError_t |
| using | stream_t = cudaStream_t |
Functions | |
| template<typename Element, typename FillBuffer, typename PushBuffer> | |
| void | bufferedTransfer (std::size_t size, FillBuffer &fill, PushBuffer &push) |
| template<typename Element, typename FillBuffer> | |
| void | bufferedTransferToDevice (Element *destination, std::size_t size, FillBuffer &fill) |
| template<typename Element, typename PushBuffer> | |
| void | bufferedTransferToHost (const Element *source, std::size_t size, PushBuffer &push) |
| void | checkErrorCode (const char *file_name, int line, error_t error) |
| void | deviceSynchronize () |
| template<class T> | |
| void | funcSetCacheConfig (T *func, enum FuncCache cacheConfig) |
| int | getArchitectureMajor (int deviceNum) |
| int | getArchitectureMinor (int deviceNum) |
| int | getDevice () |
| Returns the ID of the active device. | |
| int | getDeviceCores (int deviceNum) |
| int | getDeviceCoresPerMultiprocessors (int deviceNum) |
| int | getDeviceCount () |
| Returns the number of devices available in the system. | |
| int | getDeviceMultiprocessors (int deviceNum) |
| std::string | getDeviceName (int deviceNum) |
| bool | getECCEnabled (int deviceNum) |
| std::size_t | getFreeGlobalMemory () |
| __device__ std::size_t | getGlobalBlockIdx_x (const dim3 &gridIdx) |
| __device__ std::size_t | getGlobalBlockIdx_y (const dim3 &gridIdx) |
| __device__ std::size_t | getGlobalBlockIdx_z (const dim3 &gridIdx) |
| std::size_t | getGlobalMemorySize (int deviceNum) |
| __device__ std::size_t | getGlobalThreadIdx_x (const dim3 &gridIdx) |
| __device__ std::size_t | getGlobalThreadIdx_y (const dim3 &gridIdx) |
| __device__ std::size_t | getGlobalThreadIdx_z (const dim3 &gridIdx) |
| template<typename Index> | |
| __device__ Index | getInterleaving (const Index index) |
| constexpr int | getMaxBlockXSize () |
| constexpr int | getMaxBlockYSize () |
| constexpr int | getMaxBlockZSize () |
| constexpr std::size_t | getMaxGridXSize () |
| constexpr std::size_t | getMaxGridYSize () |
| constexpr std::size_t | getMaxGridZSize () |
| constexpr int | getMaxThreadsPerBlock () |
| constexpr std::size_t | getMaxThreadsPerGrid () |
| constexpr int | getMaxWarpSize () |
| Returns the maximum possible warp/wavefront size for the current build. | |
| constexpr int | getMinWarpSize () |
| Returns the minimum possible warp/wavefront size across all platforms. | |
| template<typename Index> | |
| Index | getNumberOfBlocks (const Index threads, const unsigned int blockSize) |
| template<typename Index> | |
| Index | getNumberOfGrids (const Index blocks, const unsigned int gridSize) |
| constexpr int | getNumberOfSharedMemoryBanks () |
| int | getRegistersPerMultiprocessor (int deviceNum) |
| template<typename T> | |
| __device__ T * | getSharedMemory () |
| std::size_t | getSharedMemoryPerBlock (int deviceNum) |
| constexpr std::size_t | getTransferBufferSize () |
| constexpr auto | getWarpFullMask () |
| Returns the full mask for warp shuffle operations. | |
| constexpr int | getWarpSize () |
| int | getWarpSize (int deviceId) |
| Runtime query for the warp/wavefront size of a specific device. | |
| template<typename RawKernel, typename... KernelParameters> | |
| void | launchKernel (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters) |
| template<typename RawKernel, typename... KernelParameters> | |
| void | launchKernelAsync (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters) |
| template<typename RawKernel, typename... KernelParameters> | |
| void | launchKernelSync (RawKernel kernel_function, LaunchConfiguration launch_configuration, KernelParameters &&... parameters) |
| template<class T> | |
| __device__ T | ldg (const T &value) |
| Loads data from a global memory using the __ldg() intrinsic. | |
| void | memcpy (void *dst, const void *src, std::size_t sizeBytes, MemcpyKind kind) |
| std::ostream & | operator<< (std::ostream &str, const dim3 &d) |
| void | printThreadsSetup (const dim3 &blockSize, const dim3 &blocksCount, const dim3 &gridSize, const dim3 &gridsCount, std::ostream &str=std::cout) |
| void | setDevice (int device) |
| Sets the active device. | |
| void | setupGrid (const dim3 &blocksCount, const dim3 &gridsCount, const dim3 &gridIdx, dim3 &gridSize) |
| void | setupThreads (const dim3 &blockSize, dim3 &blocksCount, dim3 &gridsCount, long long int xThreads, long long int yThreads=0, long long int zThreads=0) |
| stream_t | streamCreateWithPriority (unsigned int flags, int priority) |
| void | streamDestroy (stream_t stream) |
| void | streamSynchronize (stream_t stream) |
Internal namespace for CUDA/HIP backend support.
| void TNL::Backend::bufferedTransferToDevice | ( | Element * | destination, |
| std::size_t | size, | ||
| FillBuffer & | fill ) |
This function creates a buffer on the host, the fill handler fills it with data and this function transfers data from the buffer to the destination, which is a pointer to device memory.
| void TNL::Backend::bufferedTransferToHost | ( | const Element * | source, |
| std::size_t | size, | ||
| PushBuffer & | push ) |
This function creates a buffer on the host, fills it with data transferred from source, which is a pointer to device memory, and the push handler processes the data in the buffer.
|
constexpr |
Returns the maximum possible warp/wavefront size for the current build.
Unlike getWarpSize(), this is consistent between host and device compilation:
Use this for compile-time guards that must be consistent on the host side, e.g. deciding whether to instantiate TPS=64 kernel variants.
|
constexpr |
Returns the minimum possible warp/wavefront size across all platforms.
Always returns 32. Use this for compile-time checks that must allow configs valid on any architecture, e.g. column-major SlicedEllpack launch configs where TPS * SliceSize >= getMinWarpSize() ensures the group covers at least one warp on the smallest-warp-size device.
|
nodiscardconstexpr |
Returns the full mask for warp shuffle operations.
HIP shfl intrinsics require a 64-bit mask regardless of the wavefront size (unused upper bits are zero for wave32). CUDA uses a 32-bit mask.
Warning: this function relies on __GFX8__/__GFX9__ arch macros which are defined only during device compilation. On the host side, it falls through to the wavefront=32 mask. Do not call from host code — use only in __device__ functions or inside #if defined(__CUDACC__) || defined(__HIP__) guards.
|
constexpr |
The warpSize variable is of type int and contains the warp size (in threads) for the target device. This should be used only from device code in order to develop portable wave-aware code.
Note that NVIDIA devices return 32; AMD devices return 64 for gfx8/gfx9 and 32 for gfx10 and above.
Warning: the returned value may be inconsistent when used from a host-code context on HIP, because __GFX8__/__GFX9__ are defined only during device compilation. On the host side, the function falls through to return 32. Use getMaxWarpSize() for compile-time host-side guards, or the runtime overload getWarpSize(int deviceId) for host-side dispatch decisions. See https://clang.llvm.org/docs/HIPSupport.html#predefined-macros for details.
https://rocm.docs.amd.com/projects/HIP/en/latest/reference/kernel_language.html#warpsize
|
inlinenodiscard |
Runtime query for the warp/wavefront size of a specific device.
Use this for host-side launch configuration decisions (TPS selection, etc.). For CUDA, always returns 32. For HIP, queries the runtime API.
| deviceId | The device ID to query (default: current device). |