[LANG] Added support for device functions (#484)
This commit is contained in:
@@ -224,6 +224,7 @@ struct scanline_layout: public distributed_layout {
|
||||
int nts(size_t k) { return nts_.at(k); }
|
||||
int contig_per_thread(size_t k) { return nts_.at(k); }
|
||||
|
||||
int per_thread(size_t k) { return nts(k) * shape_[k] / shape_per_cta(k);}
|
||||
public:
|
||||
// micro tile size. The size of a tile held by a thread block.
|
||||
std::vector<int> mts_;
|
||||
|
@@ -24,6 +24,7 @@ namespace llvm{
|
||||
class IRBuilder;
|
||||
class ArrayType;
|
||||
class Function;
|
||||
class StructType;
|
||||
}
|
||||
|
||||
namespace triton{
|
||||
@@ -114,6 +115,8 @@ private:
|
||||
private:
|
||||
Type *cvt(ir::type *ty);
|
||||
llvm::Attribute cvt(ir::attribute attr);
|
||||
llvm::StructType* packed_type(ir::value* i);
|
||||
void forward_declare(ir::function* fn);
|
||||
|
||||
public:
|
||||
generator(analysis::axes *a_axes,
|
||||
@@ -125,6 +128,8 @@ public:
|
||||
unsigned num_warps);
|
||||
|
||||
void visit_value(ir::value* v);
|
||||
void visit_call_inst(ir::call_inst*);
|
||||
void visit_launch_inst(ir::launch_inst *);
|
||||
void visit_phi_node(ir::phi_node*);
|
||||
void visit_binary_operator(ir::binary_operator*);
|
||||
void visit_getelementptr_inst(ir::getelementptr_inst*);
|
||||
@@ -148,6 +153,8 @@ public:
|
||||
void visit_unmasked_store_inst(ir::unmasked_store_inst*);
|
||||
void visit_masked_store_inst(ir::masked_store_inst*);
|
||||
void visit_cat_inst(ir::cat_inst*);
|
||||
void visit_extract_value_inst(ir::extract_value_inst *);
|
||||
void visit_insert_value_inst(ir::insert_value_inst *);
|
||||
void visit_reshape_inst(ir::reshape_inst*);
|
||||
void visit_splat_inst(ir::splat_inst*);
|
||||
void visit_broadcast_inst(ir::broadcast_inst*);
|
||||
@@ -242,6 +249,7 @@ private:
|
||||
/// triton bb -> llvm bb
|
||||
std::map<ir::value*, BasicBlock *> bbs_;
|
||||
std::map<ir::value*, std::vector<int>> ords_;
|
||||
std::map<ir::value*, Function*> fns_;
|
||||
|
||||
// helper for creating llvm values
|
||||
adder add;
|
||||
|
31
include/triton/codegen/transform/inline.h
Normal file
31
include/triton/codegen/transform/inline.h
Normal file
@@ -0,0 +1,31 @@
|
||||
#pragma once
|
||||
|
||||
#include <list>
|
||||
|
||||
namespace triton {
|
||||
|
||||
namespace ir {
|
||||
class module;
|
||||
class function;
|
||||
class call_inst;
|
||||
class builder;
|
||||
}
|
||||
|
||||
namespace codegen{
|
||||
namespace transform{
|
||||
|
||||
struct fncmp {
|
||||
bool operator()(ir::function* x, ir::function* y) const;
|
||||
};
|
||||
|
||||
class inliner {
|
||||
public:
|
||||
inliner() {}
|
||||
void do_inline(ir::function* fn, ir::call_inst* callsite, ir::builder& builder, std::list<ir::call_inst*>& callsites);
|
||||
void run(ir::module &mod);
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}
|
@@ -30,6 +30,9 @@ private:
|
||||
bool rewrite_dot_hmma(ir::dot_inst *dot, ir::builder& builder, bool trans_a, bool trans_b, ir::value *A, ir::value *B, ir::value *D);
|
||||
bool rewrite_dot(ir::instruction *value, ir::builder& builder);
|
||||
bool rewrite_mult(ir::instruction *value, ir::builder& builder);
|
||||
bool rewrite_insert_extract(ir::instruction *value, ir::builder& builder);
|
||||
|
||||
|
||||
bool rewrite_unit_red(ir::instruction *value, ir::builder& builder);
|
||||
bool rewrite_gep_ptr_min_off_plus_off(ir::instruction *value, ir::builder& builder);
|
||||
bool rewrite_select_masked_load(ir::instruction *value, ir::builder& builder);
|
||||
|
@@ -89,6 +89,7 @@ public:
|
||||
static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
|
||||
static CUresult cuDeviceGetCount(int *count);
|
||||
// link management
|
||||
static CUresult cuLinkAddFile_v2(CUlinkState state, CUjitInputType type, const char *path, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
||||
static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues);
|
||||
static CUresult cuLinkCreate_v2(unsigned int numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut);
|
||||
static CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut);
|
||||
@@ -214,6 +215,7 @@ private:
|
||||
static void* cuDeviceGetAttribute_;
|
||||
static void* cuDeviceGetCount_;
|
||||
// link management
|
||||
static void* cuLinkAddFile_v2_;
|
||||
static void* cuLinkAddData_v2_;
|
||||
static void* cuLinkCreate_v2_;
|
||||
static void* cuLinkDestroy_;
|
||||
|
244
include/triton/external/CUDA/cuda.h
vendored
Executable file → Normal file
244
include/triton/external/CUDA/cuda.h
vendored
Executable file → Normal file
@@ -224,7 +224,7 @@ typedef uint64_t cuuint64_t;
|
||||
/**
|
||||
* CUDA API version number
|
||||
*/
|
||||
#define CUDA_VERSION 11050
|
||||
#define CUDA_VERSION 11040
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@@ -496,33 +496,7 @@ typedef enum CUarray_format_enum {
|
||||
CU_AD_FORMAT_SIGNED_INT32 = 0x0a, /**< Signed 32-bit integers */
|
||||
CU_AD_FORMAT_HALF = 0x10, /**< 16-bit floating point */
|
||||
CU_AD_FORMAT_FLOAT = 0x20, /**< 32-bit floating point */
|
||||
CU_AD_FORMAT_NV12 = 0xb0, /**< 8-bit YUV planar format, with 4:2:0 sampling */
|
||||
CU_AD_FORMAT_UNORM_INT8X1 = 0xc0, /**< 1 channel unsigned 8-bit normalized integer */
|
||||
CU_AD_FORMAT_UNORM_INT8X2 = 0xc1, /**< 2 channel unsigned 8-bit normalized integer */
|
||||
CU_AD_FORMAT_UNORM_INT8X4 = 0xc2, /**< 4 channel unsigned 8-bit normalized integer */
|
||||
CU_AD_FORMAT_UNORM_INT16X1 = 0xc3, /**< 1 channel unsigned 16-bit normalized integer */
|
||||
CU_AD_FORMAT_UNORM_INT16X2 = 0xc4, /**< 2 channel unsigned 16-bit normalized integer */
|
||||
CU_AD_FORMAT_UNORM_INT16X4 = 0xc5, /**< 4 channel unsigned 16-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT8X1 = 0xc6, /**< 1 channel signed 8-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT8X2 = 0xc7, /**< 2 channel signed 8-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT8X4 = 0xc8, /**< 4 channel signed 8-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT16X1 = 0xc9, /**< 1 channel signed 16-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT16X2 = 0xca, /**< 2 channel signed 16-bit normalized integer */
|
||||
CU_AD_FORMAT_SNORM_INT16X4 = 0xcb, /**< 4 channel signed 16-bit normalized integer */
|
||||
CU_AD_FORMAT_BC1_UNORM = 0x91, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
|
||||
CU_AD_FORMAT_BC1_UNORM_SRGB = 0x92, /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
|
||||
CU_AD_FORMAT_BC2_UNORM = 0x93, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
|
||||
CU_AD_FORMAT_BC2_UNORM_SRGB = 0x94, /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding*/
|
||||
CU_AD_FORMAT_BC3_UNORM = 0x95, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
|
||||
CU_AD_FORMAT_BC3_UNORM_SRGB = 0x96, /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding*/
|
||||
CU_AD_FORMAT_BC4_UNORM = 0x97, /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
|
||||
CU_AD_FORMAT_BC4_SNORM = 0x98, /**< 1 channel signed normalized block-compressed (BC4 compression) format */
|
||||
CU_AD_FORMAT_BC5_UNORM = 0x99, /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
|
||||
CU_AD_FORMAT_BC5_SNORM = 0x9a, /**< 2 channel signed normalized block-compressed (BC5 compression) format */
|
||||
CU_AD_FORMAT_BC6H_UF16 = 0x9b, /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
|
||||
CU_AD_FORMAT_BC6H_SF16 = 0x9c, /**< 3 channel signed half-float block-compressed (BC6H compression) format */
|
||||
CU_AD_FORMAT_BC7_UNORM = 0x9d, /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
|
||||
CU_AD_FORMAT_BC7_UNORM_SRGB = 0x9e /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
|
||||
CU_AD_FORMAT_NV12 = 0xb0
|
||||
} CUarray_format;
|
||||
|
||||
/**
|
||||
@@ -657,7 +631,7 @@ typedef enum CUdevice_attribute_enum {
|
||||
CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED = 102, /**< Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs */
|
||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED = 103, /**< Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED = 104, /**< Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate */
|
||||
CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED = 105, /**< Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested ::cuMemCreate */
|
||||
CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR = 106, /**< Maximum number of blocks per multiprocessor */
|
||||
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED = 107, /**< Device supports compression of memory */
|
||||
CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */
|
||||
@@ -665,7 +639,7 @@ typedef enum CUdevice_attribute_enum {
|
||||
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED = 110, /**< Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate */
|
||||
CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK = 111, /**< Shared memory reserved by CUDA driver per block in bytes */
|
||||
CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
|
||||
CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
|
||||
CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED = 113, /**< Device supports using the ::cuMemHostRegister flag CU_MEMHOSTERGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU */
|
||||
CU_DEVICE_ATTRIBUTE_TIMELINE_SEMAPHORE_INTEROP_SUPPORTED = 114, /**< External timeline semaphore interop is supported on the device */
|
||||
CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED = 115, /**< Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs */
|
||||
CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
|
||||
@@ -1650,8 +1624,7 @@ typedef enum cudaError_enum {
|
||||
CUDA_ERROR_UNSUPPORTED_EXEC_AFFINITY = 224,
|
||||
|
||||
/**
|
||||
* This indicates that the device kernel source is invalid. This includes
|
||||
* compilation/linker errors encountered in device code or user error.
|
||||
* This indicates that the device kernel source is invalid.
|
||||
*/
|
||||
CUDA_ERROR_INVALID_SOURCE = 300,
|
||||
|
||||
@@ -2068,9 +2041,9 @@ typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
|
||||
* On Windows the flag is a no-op.
|
||||
* On Linux that memory is marked as non cache-coherent for the GPU and
|
||||
* is expected to be physically contiguous. It may return
|
||||
* ::CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
|
||||
* ::CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
|
||||
* On all other platforms, it is not supported and ::CUDA_ERROR_NOT_SUPPORTED
|
||||
* CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user,
|
||||
* CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions.
|
||||
* On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED
|
||||
* is returned.
|
||||
* Flag for ::cuMemHostRegister()
|
||||
*/
|
||||
@@ -2079,12 +2052,12 @@ typedef size_t (CUDA_CB *CUoccupancyB2DSize)(int blockSize);
|
||||
/**
|
||||
* If set, the passed memory pointer is treated as pointing to memory that is
|
||||
* considered read-only by the device. On platforms without
|
||||
* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
|
||||
* CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
|
||||
* required in order to register memory mapped to the CPU as read-only. Support
|
||||
* for the use of this flag can be queried from the device attribute
|
||||
* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
|
||||
* CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
|
||||
* a current context associated with a device that does not have this attribute
|
||||
* set will cause ::cuMemHostRegister to error with ::CUDA_ERROR_NOT_SUPPORTED.
|
||||
* set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
|
||||
*/
|
||||
#define CU_MEMHOSTREGISTER_READ_ONLY 0x08
|
||||
|
||||
@@ -3735,117 +3708,117 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
|
||||
* \p dev. The supported attributes are:
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: Maximum number of threads per
|
||||
* block;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: Maximum x-dimension of a block;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: Maximum y-dimension of a block;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: Maximum z-dimension of a block;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: Maximum x-dimension of a grid;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: Maximum y-dimension of a grid;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: Maximum z-dimension of a grid;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK: Maximum amount of
|
||||
* shared memory available to a thread block in bytes
|
||||
* shared memory available to a thread block in bytes;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: Memory available on device for
|
||||
* __constant__ variables in a CUDA C kernel in bytes
|
||||
* - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads
|
||||
* __constant__ variables in a CUDA C kernel in bytes;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_WARP_SIZE: Warp size in threads;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_PITCH: Maximum pitch in bytes allowed by the
|
||||
* memory copy functions that involve memory regions allocated through
|
||||
* ::cuMemAllocPitch()
|
||||
* ::cuMemAllocPitch();
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH: Maximum 1D
|
||||
* texture width
|
||||
* texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH: Maximum width
|
||||
* for a 1D texture bound to linear memory
|
||||
* for a 1D texture bound to linear memory;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH: Maximum
|
||||
* mipmapped 1D texture width
|
||||
* mipmapped 1D texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH: Maximum 2D
|
||||
* texture width
|
||||
* texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT: Maximum 2D
|
||||
* texture height
|
||||
* texture height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH: Maximum width
|
||||
* for a 2D texture bound to linear memory
|
||||
* for a 2D texture bound to linear memory;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT: Maximum height
|
||||
* for a 2D texture bound to linear memory
|
||||
* for a 2D texture bound to linear memory;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH: Maximum pitch
|
||||
* in bytes for a 2D texture bound to linear memory
|
||||
* in bytes for a 2D texture bound to linear memory;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH: Maximum
|
||||
* mipmapped 2D texture width
|
||||
* mipmapped 2D texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT: Maximum
|
||||
* mipmapped 2D texture height
|
||||
* mipmapped 2D texture height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH: Maximum 3D
|
||||
* texture width
|
||||
* texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT: Maximum 3D
|
||||
* texture height
|
||||
* texture height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH: Maximum 3D
|
||||
* texture depth
|
||||
* texture depth;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE:
|
||||
* Alternate maximum 3D texture width, 0 if no alternate
|
||||
* maximum 3D texture size is supported
|
||||
* maximum 3D texture size is supported;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE:
|
||||
* Alternate maximum 3D texture height, 0 if no alternate
|
||||
* maximum 3D texture size is supported
|
||||
* maximum 3D texture size is supported;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE:
|
||||
* Alternate maximum 3D texture depth, 0 if no alternate
|
||||
* maximum 3D texture size is supported
|
||||
* maximum 3D texture size is supported;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH:
|
||||
* Maximum cubemap texture width or height
|
||||
* Maximum cubemap texture width or height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH:
|
||||
* Maximum 1D layered texture width
|
||||
* Maximum 1D layered texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS:
|
||||
* Maximum layers in a 1D layered texture
|
||||
* Maximum layers in a 1D layered texture;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH:
|
||||
* Maximum 2D layered texture width
|
||||
* Maximum 2D layered texture width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT:
|
||||
* Maximum 2D layered texture height
|
||||
* Maximum 2D layered texture height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS:
|
||||
* Maximum layers in a 2D layered texture
|
||||
* Maximum layers in a 2D layered texture;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH:
|
||||
* Maximum cubemap layered texture width or height
|
||||
* Maximum cubemap layered texture width or height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS:
|
||||
* Maximum layers in a cubemap layered texture
|
||||
* Maximum layers in a cubemap layered texture;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH:
|
||||
* Maximum 1D surface width
|
||||
* Maximum 1D surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH:
|
||||
* Maximum 2D surface width
|
||||
* Maximum 2D surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT:
|
||||
* Maximum 2D surface height
|
||||
* Maximum 2D surface height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH:
|
||||
* Maximum 3D surface width
|
||||
* Maximum 3D surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT:
|
||||
* Maximum 3D surface height
|
||||
* Maximum 3D surface height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH:
|
||||
* Maximum 3D surface depth
|
||||
* Maximum 3D surface depth;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH:
|
||||
* Maximum 1D layered surface width
|
||||
* Maximum 1D layered surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS:
|
||||
* Maximum layers in a 1D layered surface
|
||||
* Maximum layers in a 1D layered surface;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH:
|
||||
* Maximum 2D layered surface width
|
||||
* Maximum 2D layered surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT:
|
||||
* Maximum 2D layered surface height
|
||||
* Maximum 2D layered surface height;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS:
|
||||
* Maximum layers in a 2D layered surface
|
||||
* Maximum layers in a 2D layered surface;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH:
|
||||
* Maximum cubemap surface width
|
||||
* Maximum cubemap surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH:
|
||||
* Maximum cubemap layered surface width
|
||||
* Maximum cubemap layered surface width;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS:
|
||||
* Maximum layers in a cubemap layered surface
|
||||
* Maximum layers in a cubemap layered surface;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK: Maximum number of 32-bit
|
||||
* registers available to a thread block
|
||||
* - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz
|
||||
* registers available to a thread block;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_CLOCK_RATE: The typical clock frequency in kilohertz;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT: Alignment requirement; texture
|
||||
* base addresses aligned to ::textureAlign bytes do not need an offset
|
||||
* applied to texture fetches
|
||||
* applied to texture fetches;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT: Pitch alignment requirement
|
||||
* for 2D texture references bound to pitched memory
|
||||
* for 2D texture references bound to pitched memory;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GPU_OVERLAP: 1 if the device can concurrently copy
|
||||
* memory between host and device while executing a kernel, or 0 if not
|
||||
* memory between host and device while executing a kernel, or 0 if not;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: Number of multiprocessors on
|
||||
* the device
|
||||
* the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT: 1 if there is a run time limit
|
||||
* for kernels executed on the device, or 0 if not
|
||||
* for kernels executed on the device, or 0 if not;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_INTEGRATED: 1 if the device is integrated with the
|
||||
* memory subsystem, or 0 if not
|
||||
* memory subsystem, or 0 if not;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY: 1 if the device can map host
|
||||
* memory into the CUDA address space, or 0 if not
|
||||
* memory into the CUDA address space, or 0 if not;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_COMPUTE_MODE: Compute mode that device is currently
|
||||
* in. Available modes are as follows:
|
||||
* - ::CU_COMPUTEMODE_DEFAULT: Default mode - Device is not restricted and
|
||||
@@ -3858,33 +3831,33 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
|
||||
* executing multiple kernels within the same context simultaneously, or 0 if
|
||||
* not. It is not guaranteed that multiple kernels will be resident
|
||||
* on the device concurrently so this feature should not be relied upon for
|
||||
* correctness.
|
||||
* correctness;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_ECC_ENABLED: 1 if error correction is enabled on the
|
||||
* device, 0 if error correction is disabled or not supported by the device
|
||||
* - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device
|
||||
* device, 0 if error correction is disabled or not supported by the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_PCI_BUS_ID: PCI bus identifier of the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID: PCI device (also known as slot) identifier
|
||||
* of the device
|
||||
* of the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID: PCI domain identifier of the device
|
||||
* - ::CU_DEVICE_ATTRIBUTE_TCC_DRIVER: 1 if the device is using a TCC driver. TCC
|
||||
* is only available on Tesla hardware running Windows Vista or later
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits
|
||||
* - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor
|
||||
* is only available on Tesla hardware running Windows Vista or later;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: Peak memory clock frequency in kilohertz;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: Global memory bus width in bits;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE: Size of L2 cache in bytes. 0 if the device doesn't have L2 cache;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR: Maximum resident threads per multiprocessor;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING: 1 if the device shares a unified address space with
|
||||
* the host, or 0 if not
|
||||
* - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number
|
||||
* - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number
|
||||
* the host, or 0 if not;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: Major compute capability version number;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: Minor compute capability version number;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED: 1 if device supports caching globals
|
||||
* in L1 cache, 0 if caching globals in L1 cache is not supported by the device
|
||||
* in L1 cache, 0 if caching globals in L1 cache is not supported by the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED: 1 if device supports caching locals
|
||||
* in L1 cache, 0 if caching locals in L1 cache is not supported by the device
|
||||
* in L1 cache, 0 if caching locals in L1 cache is not supported by the device;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR: Maximum amount of
|
||||
* shared memory available to a multiprocessor in bytes; this amount is shared
|
||||
* by all thread blocks simultaneously resident on a multiprocessor
|
||||
* by all thread blocks simultaneously resident on a multiprocessor;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR: Maximum number of 32-bit
|
||||
* registers available to a multiprocessor; this number is shared by all thread
|
||||
* blocks simultaneously resident on a multiprocessor
|
||||
* blocks simultaneously resident on a multiprocessor;
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY: 1 if device supports allocating managed memory
|
||||
* on this system, 0 if allocating managed memory is not supported by the device on this system.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD: 1 if device is on a multi-GPU board, 0 if not.
|
||||
@@ -3910,20 +3883,14 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
|
||||
* - ::CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED: Device supports virtual memory management APIs like ::cuMemAddressReserve, ::cuMemCreate, ::cuMemMap and related APIs
|
||||
* - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED: Device supports exporting memory to a posix file descriptor with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
|
||||
* - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 NT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
|
||||
* - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor
|
||||
* - ::CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED: Device supports exporting memory to a Win32 KMT handle with ::cuMemExportToShareableHandle, if requested via ::cuMemCreate
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR: Maximum number of thread blocks that can reside on a multiprocessor.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED: Device supports compressible memory allocation via ::cuMemCreate
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE: Maximum L2 persisting lines capacity setting in bytes
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MAX_ACCESS_POLICY_WINDOW_SIZE: Maximum value of CUaccessPolicyWindow::num_bytes
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WITH_CUDA_VMM_SUPPORTED: Device supports specifying the GPUDirect RDMA flag with ::cuMemCreate.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes
|
||||
* - ::CU_DEVICE_ATTRIBUTE_SPARSE_CUDA_ARRAY_SUPPORTED: Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag ::CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
|
||||
* - ::CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK: Amount of shared memory per block reserved by CUDA driver in bytes.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED: Device supports using the ::cuMemHostRegister flag CU_MEMHOSTREGISTER_READ_ONLY to register memory that must be mapped as read-only to the GPU
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED: Device supports using the ::cuMemAllocAsync and ::cuMemPool family of APIs
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_SUPPORTED: Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information)
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_FLUSH_WRITES_OPTIONS: The returned attribute shall be interpreted as a bitmask, where the individual bits are described by the ::CUflushGPUDirectRDMAWritesOptions enum
|
||||
* - ::CU_DEVICE_ATTRIBUTE_GPU_DIRECT_RDMA_WRITES_ORDERING: GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::CUGPUDirectRDMAWritesOrdering for the numerical values returned here.
|
||||
* - ::CU_DEVICE_ATTRIBUTE_MEMPOOL_SUPPORTED_HANDLE_TYPES: Bitmask of handle types supported with mempool based IPC
|
||||
*
|
||||
* \param pi - Returned device attribute value
|
||||
* \param attrib - Device attribute to query
|
||||
@@ -4690,13 +4657,6 @@ CUresult CUDAAPI cuCtxCreate_v3(CUcontext *pctx, CUexecAffinityParam *paramsArra
|
||||
* It is the responsibility of the calling function to ensure that no API
|
||||
* call issues using \p ctx while ::cuCtxDestroy() is executing.
|
||||
*
|
||||
* Destroys and cleans up all resources associated with the context.
|
||||
* It is the caller's responsibility to ensure that the context or its resources
|
||||
* are not accessed or passed in subsequent API calls and doing so will result in undefined behavior.
|
||||
* These resources include CUDA types such as ::CUmodule, ::CUfunction, ::CUstream, ::CUevent,
|
||||
* ::CUarray, ::CUmipmappedArray, ::CUtexObject, ::CUsurfObject, ::CUtexref, ::CUsurfref,
|
||||
* ::CUgraphicsResource, ::CUlinkState, ::CUexternalMemory and ::CUexternalSemaphore.
|
||||
*
|
||||
* If \p ctx is current to the calling thread then \p ctx will also be
|
||||
* popped from the current thread's context stack (as though ::cuCtxPopCurrent()
|
||||
* were called). If \p ctx is current to other threads, then \p ctx will
|
||||
@@ -5672,7 +5632,6 @@ CUresult CUDAAPI cuModuleLoadFatBinary(CUmodule *module, const void *fatCubin);
|
||||
* ::CUDA_ERROR_INVALID_CONTEXT,
|
||||
* ::CUDA_ERROR_INVALID_VALUE
|
||||
* \notefnerr
|
||||
* \note_destroy_ub
|
||||
*
|
||||
* \sa ::cuModuleGetFunction,
|
||||
* ::cuModuleGetGlobal,
|
||||
@@ -5993,9 +5952,8 @@ cuLinkDestroy(CUlinkState state);
|
||||
/**
|
||||
* \brief Gets free and total memory
|
||||
*
|
||||
* Returns in \p *total the total amount of memory available to the current context.
|
||||
* Returns in \p *free the amount of memory on the device that is free according to the OS.
|
||||
* CUDA is not guaranteed to be able to allocate all of the memory that the OS reports as free.
|
||||
* Returns in \p *free and \p *total respectively, the free and total amount of
|
||||
* memory available for allocation by the CUDA context, in bytes.
|
||||
*
|
||||
* \param free - Returned free memory in bytes
|
||||
* \param total - Returned total memory in bytes
|
||||
@@ -6839,10 +6797,10 @@ CUresult CUDAAPI cuIpcCloseMemHandle(CUdeviceptr dptr);
|
||||
*
|
||||
* - ::CU_MEMHOSTREGISTER_READ_ONLY: The pointer is treated as pointing to memory
|
||||
* that is considered read-only by the device. On platforms without
|
||||
* ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
|
||||
* CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES, this flag is
|
||||
* required in order to register memory mapped to the CPU as read-only. Support
|
||||
* for the use of this flag can be queried from the device attribute
|
||||
* ::CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
|
||||
* CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED. Using this flag with
|
||||
* a current context associated with a device that does not have this attribute
|
||||
* set will cause ::cuMemHostRegister to error with CUDA_ERROR_NOT_SUPPORTED.
|
||||
*
|
||||
@@ -8987,7 +8945,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
|
||||
* float16's:
|
||||
* \code
|
||||
CUDA_ARRAY_DESCRIPTOR desc;
|
||||
desc.Format = CU_AD_FORMAT_HALF;
|
||||
desc.FormatFlags = CU_AD_FORMAT_HALF;
|
||||
desc.NumChannels = 4;
|
||||
desc.Width = width;
|
||||
desc.Height = height;
|
||||
@@ -8997,7 +8955,7 @@ CUresult CUDAAPI cuMemsetD2D32Async(CUdeviceptr dstDevice, size_t dstPitch, unsi
|
||||
* of which is two 8-bit unsigned chars:
|
||||
* \code
|
||||
CUDA_ARRAY_DESCRIPTOR arrayDesc;
|
||||
desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
|
||||
desc.FormatFlags = CU_AD_FORMAT_UNSIGNED_INT8;
|
||||
desc.NumChannels = 2;
|
||||
desc.Width = width;
|
||||
desc.Height = height;
|
||||
@@ -9323,7 +9281,7 @@ CUresult CUDAAPI cuArrayDestroy(CUarray hArray);
|
||||
* 4x16-bit float16's:
|
||||
* \code
|
||||
CUDA_ARRAY3D_DESCRIPTOR desc;
|
||||
desc.Format = CU_AD_FORMAT_HALF;
|
||||
desc.FormatFlags = CU_AD_FORMAT_HALF;
|
||||
desc.NumChannels = 4;
|
||||
desc.Width = width;
|
||||
desc.Height = height;
|
||||
@@ -15180,7 +15138,7 @@ CUresult CUDAAPI cuGraphExternalSemaphoresWaitNodeSetParams(CUgraphNode hNode, c
|
||||
* \param nodeParams - Parameters for the node
|
||||
*
|
||||
* When ::cuGraphAddMemAllocNode creates an allocation node, it returns the address of the allocation in
|
||||
* \p nodeParams.dptr. The allocation's address remains fixed across instantiations and launches.
|
||||
* \param nodeParams.dptr. The allocation's address remains fixed across instantiations and launches.
|
||||
*
|
||||
* If the allocation is freed in the same graph, by creating a free node using ::cuGraphAddMemFreeNode,
|
||||
* the allocation can be accessed by nodes ordered after the allocation node but before the free node.
|
||||
@@ -15356,9 +15314,7 @@ CUresult CUDAAPI cuGraphMemFreeNodeGetParams(CUgraphNode hNode, CUdeviceptr *dpt
|
||||
*
|
||||
* \sa
|
||||
* ::cuGraphAddMemAllocNode,
|
||||
* ::cuGraphAddMemFreeNode,
|
||||
* ::cuDeviceSetGraphMemAttribute,
|
||||
* ::cuDeviceGetGraphMemAttribute
|
||||
* ::cuGraphAddMemFreeNode
|
||||
*/
|
||||
CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
|
||||
|
||||
@@ -15384,7 +15340,6 @@ CUresult CUDAAPI cuDeviceGraphMemTrim(CUdevice device);
|
||||
* ::CUDA_ERROR_INVALID_DEVICE
|
||||
*
|
||||
* \sa
|
||||
* ::cuDeviceSetGraphMemAttribute,
|
||||
* ::cuGraphAddMemAllocNode,
|
||||
* ::cuGraphAddMemFreeNode
|
||||
*/
|
||||
@@ -15409,7 +15364,6 @@ CUresult CUDAAPI cuDeviceGetGraphMemAttribute(CUdevice device, CUgraphMem_attrib
|
||||
* ::CUDA_ERROR_INVALID_DEVICE
|
||||
*
|
||||
* \sa
|
||||
* ::cuDeviceGetGraphMemAttribute,
|
||||
* ::cuGraphAddMemAllocNode,
|
||||
* ::cuGraphAddMemFreeNode
|
||||
*/
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#pragma once
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_IR_BASIC_BLOCK_H_
|
||||
#define _TRITON_IR_BASIC_BLOCK_H_
|
||||
@@ -27,7 +27,7 @@ public:
|
||||
|
||||
private:
|
||||
// constructors
|
||||
basic_block(context &ctx, const std::string &name, function *parent);
|
||||
basic_block(context &ctx, const std::string &name, function *parent, basic_block *next);
|
||||
|
||||
public:
|
||||
// accessors
|
||||
@@ -35,6 +35,7 @@ public:
|
||||
context& get_context() { return ctx_; }
|
||||
|
||||
// get iterator to first instruction that is not a phi
|
||||
void replace_phi_uses_with(basic_block* before, basic_block* after);
|
||||
iterator get_first_non_phi();
|
||||
|
||||
// get instruction list
|
||||
@@ -60,13 +61,16 @@ public:
|
||||
inline const instruction &back() const { return *inst_list_.back(); }
|
||||
inline instruction &back() { return *inst_list_.back(); }
|
||||
|
||||
void append_instruction(ir::instruction* i);
|
||||
// split
|
||||
basic_block* split_before(ir::instruction* loc, const std::string& name);
|
||||
|
||||
// predecessors
|
||||
const std::vector<basic_block*>& get_predecessors() const { return preds_; }
|
||||
const std::vector<basic_block*>& get_successors() const { return succs_; }
|
||||
void add_predecessor(basic_block* pred);
|
||||
std::vector<basic_block*> get_predecessors() const;
|
||||
std::vector<basic_block*> get_successors() const;
|
||||
|
||||
// factory functions
|
||||
static basic_block* create(context &ctx, const std::string &name, function *parent);
|
||||
static basic_block* create(context &ctx, const std::string &name, function *parent, basic_block *next = nullptr);
|
||||
|
||||
void print(std::ostream &os);
|
||||
|
||||
|
@@ -22,6 +22,7 @@ class phi_node;
|
||||
|
||||
/* Builder */
|
||||
class builder{
|
||||
public:
|
||||
typedef basic_block::iterator iterator;
|
||||
|
||||
public:
|
||||
@@ -75,6 +76,7 @@ public:
|
||||
value* create_br(basic_block *dest);
|
||||
value* create_cond_br(value *cond, basic_block* if_dest, basic_block* else_dest);
|
||||
value* create_ret_void();
|
||||
value* create_ret(value *ret);
|
||||
// Cast instructions
|
||||
value *create_cast(cast_op_t op, value *v, type *dst_ty);
|
||||
value* create_ptr_to_int(value *src, type *dst_ty);
|
||||
@@ -86,6 +88,9 @@ public:
|
||||
value* create_fp_trunc(value *src, type *dst_ty);
|
||||
value* create_int_cast(value *src, type *dst_ty, bool is_signed);
|
||||
value *create_downcast(value *arg);
|
||||
// Call instruction
|
||||
value* create_call(function* fn, const std::vector<value*>& args);
|
||||
value* create_launch(function* fn, const std::vector<value*>& args, const std::vector<value*>& grid, value* num_warps);
|
||||
// Phi instruction
|
||||
phi_node* create_phi(type *ty, unsigned num_reserved);
|
||||
// Binary instructions
|
||||
@@ -142,6 +147,9 @@ public:
|
||||
value *create_store(value *ptr, value *val);
|
||||
value *create_masked_load(value *arg, value *mask, value *false_value, load_inst::CACHE_MODIFIER cache, load_inst::EVICTION_POLICY eviction, bool is_volatile);
|
||||
value *create_masked_store(value *ptr, value *val, value *mask);
|
||||
// Struct instructions
|
||||
value *create_insert_value(value* val, value *elt, size_t idx);
|
||||
value *create_extract_value(value* val, size_t idx);
|
||||
// Block instruction
|
||||
value *create_splat(value *arg, const type::block_shapes_t &shapes);
|
||||
value *create_reshape(value *arg, const type::block_shapes_t &shapes);
|
||||
|
@@ -31,7 +31,8 @@ public:
|
||||
std::map<std::pair<type*, unsigned>, std::unique_ptr<pointer_type>> ptr_tys;
|
||||
// Block types
|
||||
std::map<std::pair<type*, type::block_shapes_t>, std::unique_ptr<block_type>> block_tys;
|
||||
|
||||
// Struct types
|
||||
std::map<type::contained_tys_vec_t, struct_type*> struct_tys;
|
||||
// Int constants
|
||||
std::map<std::pair<type*, uint64_t>, std::unique_ptr<constant_int>> int_constants_;
|
||||
// Float constants
|
||||
|
@@ -95,6 +95,9 @@ enum value_id_t: unsigned {
|
||||
INSTRUCTIONS
|
||||
* ------------ */
|
||||
INST_BEGIN,
|
||||
// call
|
||||
INST_CALL,
|
||||
INST_LAUNCH,
|
||||
// phi
|
||||
INST_PHI,
|
||||
// arithmetic
|
||||
@@ -129,6 +132,9 @@ enum value_id_t: unsigned {
|
||||
INST_MASKED_LOAD_ASYNC,
|
||||
INST_UNMASKED_STORE,
|
||||
INST_MASKED_STORE,
|
||||
// struct
|
||||
INST_EXTRACT_VALUE,
|
||||
INST_INSERT_VALUE,
|
||||
// retile
|
||||
INST_RESHAPE,
|
||||
INST_SPLAT,
|
||||
|
@@ -24,7 +24,7 @@ public:
|
||||
static argument* create(type *ty, const std::string &name,
|
||||
function *parent = nullptr, unsigned arg_no = 0);
|
||||
function* get_parent() const;
|
||||
unsigned get_arg_no() const;
|
||||
unsigned get_arg_no() const;
|
||||
|
||||
void accept(visitor *v);
|
||||
|
||||
@@ -121,6 +121,8 @@ public:
|
||||
const attr_map_t &attrs() { return attrs_; }
|
||||
bool has_attr(unsigned arg_id) const { return attrs_.find(arg_id) != attrs_.end(); }
|
||||
std::set<attribute> get_attributes(const argument* arg) { return attrs_[arg->get_arg_no() + 1]; }
|
||||
void set_is_kernel(bool new_val) { is_kernel_ = new_val; }
|
||||
bool get_is_kernel() { return is_kernel_; }
|
||||
|
||||
void print(std::ostream &os);
|
||||
|
||||
@@ -134,6 +136,7 @@ private:
|
||||
args_t args_;
|
||||
blocks_t blocks_;
|
||||
attr_map_t attrs_;
|
||||
bool is_kernel_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@@ -81,6 +81,51 @@ private:
|
||||
value_id_t id_;
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// call_inst classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
class call_inst: public instruction {
|
||||
private:
|
||||
std::string repr_impl() const;
|
||||
call_inst(ir::function* fn, const std::vector<ir::value*>& values, const std::string& name, instruction* next);
|
||||
|
||||
public:
|
||||
static call_inst* create(ir::function* fn, const std::vector<ir::value*>& values, const std::string &name = "", instruction *next = nullptr);
|
||||
ir::function* get_fn() { return fn_; }
|
||||
|
||||
_TRITON_DEFINE_CLONE(call_inst)
|
||||
_TRITON_DEFINE_ACCEPT(call_inst)
|
||||
|
||||
private:
|
||||
ir::function* fn_;
|
||||
};
|
||||
|
||||
class launch_inst: public instruction {
|
||||
private:
|
||||
std::string repr_impl() const { return "launch"; }
|
||||
launch_inst(ir::function* fn, const std::vector<ir::value*>& values, const std::vector<ir::value*>& grid, ir::value* num_warps,
|
||||
const std::string &name = "", instruction *next = nullptr);
|
||||
|
||||
public:
|
||||
static launch_inst* create(ir::function* fn, const std::vector<ir::value*>& values, const std::vector<ir::value*>& grid, ir::value* num_warps,
|
||||
const std::string& name = "", instruction* next = nullptr);
|
||||
|
||||
ir::function* get_fn();
|
||||
std::vector<ir::value*> get_values();
|
||||
std::vector<ir::value*> get_grid();
|
||||
ir::value* get_num_warps();
|
||||
|
||||
|
||||
_TRITON_DEFINE_CLONE(launch_inst)
|
||||
_TRITON_DEFINE_ACCEPT(launch_inst)
|
||||
|
||||
private:
|
||||
unsigned val_begin;
|
||||
unsigned val_end;
|
||||
unsigned grid_begin;
|
||||
unsigned grid_end;
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// phi_node classes
|
||||
@@ -546,6 +591,44 @@ public:
|
||||
_TRITON_DEFINE_ACCEPT(masked_store_inst)
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// struct classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// insert_value
|
||||
|
||||
class insert_value_inst: public instruction {
|
||||
private:
|
||||
std::string repr_impl() const { return "insertvalue"; }
|
||||
insert_value_inst(value *val, value *elt, size_t idx, const std::string &name, instruction *next);
|
||||
|
||||
public:
|
||||
static insert_value_inst* create(value *val, value* elt, size_t idx, const std::string &name = "", instruction *next = nullptr);
|
||||
size_t get_idx() { return idx_; }
|
||||
_TRITON_DEFINE_CLONE(insert_value_inst)
|
||||
_TRITON_DEFINE_ACCEPT(insert_value_inst)
|
||||
|
||||
private:
|
||||
size_t idx_;
|
||||
};
|
||||
|
||||
// extract_value
|
||||
|
||||
class extract_value_inst: public instruction {
|
||||
private:
|
||||
std::string repr_impl() const { return "extractvalue"; }
|
||||
extract_value_inst(value *val, size_t idx, const std::string &name, instruction *next);
|
||||
|
||||
public:
|
||||
static extract_value_inst* create(value *val, size_t idx, const std::string &name = "", instruction *next = nullptr);
|
||||
size_t get_idx() { return idx_; }
|
||||
_TRITON_DEFINE_CLONE(extract_value_inst)
|
||||
_TRITON_DEFINE_ACCEPT(extract_value_inst)
|
||||
|
||||
private:
|
||||
size_t idx_;
|
||||
};
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// retile_inst classes
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@@ -34,79 +34,97 @@ class constant;
|
||||
class global_value;
|
||||
class alloc_const;
|
||||
|
||||
/* Module */
|
||||
|
||||
class module {
|
||||
class value_constructor {
|
||||
typedef std::pair<std::string, basic_block*> val_key_t;
|
||||
friend class function;
|
||||
typedef std::pair<ir::metadata::kind_t, unsigned> md_pair_t;
|
||||
|
||||
public:
|
||||
typedef std::map<std::string, global_value*> symbols_map_t;
|
||||
typedef std::vector<function*> functions_list_t;
|
||||
struct current_iteration_info_t{
|
||||
lang::iteration_statement *statement;
|
||||
basic_block *block;
|
||||
};
|
||||
|
||||
private:
|
||||
phi_node *make_phi(type *ty, unsigned num_values, basic_block *block);
|
||||
value *try_remove_trivial_phis(ir::phi_node *&phi);
|
||||
value *add_phi_operands(const std::string& name, phi_node *&phi);
|
||||
value *get_value_recursive(const std::string& name, basic_block *block);
|
||||
|
||||
public:
|
||||
value_constructor(builder &builder);
|
||||
|
||||
void set_value(const std::string& name, basic_block* block, value *x);
|
||||
void set_value(const std::string& name, value* x);
|
||||
const std::map<val_key_t, value*>& get_values() { return values_; }
|
||||
void set_values(const std::map<val_key_t, value*>& values) { values_ = values; }
|
||||
value *get_value(const std::string& name, basic_block* block);
|
||||
value *get_value(const std::string& name);
|
||||
void set_type(const std::string& name, ir::type* ty) { types_[name] = ty; }
|
||||
// Seal block -- no more predecessors will be added
|
||||
void seal_block(basic_block *block);
|
||||
// Metadata
|
||||
void add_metadata(const std::string &name, md_pair_t x) { metadatas_[name] = x; }
|
||||
|
||||
private:
|
||||
ir::builder& builder_;
|
||||
std::map<val_key_t, value*> values_;
|
||||
std::map<std::string, type*> types_;
|
||||
std::set<basic_block*> sealed_blocks_;
|
||||
std::map<basic_block*, std::map<std::string, phi_node*>> incomplete_phis_;
|
||||
std::map<value*, value**> current_phi_;
|
||||
std::map<std::string, md_pair_t> metadatas_;
|
||||
};
|
||||
|
||||
/* Module */
|
||||
|
||||
class module {
|
||||
typedef std::pair<std::string, basic_block*> val_key_t;
|
||||
friend class function;
|
||||
|
||||
public:
|
||||
typedef std::map<std::string, global_value*> symbols_map_t;
|
||||
typedef std::vector<function*> functions_list_t;
|
||||
|
||||
private:
|
||||
void push_function(function *fn) { functions_.push_back(fn); }
|
||||
|
||||
public:
|
||||
module(const std::string &name, builder& builder);
|
||||
builder& get_builder();
|
||||
// Setters
|
||||
void set_value(const std::string& name, basic_block* block, value *x);
|
||||
void set_value(const std::string& name, value* x);
|
||||
void set_const(const std::string& name);
|
||||
void set_continue_fn(std::function<ir::value*()> fn);
|
||||
// Getters
|
||||
const std::map<val_key_t, value*>& get_values() { return values_; }
|
||||
const std::map<std::string, type*>& get_types() { return types_; }
|
||||
void set_values(const std::map<val_key_t, value*>& values) { values_ = values; }
|
||||
void set_types(const std::map<std::string, type*>& types) { types_ = types; }
|
||||
|
||||
value *get_value(const std::string& name, basic_block* block);
|
||||
value *get_value(const std::string& name);
|
||||
void set_type(const std::string& name, ir::type* ty) { types_[name] = ty; }
|
||||
const std::string& get_name();
|
||||
std::function<ir::value*()> get_continue_fn();
|
||||
// Seal block -- no more predecessors will be added
|
||||
void seal_block(basic_block *block);
|
||||
// Functions
|
||||
const functions_list_t &get_function_list() const { return functions_; }
|
||||
functions_list_t &get_function_list() { return functions_; }
|
||||
function *get_function(const std::string& name) {
|
||||
if(symbols_.find(name) == symbols_.end())
|
||||
throw std::runtime_error("function " + name + " is not declared");
|
||||
return (function*)symbols_.at(name);
|
||||
}
|
||||
function *get_or_insert_function(const std::string &name, function_type *ty);
|
||||
bool has_function(const std::string& name){
|
||||
return symbols_.find(name) != symbols_.end();
|
||||
}
|
||||
void remove_function(ir::function* fn){
|
||||
functions_.erase(std::remove(functions_.begin(), functions_.end(), fn), functions_.end());
|
||||
}
|
||||
|
||||
void reset_ret_ty(const std::string& name, type* ty);
|
||||
|
||||
// Const allocation
|
||||
void add_alloc(ir::alloc_const* x) { allocs_.push_back(x); }
|
||||
const std::vector<ir::alloc_const*>& allocs() { return allocs_; }
|
||||
// Register global
|
||||
void register_global(const std::string& name, ir::value *x) { globals_[name] = x; }
|
||||
const std::map<std::string, ir::value*>& globals() const { return globals_; }
|
||||
// Metadata
|
||||
void add_metadata(const std::string &name, md_pair_t x) { metadatas_[name] = x; }
|
||||
|
||||
//
|
||||
void print(std::ostream &os);
|
||||
|
||||
private:
|
||||
std::string name_;
|
||||
builder& builder_;
|
||||
std::map<val_key_t, value*> values_;
|
||||
std::map<std::string, type*> types_;
|
||||
std::set<std::string> const_;
|
||||
std::set<basic_block*> sealed_blocks_;
|
||||
std::map<basic_block*, std::map<std::string, phi_node*>> incomplete_phis_;
|
||||
functions_list_t functions_;
|
||||
symbols_map_t symbols_;
|
||||
std::function<ir::value*()> continue_fn_;
|
||||
std::map<value*, value**> current_phi_;
|
||||
std::vector<ir::alloc_const*> allocs_;
|
||||
std::map<std::string, ir::value*> globals_;
|
||||
std::map<std::string, md_pair_t> metadatas_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@@ -1,4 +1,4 @@
|
||||
#pragma once
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_IR_TYPE_H_
|
||||
#define _TRITON_IR_TYPE_H_
|
||||
@@ -73,6 +73,8 @@ public:
|
||||
type *get_tile_element_ty() const;
|
||||
unsigned get_pointer_address_space() const;
|
||||
type *get_pointer_element_ty() const;
|
||||
unsigned get_struct_numel() const { return contained_tys_.size(); }
|
||||
type *get_struct_type(unsigned int i) const { return contained_tys_[i]; }
|
||||
|
||||
// primitive predicates
|
||||
bool is_void_ty() const { return id_ == VoidTyID; }
|
||||
@@ -91,6 +93,7 @@ public:
|
||||
bool is_bool_ty() const { return is_integer_ty(1); }
|
||||
bool is_pointer_ty() const { return id_ == PointerTyID; }
|
||||
bool is_block_ty() const { return id_ == BlockTyID; }
|
||||
bool is_struct_ty() const { return id_ == StructTyID; }
|
||||
|
||||
// Composite predicates
|
||||
bool is_int_or_tileint_ty();
|
||||
@@ -138,10 +141,10 @@ public:
|
||||
switch(id_) {
|
||||
case VoidTyID: return "void";
|
||||
case FP8TyID: return "fp8";
|
||||
case BF16TyID: return "bf16";
|
||||
case FP16TyID: return "f16";
|
||||
case FP32TyID: return "f32";
|
||||
case FP64TyID: return "f64";
|
||||
case BF16TyID: return "bf16";
|
||||
case LabelTyID: return "label";
|
||||
case MetadataTyID: return "md";
|
||||
case TokenTyID: return "tok";
|
||||
@@ -194,6 +197,16 @@ public:
|
||||
type* get_type_at_index(value *idx) const;
|
||||
};
|
||||
|
||||
class struct_type: public composite_type {
|
||||
public:
|
||||
struct_type(const contained_tys_vec_t& tys, bool is_packed);
|
||||
unsigned get_num_types() const { return contained_tys_.size(); }
|
||||
static struct_type* get(const contained_tys_vec_t& tys, bool is_packed);
|
||||
|
||||
private:
|
||||
bool is_packed_;
|
||||
};
|
||||
|
||||
class block_type: public composite_type {
|
||||
private:
|
||||
block_type(type *ty, const block_shapes_t &shapes);
|
||||
@@ -242,6 +255,7 @@ public:
|
||||
ty_iterator params_end() { return contained_tys_.end(); }
|
||||
type* get_param_ty(unsigned i) const { return contained_tys_.at(1 + i); }
|
||||
type* get_return_ty() const { return contained_tys_.at(0); }
|
||||
void reset_ret_ty(type* ty) { contained_tys_[0] = ty;}
|
||||
// factory methods
|
||||
static function_type* get(type *ret_ty, const std::vector<type*>& param_tys);
|
||||
};
|
||||
|
@@ -21,7 +21,7 @@ class visitor;
|
||||
|
||||
class value {
|
||||
public:
|
||||
typedef std::set<user*> users_t;
|
||||
typedef std::vector<user*> users_t;
|
||||
|
||||
public:
|
||||
// constructor
|
||||
@@ -30,7 +30,7 @@ public:
|
||||
// uses
|
||||
void add_use(user* arg);
|
||||
users_t::iterator erase_use(user* arg);
|
||||
const std::set<user*> &get_users() { return users_; }
|
||||
const std::vector<user*> &get_users() { return users_; }
|
||||
void replace_all_uses_with(value *target);
|
||||
// name
|
||||
void set_name(const std::string &name);
|
||||
|
@@ -11,6 +11,9 @@ class value;
|
||||
|
||||
class instruction;
|
||||
|
||||
class call_inst;
|
||||
class launch_inst;
|
||||
|
||||
class phi_node;
|
||||
class binary_operator;
|
||||
class getelementptr_inst;
|
||||
@@ -42,6 +45,9 @@ class masked_load_inst;
|
||||
class unmasked_store_inst;
|
||||
class masked_store_inst;
|
||||
|
||||
class extract_value_inst;
|
||||
class insert_value_inst;
|
||||
|
||||
class retile_inst;
|
||||
class reshape_inst;
|
||||
class splat_inst;
|
||||
@@ -105,6 +111,8 @@ public:
|
||||
virtual ~visitor() {}
|
||||
|
||||
virtual void visit_value(ir::value*);
|
||||
virtual void visit_call_inst(ir::call_inst*) = 0;
|
||||
virtual void visit_launch_inst(ir::launch_inst*) = 0;
|
||||
|
||||
virtual void visit_basic_block(basic_block*) = 0;
|
||||
virtual void visit_argument(argument*) = 0;
|
||||
@@ -132,6 +140,9 @@ public:
|
||||
virtual void visit_sin_inst(sin_inst*) = 0;
|
||||
virtual void visit_log_inst(log_inst*) = 0;
|
||||
|
||||
virtual void visit_extract_value_inst(extract_value_inst*) = 0;
|
||||
virtual void visit_insert_value_inst(insert_value_inst*) = 0;
|
||||
|
||||
virtual void visit_reshape_inst(reshape_inst*) = 0;
|
||||
virtual void visit_splat_inst(splat_inst*) = 0;
|
||||
virtual void visit_cat_inst(cat_inst*) = 0;
|
||||
|
Reference in New Issue
Block a user