[DRIVER] Fix typos (#939)

Yang Hau
2022-12-03 03:13:46 +08:00
committed by GitHub
parent 44f577984d
commit 8650b4d1cb
8 changed files with 60 additions and 60 deletions

View File

@@ -818,7 +818,7 @@ typedef enum CUcomputemode_enum {
* Memory advise values
*/
typedef enum CUmem_advise_enum {
-CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occassionally be written to */
+CU_MEM_ADVISE_SET_READ_MOSTLY = 1, /**< Data will mostly be read and only occasionally be written to */
CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2, /**< Undo the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY */
CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3, /**< Set the preferred location for the data as the specified device */
CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4, /**< Clear the preferred location for the data */
@@ -827,7 +827,7 @@ typedef enum CUmem_advise_enum {
} CUmem_advise;
typedef enum CUmem_range_attribute_enum {
-CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occassionally be written to */
+CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1, /**< Whether the range will mostly be read and only occasionally be written to */
CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2, /**< The preferred location of the range */
CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3, /**< Memory range has ::CU_MEM_ADVISE_SET_ACCESSED_BY set for specified device */
CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4 /**< The last location to which the range was prefetched */
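For context only (not part of this diff): a minimal sketch of how the read-mostly advice above is typically applied to a managed allocation. The 1 MiB size and device ordinal 0 are arbitrary illustration values, and error handling is collapsed to a single check.

#include <cuda.h>

// Sketch: mark a managed buffer as read-mostly so the driver may keep
// read-only copies close to the processors that read it.
int advise_read_mostly() {
    if (cuInit(0) != CUDA_SUCCESS) return -1;

    CUdevice dev;
    CUcontext ctx;
    cuDeviceGet(&dev, 0);
    cuCtxCreate(&ctx, 0, dev);

    CUdeviceptr buf;
    const size_t bytes = 1 << 20;                  // 1 MiB, arbitrary
    cuMemAllocManaged(&buf, bytes, CU_MEM_ATTACH_GLOBAL);

    // Data will mostly be read and only occasionally be written to.
    cuMemAdvise(buf, bytes, CU_MEM_ADVISE_SET_READ_MOSTLY, dev);

    cuMemFree(buf);
    cuCtxDestroy(ctx);
    return 0;
}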
@@ -849,7 +849,7 @@ typedef enum CUjit_option_enum
* IN: Specifies minimum number of threads per block to target compilation
* for\n
* OUT: Returns the number of threads the compiler actually targeted.
-* This restricts the resource utilization fo the compiler (e.g. max
+* This restricts the resource utilization of the compiler (e.g. max
* registers) such that a block with the given number of threads should be
* able to launch based on register limitations. Note, this option does not
* currently take into account any other resource limitations, such as
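A usage sketch (not part of this diff) of the IN/OUT behaviour described above, assuming ptx_src holds a NUL-terminated PTX image; the 256-thread target is an arbitrary illustration value, and the write-back of the targeted count into the option-value slot follows the OUT description above:

#include <cuda.h>
#include <cstdint>
#include <cstdio>

// Sketch: JIT a PTX image while asking the compiler to target at least
// 256 threads per block.
CUmodule jit_with_block_target(const char* ptx_src) {
    CUjit_option opts[] = { CU_JIT_THREADS_PER_BLOCK };
    void* vals[]        = { (void*)(uintptr_t)256 };   // IN: requested minimum

    CUmodule mod = nullptr;
    if (cuModuleLoadDataEx(&mod, ptx_src, 1, opts, vals) == CUDA_SUCCESS) {
        // OUT: number of threads the compiler actually targeted.
        printf("targeted %u threads per block\n", (unsigned)(uintptr_t)vals[0]);
    }
    return mod;
}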
@@ -974,10 +974,10 @@ typedef enum CUjit_option_enum
CU_JIT_FAST_COMPILE,
/**
-* Array of device symbol names that will be relocated to the corresponing
+* Array of device symbol names that will be relocated to the corresponding
* host addresses stored in ::CU_JIT_GLOBAL_SYMBOL_ADDRESSES.\n
* Must contain ::CU_JIT_GLOBAL_SYMBOL_COUNT entries.\n
-* When loding a device module, driver will relocate all encountered
+* When loading a device module, driver will relocate all encountered
* unresolved symbols to the host addresses.\n
* It is only allowed to register symbols that correspond to unresolved
* global variables.\n
@@ -1194,7 +1194,7 @@ typedef enum CUlimit_enum {
* Resource types
*/
typedef enum CUresourcetype_enum {
-CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resoure */
+CU_RESOURCE_TYPE_ARRAY = 0x00, /**< Array resource */
CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01, /**< Mipmapped array resource */
CU_RESOURCE_TYPE_LINEAR = 0x02, /**< Linear resource */
CU_RESOURCE_TYPE_PITCH2D = 0x03 /**< Pitch 2D resource */
@@ -2914,9 +2914,9 @@ typedef struct CUmemAllocationProp_st {
CUmemLocation location;
/**
* Windows-specific POBJECT_ATTRIBUTES required when
-* ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object atributes structure
+* ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This object attributes structure
* includes security attributes that define
-* the scope of which exported allocations may be tranferred to other
+* the scope of which exported allocations may be transferred to other
* processes. In all other cases, this field is required to be zero.
*/
void *win32HandleMetaData;
@@ -3036,7 +3036,7 @@ typedef struct CUmemPoolProps_st {
/**
* Windows-specific LPSECURITYATTRIBUTES required when
* ::CU_MEM_HANDLE_TYPE_WIN32 is specified. This security attribute defines
-* the scope of which exported allocations may be tranferred to other
+* the scope of which exported allocations may be transferred to other
* processes. In all other cases, this field is required to be zero.
*/
void *win32SecurityAttributes;
@@ -3519,7 +3519,7 @@ CUresult CUDAAPI cuDeviceGet(CUdevice *device, int ordinal);
CUresult CUDAAPI cuDeviceGetCount(int *count);
/**
-* \brief Returns an identifer string for the device
+* \brief Returns an identifier string for the device
*
* Returns an ASCII string identifying the device \p dev in the NULL-terminated
* string pointed to by \p name. \p len specifies the maximum length of the
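A minimal sketch (not part of this diff) of the identifier-string query documented above, enumerating every visible device; the 256-byte buffer is an arbitrary choice:

#include <cuda.h>
#include <cstdio>

// Sketch: print the identifier string of every visible CUDA device.
void list_devices() {
    if (cuInit(0) != CUDA_SUCCESS) return;

    int count = 0;
    cuDeviceGetCount(&count);
    for (int i = 0; i < count; ++i) {
        CUdevice dev;
        char name[256];                        // arbitrary but ample buffer
        cuDeviceGet(&dev, i);
        cuDeviceGetName(name, (int)sizeof(name), dev);
        printf("device %d: %s\n", i, name);
    }
}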
@@ -3556,7 +3556,7 @@ CUresult CUDAAPI cuDeviceGetName(char *name, int len, CUdevice dev);
* Note there is a later version of this API, ::cuDeviceGetUuid_v2. It will
* supplant this version in 12.0, which is retained for minor version compatibility.
*
-* Returns 16-octets identifing the device \p dev in the structure
+* Returns 16-octets identifying the device \p dev in the structure
* pointed by the \p uuid.
*
* \param uuid - Returned UUID
@@ -3586,7 +3586,7 @@ CUresult CUDAAPI cuDeviceGetUuid(CUuuid *uuid, CUdevice dev);
/**
* \brief Return an UUID for the device (11.4+)
*
-* Returns 16-octets identifing the device \p dev in the structure
+* Returns 16-octets identifying the device \p dev in the structure
* pointed by the \p uuid. If the device is in MIG mode, returns its
* MIG UUID which uniquely identifies the subscribed MIG compute instance.
*
@@ -3867,7 +3867,7 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
* supports native atomic operations.
* - ::CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO: Ratio of single precision performance
* (in floating-point operations per second) to double precision performance.
-* - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device suppports coherently accessing
+* - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS: Device supports coherently accessing
* pageable memory without calling cudaHostRegister on it.
* - ::CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS: Device can coherently access managed memory
* concurrently with the CPU.
@@ -3875,7 +3875,7 @@ CUresult CUDAAPI cuDeviceGetTexture1DLinearMaxWidth(size_t *maxWidthInElements,
* - ::CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM: Device can access host registered
* memory at the same virtual address as the CPU.
* - ::CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN: The maximum per block shared memory size
-* suported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call.
+* supported on this device. This is the maximum value that can be opted into when using the cuFuncSetAttribute() call.
* For more details see ::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
* - ::CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES: Device accesses pageable memory via the host's
* page tables.
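For reference (not part of this diff), a short sketch querying two of the attributes listed above on device 0; it assumes the driver has not yet been initialized by the surrounding program:

#include <cuda.h>
#include <cstdio>

// Sketch: query a couple of the device attributes documented above.
void print_attributes() {
    if (cuInit(0) != CUDA_SUCCESS) return;

    CUdevice dev;
    cuDeviceGet(&dev, 0);

    int pageable = 0, optin_smem = 0;
    cuDeviceGetAttribute(&pageable, CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS, dev);
    cuDeviceGetAttribute(&optin_smem, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);

    printf("pageable memory access: %d\n", pageable);
    printf("opt-in shared memory per block: %d bytes\n", optin_smem);
}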
@@ -4132,7 +4132,7 @@ __CUDA_DEPRECATED CUresult CUDAAPI cuDeviceGetProperties(CUdevprop *prop, CUdevi
*
* \deprecated
*
-* This function was deprecated as of CUDA 5.0 and its functionality superceded
+* This function was deprecated as of CUDA 5.0 and its functionality superseded
* by ::cuDeviceGetAttribute().
*
* Returns in \p *major and \p *minor the major and minor revision numbers that
@@ -4962,10 +4962,10 @@ CUresult CUDAAPI cuCtxSynchronize(void);
* returned.
*
* - ::CU_LIMIT_MAX_L2_FETCH_GRANULARITY controls the L2 cache fetch granularity.
-* Values can range from 0B to 128B. This is purely a performence hint and
+* Values can range from 0B to 128B. This is purely a performance hint and
* it can be ignored or clamped depending on the platform.
*
-* - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes availabe for
+* - ::CU_LIMIT_PERSISTING_L2_CACHE_SIZE controls size in bytes available for
* persisting L2 cache. This is purely a performance hint and it can be
* ignored or clamped depending on the platform.
*
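Since both limits above are only hints, a caller would typically read the value back after setting it. A minimal sketch (not part of this diff), assuming a current CUDA context already exists:

#include <cuda.h>

// Sketch: both limits are performance hints, so the driver may clamp or
// ignore the request; read the limit back to see what was actually applied.
void set_l2_hints() {
    size_t applied = 0;

    cuCtxSetLimit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY, 128);     // request 128B
    cuCtxGetLimit(&applied, CU_LIMIT_MAX_L2_FETCH_GRANULARITY);

    cuCtxSetLimit(CU_LIMIT_PERSISTING_L2_CACHE_SIZE, 1 << 20); // request 1 MiB
    cuCtxGetLimit(&applied, CU_LIMIT_PERSISTING_L2_CACHE_SIZE);
}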
@@ -6398,7 +6398,7 @@ CUresult CUDAAPI cuMemHostGetFlags(unsigned int *pFlags, void *p);
* ::cuStreamAttachMemAsync will be required to enable access on such devices.
*
* If the association is later changed via ::cuStreamAttachMemAsync to
-* a single stream, the default association as specifed during ::cuMemAllocManaged
+* a single stream, the default association as specified during ::cuMemAllocManaged
* is restored when that stream is destroyed. For __managed__ variables, the
* default association is always ::CU_MEM_ATTACH_GLOBAL. Note that destroying a
* stream is an asynchronous operation, and as a result, the change to default
@@ -9616,13 +9616,13 @@ CUresult CUDAAPI cuMemAddressFree(CUdeviceptr ptr, size_t size);
* \brief Create a CUDA memory handle representing a memory allocation of a given size described by the given properties
*
* This creates a memory allocation on the target device specified through the
-* \p prop strcuture. The created allocation will not have any device or host
+* \p prop structure. The created allocation will not have any device or host
* mappings. The generic memory \p handle for the allocation can be
* mapped to the address space of calling process via ::cuMemMap. This handle
* cannot be transmitted directly to other processes (see
* ::cuMemExportToShareableHandle). On Windows, the caller must also pass
* an LPSECURITYATTRIBUTE in \p prop to be associated with this handle which
-* limits or allows access to this handle for a recepient process (see
+* limits or allows access to this handle for a recipient process (see
* ::CUmemAllocationProp::win32HandleMetaData for more). The \p size of this
* allocation must be a multiple of the the value given via
* ::cuMemGetAllocationGranularity with the ::CU_MEM_ALLOC_GRANULARITY_MINIMUM
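A condensed sketch (not part of this diff) of the sizing rule described above: round the requested size up to the minimum granularity before calling ::cuMemCreate. It assumes an initialized driver and a current context on device 0; mapping via cuMemAddressReserve/cuMemMap is elided.

#include <cuda.h>
#include <cstring>

// Sketch: create and release a physical allocation whose size is a
// multiple of the minimum allocation granularity.
int create_physical_allocation() {
    CUmemAllocationProp prop;
    memset(&prop, 0, sizeof(prop));
    prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id = 0;                          // target device ordinal

    size_t gran = 0;
    cuMemGetAllocationGranularity(&gran, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);

    size_t want = 1 << 20;                         // 1 MiB requested, arbitrary
    size_t size = ((want + gran - 1) / gran) * gran;

    CUmemGenericAllocationHandle handle;
    if (cuMemCreate(&handle, size, &prop, 0) != CUDA_SUCCESS) return -1;

    // ... cuMemAddressReserve / cuMemMap / cuMemSetAccess would go here ...

    cuMemRelease(handle);
    return 0;
}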
@@ -9660,7 +9660,7 @@ CUresult CUDAAPI cuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
* are unmapped and when all outstanding references to the handle (including it's
* shareable counterparts) are also released. The generic memory handle can be
* freed when there are still outstanding mappings made with this handle. Each
-* time a recepient process imports a shareable handle, it needs to pair it with
+* time a recipient process imports a shareable handle, it needs to pair it with
* ::cuMemRelease for the handle to be freed. If \p handle is not a valid handle
* the behavior is undefined.
*
@@ -10975,7 +10975,7 @@ CUresult CUDAAPI cuMemAdvise(CUdeviceptr devPtr, size_t count, CUmem_advise advi
* a GPU id or CU_DEVICE_CPU depending on whether the last location for prefetch was a GPU or the CPU
* respectively. If any page in the memory range was never explicitly prefetched or if all pages were not
* prefetched to the same location, CU_DEVICE_INVALID will be returned. Note that this simply returns the
-* last location that the applicaton requested to prefetch the memory range to. It gives no indication as to
+* last location that the application requested to prefetch the memory range to. It gives no indication as to
* whether the prefetch operation to that location has completed or even begun.
*
* \param data - A pointers to a memory location where the result
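To make the last-prefetch semantics above concrete, a small sketch (not part of this diff); ptr and bytes are assumed to describe a managed range allocated elsewhere:

#include <cuda.h>
#include <cstdio>

// Sketch: report where a managed range was last requested to be prefetched.
void query_last_prefetch(CUdeviceptr ptr, size_t bytes) {
    int last = 0;
    cuMemRangeGetAttribute(&last, sizeof(last),
                           CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION,
                           ptr, bytes);

    if (last == CU_DEVICE_INVALID)
        printf("range was not uniformly prefetched\n");
    else if (last == CU_DEVICE_CPU)
        printf("last prefetched to the CPU\n");
    else
        printf("last prefetched to device %d\n", last);
}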
@@ -13561,7 +13561,7 @@ CUresult CUDAAPI cuLaunchCooperativeKernel(CUfunction f,
* All kernels launched must be identical with respect to the compiled code. Note that
* any __device__, __constant__ or __managed__ variables present in the module that owns
* the kernel launched on each device, are independently instantiated on every device.
-* It is the application's responsiblity to ensure these variables are initialized and
+* It is the application's responsibility to ensure these variables are initialized and
* used appropriately.
*
* The size of the grids as specified in blocks, the size of the blocks themselves

View File

@@ -328,7 +328,7 @@ typedef enum nvmlGpuLevel_enum
typedef enum nvmlGpuP2PStatus_enum
{
NVML_P2P_STATUS_OK = 0,
-NVML_P2P_STATUS_CHIPSET_NOT_SUPPORED,
+NVML_P2P_STATUS_CHIPSET_NOT_SUPPORTED,
NVML_P2P_STATUS_GPU_NOT_SUPPORTED,
NVML_P2P_STATUS_IOH_TOPOLOGY_NOT_SUPPORTED,
NVML_P2P_STATUS_DISABLED_BY_REGKEY,
@@ -736,7 +736,7 @@ typedef enum nvmlReturn_enum
NVML_ERROR_IN_USE = 19, //!< An operation cannot be performed because the GPU is currently in use
NVML_ERROR_MEMORY = 20, //!< Insufficient memory
NVML_ERROR_NO_DATA = 21, //!<No data
-NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, becasue ECC is enabled
+NVML_ERROR_VGPU_ECC_NOT_SUPPORTED = 22, //!< The requested vgpu operation is not available on target device, because ECC is enabled
NVML_ERROR_UNKNOWN = 999 //!< An internal driver error occurred
} nvmlReturn_t;
@@ -1463,7 +1463,7 @@ typedef struct nvmlEncoderSessionInfo_st
*/
typedef enum nvmlFBCSessionType_enum
{
-NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknwon
+NVML_FBC_SESSION_TYPE_UNKNOWN = 0, //!< Unknown
NVML_FBC_SESSION_TYPE_TOSYS, //!< ToSys
NVML_FBC_SESSION_TYPE_CUDA, //!< Cuda
NVML_FBC_SESSION_TYPE_VID, //!< Vid
@@ -3678,10 +3678,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetEncoderStats (nvmlDevice_t device, unsigned in
* Retrieves information about active encoder sessions on a target device.
*
* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfos. The
-* array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
* written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
* To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
* NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
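The two-call pattern described above looks roughly like the following sketch (not part of this diff); device index 0 is an arbitrary choice and the count-only first call passes a null buffer:

#include <nvml.h>
#include <vector>
#include <cstdio>

// Sketch: query the required element count first, then fetch the sessions.
void dump_encoder_sessions() {
    if (nvmlInit() != NVML_SUCCESS) return;

    nvmlDevice_t dev;
    if (nvmlDeviceGetHandleByIndex(0, &dev) == NVML_SUCCESS) {
        unsigned int count = 0;
        nvmlDeviceGetEncoderSessions(dev, &count, nullptr);   // count query only

        std::vector<nvmlEncoderSessionInfo_t> sessions(count);
        if (count > 0 &&
            nvmlDeviceGetEncoderSessions(dev, &count, sessions.data()) == NVML_SUCCESS)
            printf("%u active encoder session(s)\n", count);
    }
    nvmlShutdown();
}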
@@ -3727,7 +3727,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetDecoderUtilization(nvmlDevice_t device, unsign
* For Maxwell &tm; or newer fully supported devices.
*
* @param device The identifier of the target device
-* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats
+* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats
*
* @return
* - \ref NVML_SUCCESS if \a fbcStats is fetched
@@ -3742,10 +3742,10 @@ nvmlReturn_t DECLDIR nvmlDeviceGetFBCStats(nvmlDevice_t device, nvmlFBCStats_t *
* Retrieves information about active frame buffer capture sessions on a target device.
*
* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
-* array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
* written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return
* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
@@ -4208,7 +4208,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetRetiredPages(nvmlDevice_t device, nvmlPageReti
* The address information provided from this API is the hardware address of the page that was retired. Note
* that this does not match the virtual address used in CUDA, but will match the address information in XID 63
*
-* \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps paramter to return the time of each page's
+* \note nvmlDeviceGetRetiredPages_v2 adds an additional timestamps parameter to return the time of each page's
* retirement.
*
* For Kepler &tm; or newer fully supported devices.
@@ -4476,7 +4476,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetDriverModel(nvmlDevice_t device, nvmlDriverMod
* Set clocks that device will lock to.
*
* Sets the clocks that the device will be running at to the value in the range of minGpuClockMHz to maxGpuClockMHz.
-* Setting this will supercede application clock values and take effect regardless if a cuda app is running.
+* Setting this will supersede application clock values and take effect regardless if a cuda app is running.
* See /ref nvmlDeviceSetApplicationsClocks
*
* Can be used as a setting to request constant performance.
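As an illustration of the behaviour above (not part of this diff): locking clocks for a reproducible measurement and then restoring the default. The 1350 MHz value is arbitrary and must lie within the device's supported range; elevated privileges are typically required.

#include <nvml.h>

// Sketch: lock GPU clocks to a fixed range, run a workload, then reset.
void lock_clocks_for_benchmark(nvmlDevice_t dev) {
    nvmlDeviceSetGpuLockedClocks(dev, 1350, 1350);  // minGpuClockMHz, maxGpuClockMHz

    // ... run the workload to be measured ...

    nvmlDeviceResetGpuLockedClocks(dev);            // restore default behaviour
}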
@@ -5297,7 +5297,7 @@ nvmlReturn_t DECLDIR nvmlDeviceSetVirtualizationMode(nvmlDevice_t device, nvmlGp
* pointed at by \a vgpuTypeIds. The element count of nvmlVgpuTypeId_t array is passed in \a vgpuCount, and \a vgpuCount
* is used to return the number of vGPU types written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
+* If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
* To query the number of vGPU types supported for the GPU, call this function with *vgpuCount = 0.
* The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are supported.
@@ -5327,9 +5327,9 @@ nvmlReturn_t DECLDIR nvmlDeviceGetSupportedVgpus(nvmlDevice_t device, unsigned i
* can concurrently run on a device. For example, if only one vGPU type is allowed at a time on a device, then the creatable
* list will be restricted to whatever vGPU type is already running on the device.
*
-* If the supplied buffer is not large enough to accomodate the vGPU type array, the function returns
+* If the supplied buffer is not large enough to accommodate the vGPU type array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuTypeId_t array required in \a vgpuCount.
-* To query the number of vGPU types createable for the GPU, call this function with *vgpuCount = 0.
+* To query the number of vGPU types creatable for the GPU, call this function with *vgpuCount = 0.
* The code will return NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU types are creatable.
*
* @param device The identifier of the target device
@@ -5392,7 +5392,7 @@ nvmlReturn_t DECLDIR nvmlVgpuTypeGetName(nvmlVgpuTypeId_t vgpuTypeId, char *vgpu
*
* @param vgpuTypeId Handle to vGPU type
* @param deviceID Device ID and vendor ID of the device contained in single 32 bit value
-* @param subsystemID Subsytem ID and subsytem vendor ID of the device contained in single 32 bit value
+* @param subsystemID subsystem ID and subsystem vendor ID of the device contained in single 32 bit value
*
* @return
* - \ref NVML_SUCCESS successful completion
@@ -5516,10 +5516,10 @@ nvmlReturn_t DECLDIR nvmlVgpuTypeGetMaxInstances(nvmlDevice_t device, nvmlVgpuTy
* Retrieve the active vGPU instances on a device.
*
* An array of active vGPU instances is returned in the caller-supplied buffer pointed at by \a vgpuInstances. The
-* array elememt count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
+* array element count is passed in \a vgpuCount, and \a vgpuCount is used to return the number of vGPU instances
* written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the vGPU instance array, the function returns
+* If the supplied buffer is not large enough to accommodate the vGPU instance array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlVgpuInstance_t array required in \a vgpuCount.
* To query the number of active vGPU instances, call this function with *vgpuCount = 0. The code will return
* NVML_ERROR_INSUFFICIENT_SIZE, or NVML_SUCCESS if no vGPU Types are supported.
@@ -5702,7 +5702,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFrameRateLimit(nvmlVgpuInstance_t vgpuIn
* @param encoderCapacity Reference to an unsigned int for the encoder capacity
*
* @return
-* - \ref NVML_SUCCESS if \a encoderCapacity has been retrived
+* - \ref NVML_SUCCESS if \a encoderCapacity has been retrieved
* - \ref NVML_ERROR_UNINITIALIZED if the library has not been successfully initialized
* - \ref NVML_ERROR_INVALID_ARGUMENT if \a vgpuInstance is 0, or \a encoderQueryType is invalid
* - \ref NVML_ERROR_NOT_FOUND if \a vgpuInstance does not match a valid active vGPU instance on the system
@@ -5863,10 +5863,10 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderStats(nvmlVgpuInstance_t vgpuInst
* Retrieves information about all active encoder sessions on a vGPU Instance.
*
* An array of active encoder sessions is returned in the caller-supplied buffer pointed at by \a sessionInfo. The
-* array elememt count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
+* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
* written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlEncoderSessionInfo_t array required in \a sessionCount.
* To query the number of active encoder sessions, call this function with *sessionCount = 0. The code will return
* NVML_SUCCESS with number of active encoder sessions updated in *sessionCount.
@@ -5896,7 +5896,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetEncoderSessions(nvmlVgpuInstance_t vgpuI
* For Maxwell &tm; or newer fully supported devices.
*
* @param vgpuInstance Identifier of the target vGPU instance
-* @param fbcStats Reference to nvmlFBCStats_t structure contianing NvFBC stats
+* @param fbcStats Reference to nvmlFBCStats_t structure containing NvFBC stats
*
* @return
* - \ref NVML_SUCCESS if \a fbcStats is fetched
@@ -5914,7 +5914,7 @@ nvmlReturn_t DECLDIR nvmlVgpuInstanceGetFBCStats(nvmlVgpuInstance_t vgpuInstance
* array element count is passed in \a sessionCount, and \a sessionCount is used to return the number of sessions
* written to the buffer.
*
-* If the supplied buffer is not large enough to accomodate the active session array, the function returns
+* If the supplied buffer is not large enough to accommodate the active session array, the function returns
* NVML_ERROR_INSUFFICIENT_SIZE, with the element count of nvmlFBCSessionInfo_t array required in \a sessionCount.
* To query the number of active FBC sessions, call this function with *sessionCount = 0. The code will return
* NVML_SUCCESS with number of active FBC sessions updated in *sessionCount.
@@ -6094,7 +6094,7 @@ typedef struct nvmlVgpuPgpuMetadata_st
unsigned int version; //!< Current version of the structure
unsigned int revision; //!< Current revision of the structure
char hostDriverVersion[NVML_SYSTEM_DRIVER_VERSION_BUFFER_SIZE]; //!< Host driver version
-unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualizaion capabilities bitfileld
+unsigned int pgpuVirtualizationCaps; //!< Pgpu virtualization capabilities bitfield
unsigned int reserved[7]; //!< Reserved for internal use
unsigned int opaqueDataSize; //!< Size of opaque data field in bytes
char opaqueData[4]; //!< Opaque data
@@ -6191,7 +6191,7 @@ nvmlReturn_t DECLDIR nvmlDeviceGetVgpuMetadata(nvmlDevice_t device, nvmlVgpuPgpu
*
* The caller passes in a buffer via \a compatibilityInfo, into which a compatibility information structure is written. The
* structure defines the states in which the vGPU / VM may be booted on the physical GPU. If the vGPU / VM compatibility
-* with the physical GPU is limited, a limit code indicates the factor limiting compability.
+* with the physical GPU is limited, a limit code indicates the factor limiting compatibility.
* (see \ref nvmlVgpuPgpuCompatibilityLimitCode_t for details).
*
* Note: vGPU compatibility does not take into account dynamic capacity conditions that may limit a system's ability to

View File

@@ -950,7 +950,7 @@ namespace half_float
/// Convert half-precision floating point to integer.
/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
/// \tparam E `true` for round to even, `false` for round away from zero
-/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
/// \param value binary representation of half-precision value
/// \return integral value
template<std::float_round_style R,bool E,typename T> T half2int_impl(uint16 value)
@@ -988,13 +988,13 @@ namespace half_float
/// Convert half-precision floating point to integer.
/// \tparam R rounding mode to use, `std::round_indeterminate` for fastest rounding
-/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
/// \param value binary representation of half-precision value
/// \return integral value
template<std::float_round_style R,typename T> T half2int(uint16 value) { return half2int_impl<R,HALF_ROUND_TIES_TO_EVEN,T>(value); }
/// Convert half-precision floating point to integer using round-to-nearest-away-from-zero.
-/// \tparam T type to convert to (buitlin integer type with at least 16 bits precision, excluding any implicit sign bits)
+/// \tparam T type to convert to (builtin integer type with at least 16 bits precision, excluding any implicit sign bits)
/// \param value binary representation of half-precision value
/// \return integral value
template<typename T> T half2int_up(uint16 value) { return half2int_impl<std::round_to_nearest,0,T>(value); }
@@ -1053,7 +1053,7 @@ namespace half_float
/// Half-precision floating point type.
/// This class implements an IEEE-conformant half-precision floating point type with the usual arithmetic operators and
-/// conversions. It is implicitly convertible to single-precision floating point, which makes artihmetic expressions and
+/// conversions. It is implicitly convertible to single-precision floating point, which makes arithmetic expressions and
/// functions with mixed-type operands to be of the most precise operand type. Additionally all arithmetic operations
/// (and many mathematical functions) are carried out in single-precision internally. All conversions from single- to
/// half-precision are done using the library's default rounding mode, but temporary results inside chained arithmetic
@@ -1062,7 +1062,7 @@ namespace half_float
/// According to the C++98/03 definition, the half type is not a POD type. But according to C++11's less strict and
/// extended definitions it is both a standard layout type and a trivially copyable type (even if not a POD type), which
/// means it can be standard-conformantly copied using raw binary copies. But in this context some more words about the
-/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not neccessarily have to be of
+/// actual size of the type. Although the half is representing an IEEE 16-bit type, it does not necessarily have to be of
/// exactly 16-bits size. But on any reasonable implementation the actual binary representation of this type will most
/// probably not ivolve any additional "magic" or padding beyond the simple binary representation of the underlying 16-bit
/// IEEE number, even if not strictly guaranteed by the standard. But even then it only has an actual size of 16 bits if
@@ -2181,7 +2181,7 @@ namespace half_float
/// Identity.
/// \param arg operand
-/// \return uncahnged operand
+/// \return unchanged operand
template<typename T> HALF_CONSTEXPR typename enable<T,T>::type operator+(T arg) { return arg; }
/// Negation.
@@ -2620,7 +2620,7 @@ namespace half_float
/// Multiply by power of two.
/// \param arg number to modify
/// \param exp power of two to multiply with
-/// \return \a arg multplied by 2 raised to \a exp
+/// \return \a arg multiplied by 2 raised to \a exp
// template<typename T> typename enable<half,T>::type ldexp(T arg, int exp) { return functions::scalbln(arg, exp); }
inline half ldexp(half arg, int exp) { return functions::scalbln(arg, exp); }
inline half ldexp(expr arg, int exp) { return functions::scalbln(arg, exp); }
@@ -2636,7 +2636,7 @@ namespace half_float
/// Multiply by power of two.
/// \param arg number to modify
/// \param exp power of two to multiply with
-/// \return \a arg multplied by 2 raised to \a exp
+/// \return \a arg multiplied by 2 raised to \a exp
// template<typename T> typename enable<half,T>::type scalbn(T arg, int exp) { return functions::scalbln(arg, exp); }
inline half scalbn(half arg, int exp) { return functions::scalbln(arg, exp); }
inline half scalbn(expr arg, int exp) { return functions::scalbln(arg, exp); }
@@ -2644,7 +2644,7 @@ namespace half_float
/// Multiply by power of two.
/// \param arg number to modify
/// \param exp power of two to multiply with
-/// \return \a arg multplied by 2 raised to \a exp
+/// \return \a arg multiplied by 2 raised to \a exp
// template<typename T> typename enable<half,T>::type scalbln(T arg, long exp) { return functions::scalbln(arg, exp); }
inline half scalbln(half arg, long exp) { return functions::scalbln(arg, exp); }
inline half scalbln(expr arg, long exp) { return functions::scalbln(arg, exp); }
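For completeness (not part of this diff), a tiny sketch of the ldexp/scalbn overloads documented above, assuming the bundled half.hpp is on the include path:

#include "half.hpp"
#include <cstdio>

// Sketch: ldexp/scalbn multiply a half by an exact power of two.
int main() {
    half_float::half x(1.5f);
    half_float::half y = ldexp(x, 3);     // 1.5 * 2^3 == 12.0 (found via ADL)
    half_float::half z = scalbn(x, -1);   // 1.5 * 2^-1 == 0.75

    printf("%g %g\n", static_cast<float>(y), static_cast<float>(z));
    return 0;
}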

View File

@@ -91,7 +91,7 @@ void inliner::do_inline(ir::function* fn, ir::call_inst* callsite, ir::builder&
if(inst_map.find(inst_op) != inst_map.end())
new_inst->set_operand(k, inst_map.at(inst_op));
}
-// handles a ret instruciton.
+// handles a ret instruction.
// instead of returning we need to branch to after the function call
if(ir::return_inst* ret = dynamic_cast<ir::return_inst*>(new_inst)) {
if(ir::value* ret_val = ret->get_return_value())

View File

@@ -51,7 +51,7 @@ def get_thirdparty_packages(triton_cache_path):
Package("pybind11", "pybind11-2.10.0", "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz", "include/pybind11/pybind11.h", "PYBIND11_INCLUDE_DIR", "") Package("pybind11", "pybind11-2.10.0", "https://github.com/pybind/pybind11/archive/refs/tags/v2.10.0.tar.gz", "include/pybind11/pybind11.h", "PYBIND11_INCLUDE_DIR", "")
] ]
if not use_system_llvm(): if not use_system_llvm():
# donwload LLVM if no suitable system LLVM is installed # download LLVM if no suitable system LLVM is installed
packages.append( packages.append(
Package("llvm", "clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04", "https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.1/clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04.tar.xz", "lib", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR") Package("llvm", "clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04", "https://github.com/llvm/llvm-project/releases/download/llvmorg-11.0.1/clang+llvm-11.0.1-x86_64-linux-gnu-ubuntu-16.04.tar.xz", "lib", "LLVM_INCLUDE_DIRS", "LLVM_LIBRARY_DIR")
) )

View File

@@ -232,7 +232,7 @@ ir::value *store(ir::value *ptr, ir::value *val, std::optional<ir::value *> _mas
----------------------------------------------*/
std::string dot_docstr = R"pbdoc(
Returns the matrix product of two blocks.
-The two blocks must be two dimensionals and have compatible inner dimensions.
+The two blocks must be two dimensions and have compatible inner dimensions.
:param input: The first block to be multiplied.
:type input: 2D block of scalar-type in {`float16`, `float32`}

View File

@@ -1126,7 +1126,7 @@ class CacheManager:
os.rename(filepath + ".tmp", filepath)
-# utilties for generating and compiling C wrappers
+# utilities for generating and compiling C wrappers
@functools.lru_cache()

View File

@@ -768,7 +768,7 @@ def dot(input, other, trans_a=False, trans_b=False, allow_tf32=True, _builder=No
""" """
Returns the matrix product of two blocks. Returns the matrix product of two blocks.
The two blocks must be two dimensionals and have compatible inner dimensions. The two blocks must be two dimensions and have compatible inner dimensions.
:param input: The first tensor to be multiplied. :param input: The first tensor to be multiplied.
:type input: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`} :type input: 2D tensor of scalar-type in {:code:`float16`, :code:`bfloat16`, :code:`float32`}