[GENERAL] Removed deprecated driver files and added basic compatibility with ROCm (#268)
- Removed the driver module -- the accelerator runtime is now handled by PyTorch.
- Added basic ROCm support based on @micmelesse's PR -- an empty kernel can now be executed on AMD devices without any compile-time changes.
- PREFER_SHARED is now used only for kernels whose shared-memory usage exceeds 49K; enabling it unconditionally can cause poor L1 performance for broadcast tensors.
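A minimal sketch of the shared-memory heuristic described above, assuming the standard CUDA driver API calls; the helper name and the exact way the 49K threshold is applied are illustrative, not taken from this diff:

```cpp
#include <cuda.h>
#include <cstddef>

// Hypothetical helper: only opt a kernel into a PREFER_SHARED cache split when its
// shared-memory footprint exceeds the default 48 KB (49152-byte) carve-out.
static void set_cache_config(CUfunction fn, std::size_t shared_mem_bytes) {
  constexpr std::size_t kThreshold = 49152;  // the "49k" mentioned in the commit message
  if (shared_mem_bytes > kThreshold) {
    // Large shared-memory kernels need the opt-in limit raised and L1 traded for shared memory.
    cuFuncSetAttribute(fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                       static_cast<int>(shared_mem_bytes));
    cuFuncSetCacheConfig(fn, CU_FUNC_CACHE_PREFER_SHARED);
  }
  // Otherwise keep the default cache split: forcing PREFER_SHARED unconditionally
  // can hurt L1 performance for broadcast tensors.
}
```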
@@ -31,7 +31,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17")
# LLVM
##########
if("${LLVM_LIBRARY_DIR}" STREQUAL "")
find_package(LLVM 11 REQUIRED COMPONENTS "nvptx")
find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu")
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14")
@@ -39,14 +39,52 @@ if("${LLVM_LIBRARY_DIR}" STREQUAL "")
|
||||
# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros
|
||||
else()
|
||||
set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}")
|
||||
set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a
|
||||
libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a
|
||||
libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a
|
||||
libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a
|
||||
libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMMCParser.a
|
||||
libLLVMBitReader.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMNVPTXDesc.a
|
||||
libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMBinaryFormat.a libLLVMNVPTXInfo.a
|
||||
libLLVMSupport.a libLLVMDemangle.a)
|
||||
set(LLVM_LIBRARIES
|
||||
libLLVMNVPTXCodeGen.a
|
||||
libLLVMNVPTXDesc.a
|
||||
libLLVMNVPTXInfo.a
|
||||
libLLVMAMDGPUDisassembler.a
|
||||
libLLVMMCDisassembler.a
|
||||
libLLVMAMDGPUCodeGen.a
|
||||
libLLVMMIRParser.a
|
||||
libLLVMGlobalISel.a
|
||||
libLLVMSelectionDAG.a
|
||||
libLLVMipo.a
|
||||
libLLVMInstrumentation.a
|
||||
libLLVMVectorize.a
|
||||
libLLVMLinker.a
|
||||
libLLVMIRReader.a
|
||||
libLLVMAsmParser.a
|
||||
libLLVMFrontendOpenMP.a
|
||||
libLLVMAsmPrinter.a
|
||||
libLLVMDebugInfoDWARF.a
|
||||
libLLVMCodeGen.a
|
||||
libLLVMTarget.a
|
||||
libLLVMScalarOpts.a
|
||||
libLLVMInstCombine.a
|
||||
libLLVMAggressiveInstCombine.a
|
||||
libLLVMTransformUtils.a
|
||||
libLLVMBitWriter.a
|
||||
libLLVMAnalysis.a
|
||||
libLLVMProfileData.a
|
||||
libLLVMObject.a
|
||||
libLLVMTextAPI.a
|
||||
libLLVMBitReader.a
|
||||
libLLVMAMDGPUAsmParser.a
|
||||
libLLVMMCParser.a
|
||||
libLLVMAMDGPUDesc.a
|
||||
libLLVMAMDGPUUtils.a
|
||||
libLLVMMC.a
|
||||
libLLVMDebugInfoCodeView.a
|
||||
libLLVMDebugInfoMSF.a
|
||||
libLLVMCore.a
|
||||
libLLVMRemarks.a
|
||||
libLLVMBitstreamReader.a
|
||||
libLLVMBinaryFormat.a
|
||||
libLLVMAMDGPUInfo.a
|
||||
libLLVMSupport.a
|
||||
libLLVMDemangle.a
|
||||
)
|
||||
endif()
|
||||
include_directories("${LLVM_INCLUDE_DIRS}")
|
||||
|
||||
|
@@ -4,8 +4,17 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace llvm{
|
||||
class Module;
|
||||
class LLVMContext;
|
||||
}
|
||||
|
||||
namespace triton{
|
||||
|
||||
namespace codegen {
|
||||
class target;
|
||||
}
|
||||
|
||||
namespace ir{
|
||||
class module;
|
||||
}
|
||||
@@ -21,8 +30,10 @@ namespace codegen{
|
||||
|
||||
// TODO:
|
||||
// There should be a proper pass manager there!
|
||||
void add_passes_to_emit_bin(ir::module &ir, driver::device* dev, int num_warps, int num_stages, bool force_nc_cache,
|
||||
driver::module*& mod, driver::kernel*& ker, size_t& shared_mem);
|
||||
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx,
|
||||
codegen::target* target,
|
||||
int sm, int num_warps,
|
||||
int num_stages, bool force_nc_cache, int &shared_static);
|
||||
|
||||
|
||||
}
|
||||
|
@@ -1,137 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_BACKEND_H_
|
||||
#define _TRITON_DRIVER_BACKEND_H_
|
||||
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class buffer;
|
||||
class stream;
|
||||
class device;
|
||||
class context;
|
||||
class platform;
|
||||
class module;
|
||||
class kernel;
|
||||
|
||||
struct backend
|
||||
{
|
||||
|
||||
// platforms
|
||||
class platforms
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init();
|
||||
|
||||
public:
|
||||
static void get(std::vector<driver::platform*> &results);
|
||||
|
||||
private:
|
||||
static std::vector<driver::platform*> cache_;
|
||||
};
|
||||
|
||||
// devices
|
||||
class devices
|
||||
{
|
||||
friend class backend;
|
||||
|
||||
private:
|
||||
static void init(const std::vector<platform *> &platforms);
|
||||
|
||||
public:
|
||||
static void get(std::vector<driver::device*>& devs);
|
||||
|
||||
private:
|
||||
static std::vector<driver::device*> cache_;
|
||||
};
|
||||
|
||||
// modules
|
||||
class modules
|
||||
{
|
||||
friend class backend;
|
||||
|
||||
public:
|
||||
static void release();
|
||||
|
||||
private:
|
||||
static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
|
||||
};
|
||||
|
||||
// kernels
|
||||
class kernels
|
||||
{
|
||||
friend class backend;
|
||||
public:
|
||||
static void release();
|
||||
static driver::kernel* get(driver::module* mod, const std::string & name);
|
||||
private:
|
||||
static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
|
||||
};
|
||||
|
||||
// contexts
|
||||
class contexts
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init(const std::vector<device *> &);
|
||||
static void release();
|
||||
public:
|
||||
static driver::context* get_default();
|
||||
|
||||
static driver::context* import(CUcontext ctx)
|
||||
{
|
||||
for(driver::context* x: cache_){
|
||||
driver::cu_context* cu_x = (driver::cu_context*)x;
|
||||
if(*cu_x->cu()==ctx)
|
||||
return x;
|
||||
}
|
||||
cache_.emplace_back(new driver::cu_context(ctx, false));
|
||||
return cache_.back();
|
||||
}
|
||||
|
||||
static void get(std::list<driver::context*> &);
|
||||
|
||||
private:
|
||||
static std::list<driver::context*> cache_;
|
||||
};
|
||||
|
||||
// streams
|
||||
class streams
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init(std::list<context*> const &);
|
||||
static void release();
|
||||
public:
|
||||
static void get(driver::context*, std::vector<driver::stream *> &streams);
|
||||
static driver::stream* get(driver::context*, unsigned int id = 0);
|
||||
static driver::stream* get_default();
|
||||
private:
|
||||
static std::map<driver::context*, std::vector<driver::stream*> > cache_;
|
||||
};
|
||||
|
||||
static void init();
|
||||
static void release();
|
||||
static void synchronize(triton::driver::context *);
|
||||
|
||||
static unsigned int default_device;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,48 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_BUFFER_H_
|
||||
#define _TRITON_DRIVER_BUFFER_H_
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class stream;
|
||||
|
||||
// Base
|
||||
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
|
||||
public:
|
||||
buffer(size_t size, CUdeviceptr cl, bool take_ownership);
|
||||
buffer(size_t size, host_buffer_t hst, bool take_ownership);
|
||||
uintptr_t addr_as_uintptr_t();
|
||||
static buffer* create(driver::context* ctx, size_t size);
|
||||
size_t size();
|
||||
|
||||
protected:
|
||||
size_t size_;
|
||||
};
|
||||
|
||||
// CPU
|
||||
class host_buffer: public buffer
|
||||
{
|
||||
public:
|
||||
host_buffer(size_t size);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_buffer: public buffer
|
||||
{
|
||||
public:
|
||||
cu_buffer(size_t size);
|
||||
cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
|
||||
void set_zero(triton::driver::stream *queue, size_t size);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,50 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_CONTEXT_H_
|
||||
#define _TRITON_DRIVER_CONTEXT_H_
|
||||
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class context: public polymorphic_resource<CUcontext, host_context_t>{
|
||||
protected:
|
||||
static std::string get_cache_path();
|
||||
|
||||
public:
|
||||
context(driver::device *dev, CUcontext cu, bool take_ownership);
|
||||
context(driver::device *dev, host_context_t hst, bool take_ownership);
|
||||
driver::device* device() const;
|
||||
std::string const & cache_path() const;
|
||||
// factory methods
|
||||
static context* create(driver::device *dev);
|
||||
|
||||
protected:
|
||||
driver::device* dev_;
|
||||
std::string cache_path_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_context: public context {
|
||||
public:
|
||||
host_context(driver::device* dev);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_context: public context {
|
||||
private:
|
||||
static CUdevice get_device_of(CUcontext);
|
||||
public:
|
||||
//Constructors
|
||||
cu_context(CUcontext cu, bool take_ownership = true);
|
||||
cu_context(driver::device* dev);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,82 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_DEVICE_H_
|
||||
#define _TRITON_DRIVER_DEVICE_H_
|
||||
|
||||
#include "triton/driver/platform.h"
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace codegen
|
||||
{
|
||||
class target;
|
||||
}
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class context;
|
||||
|
||||
// Base device
|
||||
class device: public polymorphic_resource<CUdevice, host_device_t>{
|
||||
public:
|
||||
using polymorphic_resource::polymorphic_resource;
|
||||
virtual size_t max_threads_per_block() const = 0;
|
||||
virtual size_t max_shared_memory() const = 0;
|
||||
virtual std::unique_ptr<codegen::target> make_target() const = 0;
|
||||
};
|
||||
|
||||
// Host device
|
||||
class host_device: public device {
|
||||
public:
|
||||
host_device(): device(host_device_t(), true){ }
|
||||
size_t max_threads_per_block() const { return 1; }
|
||||
size_t max_shared_memory() const { return 0; }
|
||||
std::unique_ptr<codegen::target> make_target() const;
|
||||
};
|
||||
|
||||
// CUDA device
|
||||
class cu_device: public device {
|
||||
private:
|
||||
//Metaprogramming helper to get cuda info from attribute
|
||||
template<CUdevice_attribute attr>
|
||||
int cuGetInfo() const;
|
||||
|
||||
inline nvmlDevice_t nvml_device() const;
|
||||
|
||||
public:
|
||||
cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
|
||||
// Informations
|
||||
std::string infos() const;
|
||||
size_t address_bits() const;
|
||||
std::vector<size_t> max_block_dim() const;
|
||||
size_t warp_size() const;
|
||||
// Compute Capability
|
||||
void interpret_as(int cc);
|
||||
int compute_capability() const;
|
||||
// Identifier
|
||||
std::string name() const;
|
||||
std::string pci_bus_id() const;
|
||||
// Clocks
|
||||
size_t current_sm_clock() const;
|
||||
size_t current_mem_clock() const;
|
||||
size_t max_threads_per_block() const;
|
||||
size_t max_shared_memory() const;
|
||||
size_t max_sm_clock() const;
|
||||
size_t max_mem_clock() const;
|
||||
void set_max_clock();
|
||||
void enable_peer_access(CUdeviceptr peer_mem_ptr) const;
|
||||
// Target
|
||||
std::unique_ptr<codegen::target> make_target() const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<int> interpreted_as_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -10,6 +10,10 @@
|
||||
#include "triton/external/CUDA/cuda.h"
|
||||
#include "triton/external/CUDA/nvml.h"
|
||||
|
||||
//// HIP backend
|
||||
//#define __HIP_PLATFORM_AMD__
|
||||
#include "triton/external/hip.h"
|
||||
|
||||
//Exceptions
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
@@ -28,6 +32,7 @@ class cu_context;
|
||||
|
||||
template<class T> void check(T){}
|
||||
void check(CUresult err);
|
||||
void check(hipError_t err);
|
||||
|
||||
class dispatch
|
||||
{
|
||||
@@ -58,17 +63,18 @@ protected:
|
||||
}
|
||||
|
||||
public:
|
||||
static void release();
|
||||
// Nvidia
|
||||
static bool nvmlinit();
|
||||
static bool cuinit();
|
||||
static void release();
|
||||
// AMD
|
||||
static bool hipinit();
|
||||
|
||||
/* ------------------- *
|
||||
* CUDA
|
||||
* ------------------- */
|
||||
// context management
|
||||
static CUresult cuInit(unsigned int Flags);
|
||||
static CUresult cuCtxGetCurrent(CUcontext *pctx);
|
||||
static CUresult cuCtxSetCurrent(CUcontext ctx);
|
||||
static CUresult cuCtxDestroy_v2(CUcontext ctx);
|
||||
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
|
||||
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
|
||||
@@ -128,6 +134,55 @@ public:
|
||||
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
|
||||
static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
// context management
|
||||
static hipError_t hipInit(unsigned int Flags);
|
||||
static hipError_t hipCtxDestroy(hipCtx_t ctx);
|
||||
static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
|
||||
static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
|
||||
static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
|
||||
static hipError_t hipCtxGetDevice(hipDevice_t* result);
|
||||
static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
|
||||
static hipError_t hipDriverGetVersion(int *driverVersion);
|
||||
// device management
|
||||
static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
|
||||
static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
|
||||
static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
|
||||
static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
|
||||
static hipError_t hipGetDeviceCount(int *count);
|
||||
// module management
|
||||
static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
|
||||
static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
|
||||
static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
|
||||
static hipError_t hipModuleUnload(hipModule_t hmod);
|
||||
static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
|
||||
static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
|
||||
// stream management
|
||||
static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
|
||||
static hipError_t hipStreamSynchronize(hipStream_t hStream);
|
||||
static hipError_t hipStreamDestroy(hipStream_t hStream);
|
||||
static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
|
||||
// function management
|
||||
static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
|
||||
static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
|
||||
static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
|
||||
// memory management
|
||||
static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
|
||||
static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
|
||||
static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
|
||||
static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
|
||||
static hipError_t hipFree(hipDeviceptr_t dptr);
|
||||
static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
|
||||
static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
|
||||
static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
|
||||
// event management
|
||||
static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
|
||||
static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
|
||||
static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
|
||||
static hipError_t hipEventDestroy(hipEvent_t hEvent);
|
||||
|
||||
|
||||
|
||||
private:
|
||||
@@ -135,6 +190,7 @@ private:
|
||||
// Libraries
|
||||
static void* cuda_;
|
||||
static void* nvml_;
|
||||
static void* hip_;
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
@@ -194,9 +250,6 @@ private:
|
||||
static void* cuEventRecord_;
|
||||
static void* cuEventDestroy_v2_;
|
||||
|
||||
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* NVML
|
||||
* ------------------- */
|
||||
@@ -205,6 +258,55 @@ private:
|
||||
static void* nvmlDeviceGetClockInfo_;
|
||||
static void* nvmlDeviceGetMaxClockInfo_;
|
||||
static void* nvmlDeviceSetApplicationsClocks_;
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
// context management
|
||||
static void* hipInit_;
|
||||
static void* hipCtxDestroy_;
|
||||
static void* hipCtxCreate_;
|
||||
static void* hipCtxPushCurrent_;
|
||||
static void* hipCtxPopCurrent_;
|
||||
static void* hipCtxGetDevice_;
|
||||
static void* hipCtxEnablePeerAccess_;
|
||||
static void* hipDriverGetVersion_;
|
||||
// device management
|
||||
static void* hipGetDevice_;
|
||||
static void* hipDeviceGetName_;
|
||||
static void* hipDeviceGetPCIBusId_;
|
||||
static void* hipDeviceGetAttribute_;
|
||||
static void* hipGetDeviceCount_;
|
||||
// module management
|
||||
static void* hipModuleGetGlobal_;
|
||||
static void* hipModuleLoad_;
|
||||
static void* hipModuleLoadData_;
|
||||
static void* hipModuleUnload_;
|
||||
static void* hipModuleLoadDataEx_;
|
||||
static void* hipModuleGetFunction_;
|
||||
// stream management
|
||||
static void* hipStreamCreate_;
|
||||
static void* hipStreamSynchronize_;
|
||||
static void* hipStreamDestroy_;
|
||||
static void* hipModuleLaunchKernel_;
|
||||
// function management
|
||||
static void* hipFuncGetAttributes_;
|
||||
static void* hipFuncSetAttribute_;
|
||||
static void* hipFuncSetCacheConfig_;
|
||||
// memory management
|
||||
static void* hipMalloc_;
|
||||
static void* hipPointerGetAttribute_;
|
||||
static void* hipMemsetD8Async_;
|
||||
static void* hipMemcpyDtoH_;
|
||||
static void* hipFree_;
|
||||
static void* hipMemcpyDtoHAsync_;
|
||||
static void* hipMemcpyHtoDAsync_;
|
||||
static void* hipMemcpyHtoD_;
|
||||
// event management
|
||||
static void* hipEventCreate_;
|
||||
static void* hipEventElapsedTime_;
|
||||
static void* hipEventRecord_;
|
||||
static void* hipEventDestroy_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@@ -141,6 +141,78 @@ namespace triton
|
||||
TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
namespace hip
|
||||
{
|
||||
class base: public std::exception{};
|
||||
|
||||
#define TRITON_CREATE_HIP_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "HIP: Error- " msg; } }
|
||||
|
||||
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_value ,"invalid value");
|
||||
TRITON_CREATE_HIP_EXCEPTION(out_of_memory ,"out of memory");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_initialized ,"not initialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(deinitialized ,"deinitialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_disabled ,"profiler disabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_already_started ,"profiler already started");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(no_device ,"no device");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_device ,"invalid device");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_image ,"invalid image");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_context ,"invalid context");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_already_current ,"context already current");
|
||||
TRITON_CREATE_HIP_EXCEPTION(map_failed ,"map failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unmap_failed ,"unmap failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(array_is_mapped ,"array is mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(already_mapped ,"already mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
|
||||
TRITON_CREATE_HIP_EXCEPTION(already_acquired ,"already acquired");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped ,"not mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array ,"not mapped as array");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
|
||||
TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unsupported_limit ,"unsupported limit");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_already_in_use ,"context already in use");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_ptx ,"invalid ptx");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_source ,"invalid source");
|
||||
TRITON_CREATE_HIP_EXCEPTION(file_not_found ,"file not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed ,"shared object init failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(operating_system ,"operating system");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_handle ,"invalid handle");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_found ,"not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_ready ,"not ready");
|
||||
TRITON_CREATE_HIP_EXCEPTION(illegal_address ,"illegal address");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources ,"launch out of resources");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_timeout ,"launch timeout");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(primary_context_active ,"primary context active");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed ,"context is destroyed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(assert_error ,"assert");
|
||||
TRITON_CREATE_HIP_EXCEPTION(too_many_peers ,"too many peers");
|
||||
TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered ,"host memory already registered");
|
||||
TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered       ,"host memory already registered");
TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered           ,"host memory not registered");
|
||||
TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error ,"hardware stack error");
|
||||
TRITON_CREATE_HIP_EXCEPTION(illegal_instruction ,"illegal instruction");
|
||||
TRITON_CREATE_HIP_EXCEPTION(misaligned_address ,"misaligned address");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_address_space ,"invalid address space");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_pc ,"invalid pc");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_failed ,"launch failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_permitted ,"not permitted");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_supported ,"not supported");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_symbol ,"invalid symbol");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unknown ,"unknown");
|
||||
|
||||
#undef TRITON_CREATE_HIP_EXCEPTION
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,146 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_HANDLE_H_
|
||||
#define _TRITON_DRIVER_HANDLE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <type_traits>
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "llvm/ExecutionEngine/JITSymbol.h"
|
||||
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
|
||||
#include "llvm/ExecutionEngine/Orc/Core.h"
|
||||
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
|
||||
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
|
||||
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
|
||||
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "triton/tools/thread_pool.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class ExecutionEngine;
|
||||
class Function;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
enum backend_t {
|
||||
CUDA,
|
||||
Host
|
||||
};
|
||||
|
||||
// Host handles
|
||||
struct host_platform_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_device_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_context_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_stream_t{
|
||||
std::shared_ptr<ThreadPool> pool;
|
||||
std::shared_ptr<std::vector<std::future<void>>> futures;
|
||||
std::vector<std::shared_ptr<char*>> args;
|
||||
};
|
||||
|
||||
struct host_module_t{
|
||||
std::string error;
|
||||
llvm::ExecutionEngine* engine;
|
||||
std::map<std::string, llvm::Function*> functions;
|
||||
void(*fn)(char**, int32_t, int32_t, int32_t);
|
||||
llvm::orc::ExecutionSession* ES;
|
||||
llvm::orc::RTDyldObjectLinkingLayer* ObjectLayer;
|
||||
llvm::orc::IRCompileLayer* CompileLayer;
|
||||
llvm::DataLayout* DL;
|
||||
llvm::orc::MangleAndInterner* Mangle;
|
||||
llvm::orc::ThreadSafeContext* Ctx;
|
||||
llvm::orc::JITDylib *MainJD;
|
||||
};
|
||||
|
||||
struct host_function_t{
|
||||
llvm::Function* fn;
|
||||
};
|
||||
|
||||
struct host_buffer_t{
|
||||
char* data;
|
||||
};
|
||||
|
||||
|
||||
// Extra CUDA handles
|
||||
struct cu_event_t{
|
||||
operator bool() const { return first && second; }
|
||||
CUevent first;
|
||||
CUevent second;
|
||||
};
|
||||
|
||||
struct CUPlatform{
|
||||
CUPlatform() : status_(dispatch::cuInit(0)) { }
|
||||
operator bool() const { return status_; }
|
||||
private:
|
||||
CUresult status_;
|
||||
};
|
||||
|
||||
template<class T, class CUType>
|
||||
class handle_interface{
|
||||
public:
|
||||
//Accessors
|
||||
operator CUType() const { return *(((T*)this)->cu().h_); }
|
||||
//Comparison
|
||||
bool operator==(handle_interface const & y) { return (CUType)(*this) == (CUType)(y); }
|
||||
bool operator!=(handle_interface const & y) { return (CUType)(*this) != (CUType)(y); }
|
||||
bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); }
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class handle{
|
||||
public:
|
||||
template<class, class> friend class handle_interface;
|
||||
public:
|
||||
//Constructors
|
||||
handle(T h, bool take_ownership = true);
|
||||
handle();
|
||||
~handle();
|
||||
T& operator*() { return *h_; }
|
||||
T const & operator*() const { return *h_; }
|
||||
T* operator->() const { return h_.get(); }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<T> h_;
|
||||
bool has_ownership_;
|
||||
};
|
||||
|
||||
template<class CUType, class HostType>
|
||||
class polymorphic_resource {
|
||||
public:
|
||||
polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){}
|
||||
polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){}
|
||||
virtual ~polymorphic_resource() { }
|
||||
|
||||
handle<CUType> cu() { return cu_; }
|
||||
handle<HostType> hst() { return hst_; }
|
||||
const handle<CUType>& cu() const { return cu_; }
|
||||
const handle<HostType>& hst() const { return hst_; }
|
||||
backend_t backend() { return backend_; }
|
||||
|
||||
protected:
|
||||
handle<CUType> cu_;
|
||||
handle<HostType> hst_;
|
||||
backend_t backend_;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,53 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_KERNEL_H_
|
||||
#define _TRITON_DRIVER_KERNEL_H_
|
||||
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/handle.h"
|
||||
#include <memory>
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class GenericValue;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class cu_buffer;
|
||||
|
||||
// Base
|
||||
class kernel: public polymorphic_resource<CUfunction, host_function_t> {
|
||||
public:
|
||||
kernel(driver::module* program, CUfunction fn, bool has_ownership);
|
||||
kernel(driver::module* program, host_function_t fn, bool has_ownership);
|
||||
driver::module* module();
|
||||
static kernel* create(driver::module* program, const char* name);
|
||||
private:
|
||||
driver::module* program_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
host_kernel(driver::module* program, const char* name);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
cu_kernel(driver::module* program, const char * name);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
18
include/triton/driver/llvm.h
Normal file
@@ -0,0 +1,18 @@
#include <string>
#include "triton/driver/dispatch.h"

namespace llvm{
class Module;
}

namespace triton{
namespace driver{

void init_llvm();
std::string llir_to_ptx(llvm::Module* module, int cc, int version);
CUmodule ptx_to_cumodule(const std::string& ptx, int cc);
std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);
hipModule_t amdgpu_to_hipmodule(const std::string& path);

}
}
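As a rough illustration of how these new entry points fit together, a sketch under assumptions: the compute capability, the PTX version argument, and the "gfx908" target are placeholders, and chaining llir_to_amdgpu into amdgpu_to_hipmodule assumes the former returns the path the latter expects.

```cpp
#include <string>
#include "triton/driver/llvm.h"

// Hypothetical driver: lower an LLVM-IR module to PTX for NVIDIA devices or to an
// AMDGPU binary for ROCm, then load the result as a GPU module.
void lower_and_load(llvm::Module* mod, bool use_hip) {
  triton::driver::init_llvm();
  if (use_hip) {
    // Assumption: llir_to_amdgpu writes a code object and returns its path.
    std::string hsaco_path = triton::driver::llir_to_amdgpu(mod, "gfx908");
    hipModule_t hip_mod = triton::driver::amdgpu_to_hipmodule(hsaco_path);
    (void)hip_mod;
  } else {
    std::string ptx = triton::driver::llir_to_ptx(mod, /*cc=*/80, /*version=*/11000);
    CUmodule cu_mod = triton::driver::ptx_to_cumodule(ptx, /*cc=*/80);
    (void)cu_mod;
  }
}
```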
|
@@ -1,84 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_MODULE_H_
|
||||
#define _TRITON_DRIVER_MODULE_H_
|
||||
|
||||
#include <map>
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
template<class T>
|
||||
class SmallVectorImpl;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class cu_context;
|
||||
class cu_device;
|
||||
|
||||
// Base
|
||||
class module: public polymorphic_resource<CUmodule, host_module_t> {
|
||||
protected:
|
||||
void init_llvm();
|
||||
|
||||
enum file_type_t{
|
||||
Object,
|
||||
Assembly
|
||||
};
|
||||
|
||||
public:
|
||||
module(CUmodule mod, bool has_ownership);
|
||||
module(host_module_t mod, bool has_ownership);
|
||||
static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);
|
||||
void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
||||
const std::string &proc, std::string layout,
|
||||
llvm::SmallVectorImpl<char> &buffer,
|
||||
const std::string &features,
|
||||
file_type_t file_type);
|
||||
virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
|
||||
int spilled() const { return spilled_; }
|
||||
|
||||
protected:
|
||||
int spilled_;
|
||||
};
|
||||
|
||||
// CPU
|
||||
class host_module: public module{
|
||||
public:
|
||||
host_module(std::unique_ptr<llvm::Module> module);
|
||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_module: public module {
|
||||
std::string compile_llvm_module(llvm::Module* module, driver::device* device);
|
||||
void init_from_ptx(const std::string& ptx, cu_device *device);
|
||||
|
||||
public:
|
||||
cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);
|
||||
cu_module(driver::device* device, const std::string& source);
|
||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||
std::string llir() const { return llir_; }
|
||||
const std::string& ptx() const { return ptx_; }
|
||||
const std::string& cubin() const { return cubin_; }
|
||||
|
||||
private:
|
||||
std::string ptx_;
|
||||
std::string cubin_;
|
||||
std::string llir_;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,58 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_PLATFORM_H_
|
||||
#define _TRITON_DRIVER_PLATFORM_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class device;
|
||||
|
||||
class platform
|
||||
{
|
||||
public:
|
||||
// Constructor
|
||||
platform(const std::string& name): name_(name){ }
|
||||
// Accessors
|
||||
std::string name() const { return name_; }
|
||||
// Virtual methods
|
||||
virtual std::string version() const = 0;
|
||||
virtual void devices(std::vector<driver::device *> &devices) const = 0;
|
||||
private:
|
||||
std::string name_;
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_platform: public platform
|
||||
{
|
||||
public:
|
||||
cu_platform(): platform("CUDA") { }
|
||||
std::string version() const;
|
||||
void devices(std::vector<driver::device*> &devices) const;
|
||||
|
||||
private:
|
||||
handle<CUPlatform> cu_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_platform: public platform
|
||||
{
|
||||
public:
|
||||
host_platform(): platform("CPU") { }
|
||||
std::string version() const;
|
||||
void devices(std::vector<driver::device*> &devices) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,68 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_STREAM_H_
|
||||
#define _TRITON_DRIVER_STREAM_H_
|
||||
|
||||
#include <map>
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class kernel;
|
||||
class event;
|
||||
class Range;
|
||||
class cu_buffer;
|
||||
|
||||
// Base
|
||||
class stream: public polymorphic_resource<CUstream, host_stream_t> {
|
||||
public:
|
||||
stream(CUstream, bool has_ownership);
|
||||
stream(host_stream_t, bool has_ownership);
|
||||
// factory
|
||||
static driver::stream* create(backend_t backend);
|
||||
// methods
|
||||
virtual void synchronize() = 0;
|
||||
virtual void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem = 0) = 0;
|
||||
virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0;
|
||||
virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0;
|
||||
// template helpers
|
||||
template<class T> void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T> const & x)
|
||||
{ write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||
template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
|
||||
{ read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_stream: public stream {
|
||||
public:
|
||||
host_stream();
|
||||
void synchronize();
|
||||
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
|
||||
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
|
||||
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_stream: public stream {
|
||||
public:
|
||||
cu_stream(CUstream str, bool take_ownership);
|
||||
cu_stream();
|
||||
void synchronize();
|
||||
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
|
||||
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
|
||||
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
1468
include/triton/external/CL/cl.h
vendored
File diff suppressed because it is too large
12947
include/triton/external/CL/cl.hpp
vendored
File diff suppressed because it is too large
9677
include/triton/external/CL/cl2.hpp
vendored
File diff suppressed because it is too large
131
include/triton/external/CL/cl_d3d10.h
vendored
@@ -1,131 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_D3D10_H
|
||||
#define __OPENCL_CL_D3D10_H
|
||||
|
||||
#include <d3d10.h>
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************
|
||||
* cl_khr_d3d10_sharing */
|
||||
#define cl_khr_d3d10_sharing 1
|
||||
|
||||
typedef cl_uint cl_d3d10_device_source_khr;
|
||||
typedef cl_uint cl_d3d10_device_set_khr;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_D3D10_DEVICE_KHR -1002
|
||||
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
|
||||
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
|
||||
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
|
||||
|
||||
/* cl_d3d10_device_source_nv */
|
||||
#define CL_D3D10_DEVICE_KHR 0x4010
|
||||
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
|
||||
|
||||
/* cl_d3d10_device_set_nv */
|
||||
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
|
||||
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
|
||||
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
|
||||
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_d3d10_device_source_khr d3d_device_source,
|
||||
void * d3d_object,
|
||||
cl_d3d10_device_set_khr d3d_device_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Buffer * resource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Texture2D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Texture3D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_D3D10_H */
|
||||
|
131
include/triton/external/CL/cl_d3d11.h
vendored
@@ -1,131 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_D3D11_H
|
||||
#define __OPENCL_CL_D3D11_H
|
||||
|
||||
#include <d3d11.h>
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************
|
||||
* cl_khr_d3d11_sharing */
|
||||
#define cl_khr_d3d11_sharing 1
|
||||
|
||||
typedef cl_uint cl_d3d11_device_source_khr;
|
||||
typedef cl_uint cl_d3d11_device_set_khr;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_D3D11_DEVICE_KHR -1006
|
||||
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
|
||||
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
|
||||
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
|
||||
|
||||
/* cl_d3d11_device_source */
|
||||
#define CL_D3D11_DEVICE_KHR 0x4019
|
||||
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
|
||||
|
||||
/* cl_d3d11_device_set */
|
||||
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
|
||||
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
|
||||
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
|
||||
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_d3d11_device_source_khr d3d_device_source,
|
||||
void * d3d_object,
|
||||
cl_d3d11_device_set_khr d3d_device_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Buffer * resource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Texture2D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Texture3D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_D3D11_H */
|
||||
|
132
include/triton/external/CL/cl_dx9_media_sharing.h
vendored
@@ -1,132 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
|
||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
/* cl_khr_dx9_media_sharing */
|
||||
#define cl_khr_dx9_media_sharing 1
|
||||
|
||||
typedef cl_uint cl_dx9_media_adapter_type_khr;
|
||||
typedef cl_uint cl_dx9_media_adapter_set_khr;
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <d3d9.h>
|
||||
typedef struct _cl_dx9_surface_info_khr
|
||||
{
|
||||
IDirect3DSurface9 *resource;
|
||||
HANDLE shared_handle;
|
||||
} cl_dx9_surface_info_khr;
|
||||
#endif
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
|
||||
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
|
||||
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
|
||||
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
|
||||
|
||||
/* cl_media_adapter_type_khr */
|
||||
#define CL_ADAPTER_D3D9_KHR 0x2020
|
||||
#define CL_ADAPTER_D3D9EX_KHR 0x2021
|
||||
#define CL_ADAPTER_DXVA_KHR 0x2022
|
||||
|
||||
/* cl_media_adapter_set_khr */
|
||||
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
|
||||
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
|
||||
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
|
||||
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
|
||||
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
|
||||
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_uint num_media_adapters,
|
||||
cl_dx9_media_adapter_type_khr * media_adapter_type,
|
||||
void * media_adapters,
|
||||
cl_dx9_media_adapter_set_khr media_adapter_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
cl_dx9_media_adapter_type_khr adapter_type,
|
||||
void * surface_info,
|
||||
cl_uint plane,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
|
||||
|
@@ -1,182 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2016 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_dx9_media_sharing_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
|
||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <CL/cl_platform.h>
|
||||
#include <d3d9.h>
|
||||
#include <dxvahd.h>
|
||||
#include <wtypes.h>
|
||||
#include <d3d9types.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/***************************************
|
||||
* cl_intel_dx9_media_sharing extension *
|
||||
****************************************/
|
||||
|
||||
#define cl_intel_dx9_media_sharing 1
|
||||
|
||||
typedef cl_uint cl_dx9_device_source_intel;
|
||||
typedef cl_uint cl_dx9_device_set_intel;
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_DX9_DEVICE_INTEL -1010
|
||||
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
|
||||
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
|
||||
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
|
||||
|
||||
/* cl_dx9_device_source_intel */
|
||||
#define CL_D3D9_DEVICE_INTEL 0x4022
|
||||
#define CL_D3D9EX_DEVICE_INTEL 0x4070
|
||||
#define CL_DXVA_DEVICE_INTEL 0x4071
|
||||
|
||||
/* cl_dx9_device_set_intel */
|
||||
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
|
||||
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
|
||||
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
|
||||
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
|
||||
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
|
||||
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
|
||||
/******************************************************************************/
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceIDsFromDX9INTEL(
|
||||
cl_platform_id /* platform */,
|
||||
cl_dx9_device_source_intel /* dx9_device_source */,
|
||||
void* /* dx9_object */,
|
||||
cl_dx9_device_set_intel /* dx9_device_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
|
||||
cl_platform_id /* platform */,
|
||||
cl_dx9_device_source_intel /* dx9_device_source */,
|
||||
void* /* dx9_object */,
|
||||
cl_dx9_device_set_intel /* dx9_device_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromDX9MediaSurfaceINTEL(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
IDirect3DSurface9* /* resource */,
|
||||
HANDLE /* sharedHandle */,
|
||||
UINT /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
IDirect3DSurface9* /* resource */,
|
||||
HANDLE /* sharedHandle */,
|
||||
UINT /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireDX9ObjectsINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseDX9ObjectsINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
include/triton/external/CL/cl_egl.h (136 lines, vendored)
@@ -1,136 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_EGL_H
|
||||
#define __OPENCL_CL_EGL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
|
||||
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
|
||||
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
|
||||
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
|
||||
|
||||
/* Error type for clCreateFromEGLImageKHR */
|
||||
#define CL_INVALID_EGL_OBJECT_KHR -1093
|
||||
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
|
||||
|
||||
/* CLeglImageKHR is an opaque handle to an EGLImage */
|
||||
typedef void* CLeglImageKHR;
|
||||
|
||||
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
|
||||
typedef void* CLeglDisplayKHR;
|
||||
|
||||
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
|
||||
typedef void* CLeglSyncKHR;
|
||||
|
||||
/* properties passed to clCreateFromEGLImageKHR */
|
||||
typedef intptr_t cl_egl_image_properties_khr;
|
||||
|
||||
|
||||
#define cl_khr_egl_image 1
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromEGLImageKHR(cl_context /* context */,
|
||||
CLeglDisplayKHR /* egldisplay */,
|
||||
CLeglImageKHR /* eglimage */,
|
||||
cl_mem_flags /* flags */,
|
||||
const cl_egl_image_properties_khr * /* properties */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
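
A minimal sketch of wrapping an existing EGLImage as a cl_mem with the function declared above; ctx, egl_display and egl_image are assumed, already-created handles.

cl_int err = CL_SUCCESS;
cl_mem cl_img = clCreateFromEGLImageKHR(ctx,
                                        (CLeglDisplayKHR)egl_display,
                                        (CLeglImageKHR)egl_image,
                                        CL_MEM_READ_ONLY,
                                        NULL,      /* no extra properties */
                                        &err);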
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
|
||||
cl_context context,
|
||||
CLeglDisplayKHR egldisplay,
|
||||
CLeglImageKHR eglimage,
|
||||
cl_mem_flags flags,
|
||||
const cl_egl_image_properties_khr * properties,
|
||||
cl_int * errcode_ret);
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
|
||||
#define cl_khr_egl_event 1
|
||||
|
||||
extern CL_API_ENTRY cl_event CL_API_CALL
|
||||
clCreateEventFromEGLSyncKHR(cl_context /* context */,
|
||||
CLeglSyncKHR /* sync */,
|
||||
CLeglDisplayKHR /* display */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
|
||||
cl_context context,
|
||||
CLeglSyncKHR sync,
|
||||
CLeglDisplayKHR display,
|
||||
cl_int * errcode_ret);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_EGL_H */
include/triton/external/CL/cl_ext.h (670 lines, vendored)
@@ -1,670 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
|
||||
|
||||
/* cl_ext.h contains OpenCL extensions which don't have external */
|
||||
/* (OpenGL, D3D) dependencies. */
|
||||
|
||||
#ifndef __CL_EXT_H
|
||||
#define __CL_EXT_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#include <AvailabilityMacros.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
/* cl_khr_fp64 extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
|
||||
|
||||
/* cl_khr_fp16 extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
|
||||
|
||||
/* Memory object destruction
|
||||
*
|
||||
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
|
||||
*
|
||||
* Registers a user callback function that will be called when the memory object is deleted and its resources
|
||||
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
|
||||
* stack associated with memobj. The registered user callback functions are called in the reverse order in
|
||||
* which they were registered. The user callback functions are called and then the memory object is deleted
|
||||
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
|
||||
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
|
||||
* the storage bits for the memory object, can be reused or freed.
|
||||
*
|
||||
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
|
||||
*
|
||||
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
|
||||
* before using.
|
||||
*/
|
||||
#define cl_APPLE_SetMemObjectDestructor 1
|
||||
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
|
||||
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
|
||||
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
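
A minimal sketch of the intended usage described above; ctx and size are assumed, and the callback simply frees the host allocation once the cl_mem is destroyed.

static void my_mem_destructor(cl_mem memobj, void *user_data)
{
    free(user_data);                     /* user_data carries the original host_ptr */
}

void  *host_ptr = malloc(size);
cl_int err      = CL_SUCCESS;
cl_mem buf      = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, size, host_ptr, &err);
clSetMemObjectDestructorAPPLE(buf, my_mem_destructor, host_ptr);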
|
||||
|
||||
|
||||
/* Context Logging Functions
|
||||
*
|
||||
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
|
||||
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
|
||||
* before using.
|
||||
*
|
||||
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
|
||||
*/
|
||||
#define cl_APPLE_ContextLoggingFunctions 1
|
||||
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
|
||||
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
|
||||
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
|
||||
/************************
|
||||
* cl_khr_icd extension *
|
||||
************************/
|
||||
#define cl_khr_icd 1
|
||||
|
||||
/* cl_platform_info */
|
||||
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
|
||||
|
||||
/* Additional Error Codes */
|
||||
#define CL_PLATFORM_NOT_FOUND_KHR -1001
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
|
||||
cl_platform_id * /* platforms */,
|
||||
cl_uint * /* num_platforms */);
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
|
||||
cl_uint /* num_entries */,
|
||||
cl_platform_id * /* platforms */,
|
||||
cl_uint * /* num_platforms */);
|
||||
|
||||
|
||||
/* Extension: cl_khr_image2D_buffer
|
||||
*
|
||||
* This extension allows a 2D image to be created from a cl_mem buffer without a copy.
|
||||
* The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
|
||||
* Both the sampler and sampler-less read_image built-in functions are supported for 2D images
|
||||
* and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
|
||||
* for 2D images created from a buffer.
|
||||
*
|
||||
* When the 2D image from buffer is created, the client must specify the width,
|
||||
* height, image format (i.e. channel order and channel data type) and optionally the row pitch
|
||||
*
|
||||
* The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
|
||||
* The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
|
||||
*/
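
A minimal sketch of the image-from-buffer path described above, using the core OpenCL 1.2 cl_image_desc whose buffer field enables it; ctx, buf, width, height and row_pitch are assumed.

cl_image_format fmt  = { CL_RGBA, CL_UNORM_INT8 };
cl_image_desc   desc = { 0 };
desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
desc.image_width     = width;
desc.image_height    = height;
desc.image_row_pitch = row_pitch;   /* multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels */
desc.buffer          = buf;         /* the existing cl_mem buffer */

cl_int err;
cl_mem img = clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);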
|
||||
|
||||
/*************************************
|
||||
* cl_khr_initialize_memory extension *
|
||||
*************************************/
|
||||
|
||||
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
|
||||
|
||||
|
||||
/**************************************
|
||||
* cl_khr_terminate_context extension *
|
||||
**************************************/
|
||||
|
||||
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
|
||||
#define CL_CONTEXT_TERMINATE_KHR 0x2032
|
||||
|
||||
#define cl_khr_terminate_context 1
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
|
||||
/*
|
||||
* Extension: cl_khr_spir
|
||||
*
|
||||
* This extension adds support to create an OpenCL program object from a
|
||||
* Standard Portable Intermediate Representation (SPIR) instance
|
||||
*/
|
||||
|
||||
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
|
||||
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
|
||||
|
||||
|
||||
/*****************************************
|
||||
* cl_khr_create_command_queue extension *
|
||||
*****************************************/
|
||||
#define cl_khr_create_command_queue 1
|
||||
|
||||
typedef cl_bitfield cl_queue_properties_khr;
|
||||
|
||||
extern CL_API_ENTRY cl_command_queue CL_API_CALL
|
||||
clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
|
||||
cl_device_id /* device */,
|
||||
const cl_queue_properties_khr* /* properties */,
|
||||
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
typedef CL_API_ENTRY cl_command_queue
|
||||
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
|
||||
cl_device_id /* device */,
|
||||
const cl_queue_properties_khr* /* properties */,
|
||||
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
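
As with most entry points in this header, the KHR queue-creation call is usually resolved through its *_fn typedef at run time; a sketch, with platform, ctx and dev assumed handles and default queue properties.

clCreateCommandQueueWithPropertiesKHR_fn pCreateQueue =
    (clCreateCommandQueueWithPropertiesKHR_fn)
        clGetExtensionFunctionAddressForPlatform(platform,
                                                 "clCreateCommandQueueWithPropertiesKHR");
if (pCreateQueue) {
    cl_int err;
    cl_command_queue q = pCreateQueue(ctx, dev, NULL /* default properties */, &err);
}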
|
||||
|
||||
|
||||
/******************************************
|
||||
* cl_nv_device_attribute_query extension *
|
||||
******************************************/
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
|
||||
/*********************************
|
||||
* cl_amd_device_memory_flags *
|
||||
*********************************/
|
||||
#define cl_amd_device_memory_flags 1
|
||||
|
||||
#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6) // Alloc from GPU's CPU visible heap
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
|
||||
|
||||
/*********************************
|
||||
* cl_amd_device_attribute_query *
|
||||
*********************************/
|
||||
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
|
||||
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
|
||||
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
|
||||
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
|
||||
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
|
||||
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
|
||||
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
|
||||
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
|
||||
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
|
||||
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
|
||||
|
||||
typedef union
|
||||
{
|
||||
struct { cl_uint type; cl_uint data[5]; } raw;
|
||||
struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
|
||||
} cl_device_topology_amd;
|
||||
|
||||
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1
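
A sketch of how the union above is typically queried; dev is an assumed cl_device_id on an AMD platform.

cl_device_topology_amd topo;
if (clGetDeviceInfo(dev, CL_DEVICE_TOPOLOGY_AMD, sizeof(topo), &topo, NULL) == CL_SUCCESS &&
    topo.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
    printf("PCIe location %02x:%02x.%d\n",
           (unsigned)(cl_uchar)topo.pcie.bus,
           (unsigned)(cl_uchar)topo.pcie.device,
           (int)topo.pcie.function);
}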
|
||||
|
||||
|
||||
/**************************
|
||||
* cl_amd_offline_devices *
|
||||
**************************/
|
||||
#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F
|
||||
|
||||
/*********************************
|
||||
* cl_arm_printf extension
|
||||
*********************************/
|
||||
#define CL_PRINTF_CALLBACK_ARM 0x40B0
|
||||
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
|
||||
|
||||
#ifdef CL_VERSION_1_1
|
||||
/***********************************
|
||||
* cl_ext_device_fission extension *
|
||||
***********************************/
|
||||
#define cl_ext_device_fission 1
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef cl_ulong cl_device_partition_property_ext;
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
|
||||
const cl_device_partition_property_ext * /* properties */,
|
||||
cl_uint /*num_entries*/,
|
||||
cl_device_id * /*out_devices*/,
|
||||
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
|
||||
const cl_device_partition_property_ext * /* properties */,
|
||||
cl_uint /*num_entries*/,
|
||||
cl_device_id * /*out_devices*/,
|
||||
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
/* cl_device_partition_property_ext */
|
||||
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
|
||||
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
|
||||
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
|
||||
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
|
||||
|
||||
/* clDeviceGetInfo selectors */
|
||||
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
|
||||
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
|
||||
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
|
||||
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
|
||||
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
|
||||
|
||||
/* error codes */
|
||||
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
|
||||
#define CL_INVALID_PARTITION_COUNT_EXT -1058
|
||||
#define CL_INVALID_PARTITION_NAME_EXT -1059
|
||||
|
||||
/* CL_AFFINITY_DOMAINs */
|
||||
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
|
||||
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
|
||||
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
|
||||
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
|
||||
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
|
||||
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
|
||||
|
||||
/* cl_device_partition_property_ext list terminators */
|
||||
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
|
||||
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
|
||||
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
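
A sketch of equal partitioning with the EXT fission API above; parent_dev is an assumed cl_device_id, split into sub-devices of four compute units each.

cl_device_partition_property_ext props[] = {
    CL_DEVICE_PARTITION_EQUALLY_EXT, 4, CL_PROPERTIES_LIST_END_EXT };

cl_uint num_sub = 0;
clCreateSubDevicesEXT(parent_dev, props, 0, NULL, &num_sub);        /* query count */

cl_device_id *sub = (cl_device_id *)malloc(num_sub * sizeof(cl_device_id));
clCreateSubDevicesEXT(parent_dev, props, num_sub, sub, NULL);       /* create them */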
|
||||
|
||||
/* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
|
||||
* no extension #define since they have no functions
|
||||
*/
|
||||
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
|
||||
|
||||
/*********************************
|
||||
* cl_qcom_ext_host_ptr extension
|
||||
*********************************/
|
||||
|
||||
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
|
||||
|
||||
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
|
||||
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
|
||||
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
|
||||
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
|
||||
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
|
||||
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
|
||||
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
|
||||
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
|
||||
|
||||
typedef cl_uint cl_image_pitch_info_qcom;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceImageInfoQCOM(cl_device_id device,
|
||||
size_t image_width,
|
||||
size_t image_height,
|
||||
const cl_image_format *image_format,
|
||||
cl_image_pitch_info_qcom param_name,
|
||||
size_t param_value_size,
|
||||
void *param_value,
|
||||
size_t *param_value_size_ret);
|
||||
|
||||
typedef struct _cl_mem_ext_host_ptr
|
||||
{
|
||||
/* Type of external memory allocation. */
|
||||
/* Legal values will be defined in layered extensions. */
|
||||
cl_uint allocation_type;
|
||||
|
||||
/* Host cache policy for this external memory allocation. */
|
||||
cl_uint host_cache_policy;
|
||||
|
||||
} cl_mem_ext_host_ptr;
|
||||
|
||||
/*********************************
|
||||
* cl_qcom_ion_host_ptr extension
|
||||
*********************************/
|
||||
|
||||
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
|
||||
|
||||
typedef struct _cl_mem_ion_host_ptr
|
||||
{
|
||||
/* Type of external memory allocation. */
|
||||
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
|
||||
cl_mem_ext_host_ptr ext_host_ptr;
|
||||
|
||||
/* ION file descriptor */
|
||||
int ion_filedesc;
|
||||
|
||||
/* Host pointer to the ION allocated memory */
|
||||
void* ion_hostptr;
|
||||
|
||||
} cl_mem_ion_host_ptr;
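
A sketch of the usual pattern: the struct above is passed as the host_ptr of clCreateBuffer together with CL_MEM_EXT_HOST_PTR_QCOM. The names ion_fd, ion_va, size and ctx are assumed to come from an ION allocation made elsewhere.

cl_mem_ion_host_ptr ion = { 0 };
ion.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
ion.ext_host_ptr.host_cache_policy = CL_MEM_HOST_UNCACHED_QCOM;
ion.ion_filedesc = ion_fd;     /* fd returned by the ION allocator */
ion.ion_hostptr  = ion_va;     /* mmap'ed address of that allocation */

cl_int err;
cl_mem buf = clCreateBuffer(ctx,
                            CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                            size, &ion, &err);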
|
||||
|
||||
#endif /* CL_VERSION_1_1 */
|
||||
|
||||
#if defined(CL_VERSION_1_2)
|
||||
|
||||
/******************************************
|
||||
* cl_img_yuv_image extension *
|
||||
******************************************/
|
||||
|
||||
/* Image formats used in clCreateImage */
|
||||
#define CL_NV21_IMG 0x40D0
|
||||
#define CL_YV12_IMG 0x40D1
|
||||
|
||||
/******************************************
|
||||
* cl_img_cached_allocations extension *
|
||||
******************************************/
|
||||
|
||||
/* Flag values used by clCreateBuffer */
|
||||
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
|
||||
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
|
||||
|
||||
/******************************************
|
||||
* cl_img_use_gralloc_ptr extension *
|
||||
******************************************/
|
||||
|
||||
/* Flag values used by clCreateBuffer */
|
||||
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
|
||||
|
||||
/* To be used by clGetEventInfo: */
|
||||
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
|
||||
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
|
||||
|
||||
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
|
||||
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
#endif /* CL_VERSION_1_2 */
|
||||
|
||||
#ifdef CL_VERSION_2_0
|
||||
/*********************************
|
||||
* cl_khr_subgroups extension
|
||||
*********************************/
|
||||
#define cl_khr_subgroups 1
|
||||
|
||||
/* cl_kernel_sub_group_info is declared in CL.h. */
|
||||
|
||||
/* cl_kernel_sub_group_info */
|
||||
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
|
||||
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
|
||||
cl_device_id /*in_device*/,
|
||||
cl_kernel_sub_group_info /* param_name */,
|
||||
size_t /*input_value_size*/,
|
||||
const void * /*input_value*/,
|
||||
size_t /*param_value_size*/,
|
||||
void* /*param_value*/,
|
||||
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
|
||||
cl_device_id /*in_device*/,
|
||||
cl_kernel_sub_group_info /* param_name */,
|
||||
size_t /*input_value_size*/,
|
||||
const void * /*input_value*/,
|
||||
size_t /*param_value_size*/,
|
||||
void* /*param_value*/,
|
||||
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
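
A sketch of querying the maximum sub-group size for a given work-group shape with the call declared above; kernel and dev are assumed, and the entry point may equally be fetched through the *_fn typedef.

size_t local_size[3] = { 64, 1, 1 };
size_t max_sub_group = 0;
clGetKernelSubGroupInfoKHR(kernel, dev,
                           CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
                           sizeof(local_size), local_size,
                           sizeof(max_sub_group), &max_sub_group, NULL);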
|
||||
#endif /* CL_VERSION_2_0 */
|
||||
|
||||
#ifdef CL_VERSION_2_1
|
||||
/*********************************
|
||||
* cl_khr_priority_hints extension
|
||||
*********************************/
|
||||
#define cl_khr_priority_hints 1
|
||||
|
||||
typedef cl_uint cl_queue_priority_khr;
|
||||
|
||||
/* cl_command_queue_properties */
|
||||
#define CL_QUEUE_PRIORITY_KHR 0x1096
|
||||
|
||||
/* cl_queue_priority_khr */
|
||||
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
|
||||
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
|
||||
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
|
||||
|
||||
#endif /* CL_VERSION_2_1 */
|
||||
|
||||
#ifdef CL_VERSION_2_1
|
||||
/*********************************
|
||||
* cl_khr_throttle_hints extension
|
||||
*********************************/
|
||||
#define cl_khr_throttle_hints 1
|
||||
|
||||
typedef cl_uint cl_queue_throttle_khr;
|
||||
|
||||
/* cl_command_queue_properties */
|
||||
#define CL_QUEUE_THROTTLE_KHR 0x1097
|
||||
|
||||
/* cl_queue_throttle_khr */
|
||||
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
|
||||
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
|
||||
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
|
||||
|
||||
#endif /* CL_VERSION_2_1 */
|
||||
|
||||
#ifdef CL_VERSION_2_2
|
||||
/*********************************
|
||||
* cl_khr_subgroup_named_barrier
|
||||
*********************************/
|
||||
#define cl_khr_subgroup_named_barrier 1
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
|
||||
|
||||
#endif /* CL_VERSION_2_2 */
|
||||
|
||||
/**********************************
|
||||
* cl_arm_import_memory extension *
|
||||
**********************************/
|
||||
|
||||
#ifdef CL_VERSION_1_0
|
||||
|
||||
typedef intptr_t cl_import_properties_arm;
|
||||
|
||||
/* Default and valid property names for cl_arm_import_memory */
|
||||
#define CL_IMPORT_TYPE_ARM 0x40B2
|
||||
|
||||
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
|
||||
|
||||
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
|
||||
|
||||
/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5
|
||||
|
||||
/* This extension adds a new function that allows for direct memory import into
|
||||
* OpenCL via the clImportMemoryARM function.
|
||||
*
|
||||
* Memory imported through this interface will be mapped into the device's page
|
||||
* tables directly, providing zero copy access. It will never fall back to copy
|
||||
* operations and aliased buffers.
|
||||
*
|
||||
* Types of memory supported for import are specified as additional extension
|
||||
* strings.
|
||||
*
|
||||
* This extension produces cl_mem allocations which are compatible with all other
|
||||
* users of cl_mem in the standard API.
|
||||
*
|
||||
* This extension maps pages with the same properties as the normal buffer creation
|
||||
* function clCreateBuffer.
|
||||
*/
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clImportMemoryARM( cl_context context,
|
||||
cl_mem_flags flags,
|
||||
const cl_import_properties_arm *properties,
|
||||
void *memory,
|
||||
size_t size,
|
||||
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
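
A sketch of importing an existing host allocation zero-copy, following the description above; ctx, host_mem and host_size are assumed.

const cl_import_properties_arm props[] = {
    CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM, 0 };
cl_int err;
cl_mem imported = clImportMemoryARM(ctx, CL_MEM_READ_WRITE,
                                    props, host_mem, host_size, &err);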
|
||||
|
||||
|
||||
#endif /* CL_VERSION_1_0 */
|
||||
|
||||
/******************************************
|
||||
* cl_arm_shared_virtual_memory extension *
|
||||
******************************************/
|
||||
|
||||
#ifdef CL_VERSION_1_2
|
||||
|
||||
/* Used by clGetDeviceInfo */
|
||||
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
|
||||
|
||||
/* Used by clGetMemObjectInfo */
|
||||
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
|
||||
|
||||
/* Used by clSetKernelExecInfoARM: */
|
||||
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
|
||||
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
|
||||
|
||||
/* To be used by clGetEventInfo: */
|
||||
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
|
||||
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
|
||||
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
|
||||
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
|
||||
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
|
||||
|
||||
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
|
||||
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
|
||||
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
|
||||
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
|
||||
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
|
||||
|
||||
/* Flag values used by clSVMAllocARM: */
|
||||
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
|
||||
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
|
||||
|
||||
typedef cl_bitfield cl_svm_mem_flags_arm;
|
||||
typedef cl_uint cl_kernel_exec_info_arm;
|
||||
typedef cl_bitfield cl_device_svm_capabilities_arm;
|
||||
|
||||
extern CL_API_ENTRY void * CL_API_CALL
|
||||
clSVMAllocARM(cl_context /* context */,
|
||||
cl_svm_mem_flags_arm /* flags */,
|
||||
size_t /* size */,
|
||||
cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY void CL_API_CALL
|
||||
clSVMFreeARM(cl_context /* context */,
|
||||
void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMFreeARM(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_svm_pointers */,
|
||||
void *[] /* svm_pointers[] */,
|
||||
void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
|
||||
cl_uint /* num_svm_pointers */,
|
||||
void *[] /* svm_pointers[] */,
|
||||
void * /* user_data */),
|
||||
void * /* user_data */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */,
|
||||
cl_bool /* blocking_copy */,
|
||||
void * /* dst_ptr */,
|
||||
const void * /* src_ptr */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */,
|
||||
void * /* svm_ptr */,
|
||||
const void * /* pattern */,
|
||||
size_t /* pattern_size */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMapARM(cl_command_queue /* command_queue */,
|
||||
cl_bool /* blocking_map */,
|
||||
cl_map_flags /* flags */,
|
||||
void * /* svm_ptr */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */,
|
||||
void * /* svm_ptr */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clSetKernelArgSVMPointerARM(cl_kernel /* kernel */,
|
||||
cl_uint /* arg_index */,
|
||||
const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clSetKernelExecInfoARM(cl_kernel /* kernel */,
|
||||
cl_kernel_exec_info_arm /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
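
A sketch of the coarse-grain SVM flow these ARM entry points mirror from core OpenCL 2.0; ctx, kernel and nbytes are assumed.

void *svm = clSVMAllocARM(ctx, CL_MEM_READ_WRITE, nbytes, 0 /* default alignment */);
if (svm) {
    clSetKernelArgSVMPointerARM(kernel, 0, svm);
    /* ... enqueue the kernel, then map/unmap or memcpy via the calls above ... */
    clSVMFreeARM(ctx, svm);
}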
|
||||
|
||||
#endif /* CL_VERSION_1_2 */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* __CL_EXT_H */
include/triton/external/CL/cl_ext_intel.h (429 lines, vendored)
@@ -1,429 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2017 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_ext_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
#ifndef __CL_EXT_INTEL_H
|
||||
#define __CL_EXT_INTEL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#include <OpenCL/cl_platform.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/***************************************
|
||||
* cl_intel_thread_local_exec extension *
|
||||
****************************************/
|
||||
|
||||
#define cl_intel_thread_local_exec 1
|
||||
|
||||
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
|
||||
|
||||
/***********************************************
|
||||
* cl_intel_device_partition_by_names extension *
|
||||
************************************************/
|
||||
|
||||
#define cl_intel_device_partition_by_names 1
|
||||
|
||||
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
|
||||
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
|
||||
|
||||
/************************************************
|
||||
* cl_intel_accelerator extension *
|
||||
* cl_intel_motion_estimation extension *
|
||||
* cl_intel_advanced_motion_estimation extension *
|
||||
*************************************************/
|
||||
|
||||
#define cl_intel_accelerator 1
|
||||
#define cl_intel_motion_estimation 1
|
||||
#define cl_intel_advanced_motion_estimation 1
|
||||
|
||||
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
|
||||
typedef cl_uint cl_accelerator_type_intel;
|
||||
typedef cl_uint cl_accelerator_info_intel;
|
||||
|
||||
typedef struct _cl_motion_estimation_desc_intel {
|
||||
cl_uint mb_block_type;
|
||||
cl_uint subpixel_mode;
|
||||
cl_uint sad_adjust_mode;
|
||||
cl_uint search_path_type;
|
||||
} cl_motion_estimation_desc_intel;
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_ACCELERATOR_INTEL -1094
|
||||
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
|
||||
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
|
||||
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
|
||||
|
||||
/* cl_accelerator_type_intel */
|
||||
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
|
||||
|
||||
/* cl_accelerator_info_intel */
|
||||
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
|
||||
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
|
||||
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
|
||||
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
|
||||
|
||||
/* cl_motion_detect_desc_intel flags */
|
||||
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
|
||||
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
|
||||
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
|
||||
|
||||
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
|
||||
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
|
||||
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
|
||||
|
||||
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
|
||||
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
|
||||
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
|
||||
|
||||
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
|
||||
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
|
||||
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
|
||||
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
|
||||
|
||||
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
|
||||
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
|
||||
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
|
||||
|
||||
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
|
||||
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
|
||||
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
|
||||
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
|
||||
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
|
||||
|
||||
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
|
||||
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
|
||||
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
|
||||
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
|
||||
|
||||
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
|
||||
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
|
||||
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
|
||||
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
|
||||
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
|
||||
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
|
||||
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
|
||||
|
||||
#define CL_ME_VERSION_LEGACY_INTEL 0x0
|
||||
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
|
||||
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
|
||||
|
||||
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
|
||||
clCreateAcceleratorINTEL(
|
||||
cl_context /* context */,
|
||||
cl_accelerator_type_intel /* accelerator_type */,
|
||||
size_t /* descriptor_size */,
|
||||
const void* /* descriptor */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_accelerator_type_intel /* accelerator_type */,
|
||||
size_t /* descriptor_size */,
|
||||
const void* /* descriptor */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
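
A sketch of creating a motion-estimation accelerator from the descriptor type above; ctx is an assumed cl_context.

cl_motion_estimation_desc_intel desc = {
    CL_ME_MB_TYPE_16x16_INTEL,
    CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
    CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
    CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL };

cl_int err;
cl_accelerator_intel acc = clCreateAcceleratorINTEL(
    ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
    sizeof(desc), &desc, &err);
/* ... use with the built-in motion estimation kernels ... */
clReleaseAcceleratorINTEL(acc);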
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetAcceleratorInfoINTEL(
|
||||
cl_accelerator_intel /* accelerator */,
|
||||
cl_accelerator_info_intel /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void* /* param_value */,
|
||||
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */,
|
||||
cl_accelerator_info_intel /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void* /* param_value */,
|
||||
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clRetainAcceleratorINTEL(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clReleaseAcceleratorINTEL(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
/******************************************
|
||||
* cl_intel_simultaneous_sharing extension *
|
||||
*******************************************/
|
||||
|
||||
#define cl_intel_simultaneous_sharing 1
|
||||
|
||||
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
|
||||
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
|
||||
|
||||
/***********************************
|
||||
* cl_intel_egl_image_yuv extension *
|
||||
************************************/
|
||||
|
||||
#define cl_intel_egl_image_yuv 1
|
||||
|
||||
#define CL_EGL_YUV_PLANE_INTEL 0x4107
|
||||
|
||||
/********************************
|
||||
* cl_intel_packed_yuv extension *
|
||||
*********************************/
|
||||
|
||||
#define cl_intel_packed_yuv 1
|
||||
|
||||
#define CL_YUYV_INTEL 0x4076
|
||||
#define CL_UYVY_INTEL 0x4077
|
||||
#define CL_YVYU_INTEL 0x4078
|
||||
#define CL_VYUY_INTEL 0x4079
|
||||
|
||||
/********************************************
|
||||
* cl_intel_required_subgroup_size extension *
|
||||
*********************************************/
|
||||
|
||||
#define cl_intel_required_subgroup_size 1
|
||||
|
||||
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
|
||||
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
|
||||
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
|
||||
|
||||
/****************************************
|
||||
* cl_intel_driver_diagnostics extension *
|
||||
*****************************************/
|
||||
|
||||
#define cl_intel_driver_diagnostics 1
|
||||
|
||||
typedef cl_uint cl_diagnostics_verbose_level;
|
||||
|
||||
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
|
||||
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
|
||||
|
||||
/********************************
|
||||
* cl_intel_planar_yuv extension *
|
||||
*********************************/
|
||||
|
||||
#define CL_NV12_INTEL 0x410E
|
||||
|
||||
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
|
||||
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
|
||||
|
||||
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
|
||||
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
|
||||
|
||||
/*******************************************************
|
||||
* cl_intel_device_side_avc_motion_estimation extension *
|
||||
********************************************************/
|
||||
|
||||
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
|
||||
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
|
||||
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
|
||||
|
||||
#define CL_AVC_ME_VERSION_0_INTEL 0x0 // No support.
#define CL_AVC_ME_VERSION_1_INTEL 0x1 // First supported version.
|
||||
|
||||
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
|
||||
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
|
||||
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
|
||||
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
|
||||
|
||||
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
|
||||
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
|
||||
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
|
||||
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
|
||||
|
||||
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
|
||||
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
|
||||
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
|
||||
|
||||
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
|
||||
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
|
||||
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
|
||||
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
|
||||
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
|
||||
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa

#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2

#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3

#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3

#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30

#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8

#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000

#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )

#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80

#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2

#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3

#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4

#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3

#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3

#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2

#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1

#ifdef __cplusplus
}
#endif

#endif /* __CL_EXT_INTEL_H */
include/triton/external/CL/cl_gl.h (vendored, 167 lines removed)
@@ -1,167 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_GL_H
|
||||
#define __OPENCL_CL_GL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef cl_uint cl_gl_object_type;
|
||||
typedef cl_uint cl_gl_texture_info;
|
||||
typedef cl_uint cl_gl_platform_info;
|
||||
typedef struct __GLsync *cl_GLsync;
|
||||
|
||||
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
|
||||
#define CL_GL_OBJECT_BUFFER 0x2000
|
||||
#define CL_GL_OBJECT_TEXTURE2D 0x2001
|
||||
#define CL_GL_OBJECT_TEXTURE3D 0x2002
|
||||
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
|
||||
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
|
||||
#define CL_GL_OBJECT_TEXTURE1D 0x200F
|
||||
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
|
||||
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
|
||||
|
||||
/* cl_gl_texture_info */
|
||||
#define CL_GL_TEXTURE_TARGET 0x2004
|
||||
#define CL_GL_MIPMAP_LEVEL 0x2005
|
||||
#define CL_GL_NUM_SAMPLES 0x2012
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLBuffer(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLuint /* bufobj */,
|
||||
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLRenderbuffer(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLuint /* renderbuffer */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLObjectInfo(cl_mem /* memobj */,
|
||||
cl_gl_object_type * /* gl_object_type */,
|
||||
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLTextureInfo(cl_mem /* memobj */,
|
||||
cl_gl_texture_info /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void * /* param_value */,
|
||||
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
|
||||
/* Deprecated OpenCL 1.1 APIs */
|
||||
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture2D(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
||||
|
||||
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture3D(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
||||
|
||||
/* cl_khr_gl_sharing extension */
|
||||
|
||||
#define cl_khr_gl_sharing 1
|
||||
|
||||
typedef cl_uint cl_gl_context_info;
|
||||
|
||||
/* Additional Error Codes */
|
||||
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
|
||||
|
||||
/* cl_gl_context_info */
|
||||
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
|
||||
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
|
||||
|
||||
/* Additional cl_context_properties */
|
||||
#define CL_GL_CONTEXT_KHR 0x2008
|
||||
#define CL_EGL_DISPLAY_KHR 0x2009
|
||||
#define CL_GLX_DISPLAY_KHR 0x200A
|
||||
#define CL_WGL_HDC_KHR 0x200B
|
||||
#define CL_CGL_SHAREGROUP_KHR 0x200C
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
|
||||
cl_gl_context_info /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void * /* param_value */,
|
||||
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
|
||||
const cl_context_properties * properties,
|
||||
cl_gl_context_info param_name,
|
||||
size_t param_value_size,
|
||||
void * param_value,
|
||||
size_t * param_value_size_ret);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_GL_H */
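
/* For orientation only: the vendored cl_gl.h above declared the OpenCL/OpenGL sharing
 * entry points that are being dropped from the tree. A minimal, hypothetical interop
 * sequence using exactly the functions declared above would look like this; the
 * `context`, `queue` and `gl_buffer` handles are assumed to already exist and are not
 * part of this diff.
 *
 *   cl_int err;
 *   cl_mem mem = clCreateFromGLBuffer(context, CL_MEM_READ_WRITE, gl_buffer, &err);
 *
 *   // A GL object must be acquired before OpenCL may touch it, and released afterwards.
 *   err = clEnqueueAcquireGLObjects(queue, 1, &mem, 0, NULL, NULL);
 *   // ... enqueue kernels that read or write `mem` ...
 *   err = clEnqueueReleaseGLObjects(queue, 1, &mem, 0, NULL, NULL);
 */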
|
include/triton/external/CL/cl_gl_ext.h (vendored, 74 lines removed)
@@ -1,74 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
|
||||
/* OpenGL dependencies. */
|
||||
|
||||
#ifndef __OPENCL_CL_GL_EXT_H
|
||||
#define __OPENCL_CL_GL_EXT_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl_gl.h>
|
||||
#else
|
||||
#include "cl_gl.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For each extension, follow this template
|
||||
* cl_VEN_extname extension */
|
||||
/* #define cl_VEN_extname 1
|
||||
* ... define new types, if any
|
||||
* ... define new tokens, if any
|
||||
* ... define new APIs, if any
|
||||
*
|
||||
* If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
|
||||
* This allows us to avoid having to decide whether to include GL headers or GLES here.
|
||||
*/
|
||||
|
||||
/*
|
||||
* cl_khr_gl_event extension
|
||||
* See section 9.9 in the OpenCL 1.1 spec for more information
|
||||
*/
|
||||
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
|
||||
|
||||
extern CL_API_ENTRY cl_event CL_API_CALL
|
||||
clCreateEventFromGLsyncKHR(cl_context /* context */,
|
||||
cl_GLsync /* cl_GLsync */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_GL_EXT_H */
|
include/triton/external/CL/cl_platform.h (vendored, 1458 lines; diff suppressed because it is too large)
include/triton/external/CL/cl_va_api_media_sharing_intel.h (vendored, 172 lines removed)
@@ -1,172 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2016 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_va_api_media_sharing_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
|
||||
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
|
||||
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
#include <va/va.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************
|
||||
* cl_intel_va_api_media_sharing extension *
|
||||
*******************************************/
|
||||
|
||||
#define cl_intel_va_api_media_sharing 1
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
|
||||
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
|
||||
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
|
||||
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
|
||||
|
||||
/* cl_va_api_device_source_intel */
|
||||
#define CL_VA_API_DISPLAY_INTEL 0x4094
|
||||
|
||||
/* cl_va_api_device_set_intel */
|
||||
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
|
||||
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
|
||||
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
|
||||
|
||||
typedef cl_uint cl_va_api_device_source_intel;
|
||||
typedef cl_uint cl_va_api_device_set_intel;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
|
||||
cl_platform_id /* platform */,
|
||||
cl_va_api_device_source_intel /* media_adapter_type */,
|
||||
void* /* media_adapter */,
|
||||
cl_va_api_device_set_intel /* media_adapter_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
|
||||
cl_platform_id /* platform */,
|
||||
cl_va_api_device_source_intel /* media_adapter_type */,
|
||||
void* /* media_adapter */,
|
||||
cl_va_api_device_set_intel /* media_adapter_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromVA_APIMediaSurfaceINTEL(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
VASurfaceID* /* surface */,
|
||||
cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
VASurfaceID* /* surface */,
|
||||
cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
|
||||
|
include/triton/external/CL/opencl.h (vendored, 59 lines removed)
@@ -1,59 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_H
|
||||
#define __OPENCL_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
#include <OpenCL/cl.h>
|
||||
#include <OpenCL/cl_gl.h>
|
||||
#include <OpenCL/cl_gl_ext.h>
|
||||
#include <OpenCL/cl_ext.h>
|
||||
|
||||
#else
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_gl.h"
|
||||
#include "cl_gl_ext.h"
|
||||
#include "cl_ext.h"
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_H */
|
||||
|
include/triton/external/hip.h (vendored, new file, 288 lines added)
@@ -0,0 +1,288 @@
/*
 * @brief hipError_t
 * @enum
 * @ingroup Enumerations
 */
// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
// the NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in the NVCC path.

// Ignoring error-code return values from hip APIs is discouraged. On C++17,
// we can make that yield a warning.

#include <cstddef>
|
||||
|
||||
typedef enum hipError_t {
|
||||
hipSuccess = 0, ///< Successful completion.
|
||||
hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL
|
||||
///< or not in an acceptable range.
|
||||
hipErrorOutOfMemory = 2,
|
||||
// Deprecated
|
||||
hipErrorMemoryAllocation = 2, ///< Memory allocation error.
|
||||
hipErrorNotInitialized = 3,
|
||||
// Deprecated
|
||||
hipErrorInitializationError = 3,
|
||||
hipErrorDeinitialized = 4,
|
||||
hipErrorProfilerDisabled = 5,
|
||||
hipErrorProfilerNotInitialized = 6,
|
||||
hipErrorProfilerAlreadyStarted = 7,
|
||||
hipErrorProfilerAlreadyStopped = 8,
|
||||
hipErrorInvalidConfiguration = 9,
|
||||
hipErrorInvalidPitchValue = 12,
|
||||
hipErrorInvalidSymbol = 13,
|
||||
hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer
|
||||
hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction
|
||||
hipErrorInsufficientDriver = 35,
|
||||
hipErrorMissingConfiguration = 52,
|
||||
hipErrorPriorLaunchFailure = 53,
|
||||
hipErrorInvalidDeviceFunction = 98,
|
||||
hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices
|
||||
hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices.
|
||||
hipErrorInvalidImage = 200,
|
||||
hipErrorInvalidContext = 201, ///< Produced when input context is invalid.
|
||||
hipErrorContextAlreadyCurrent = 202,
|
||||
hipErrorMapFailed = 205,
|
||||
// Deprecated
|
||||
hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr.
|
||||
hipErrorUnmapFailed = 206,
|
||||
hipErrorArrayIsMapped = 207,
|
||||
hipErrorAlreadyMapped = 208,
|
||||
hipErrorNoBinaryForGpu = 209,
|
||||
hipErrorAlreadyAcquired = 210,
|
||||
hipErrorNotMapped = 211,
|
||||
hipErrorNotMappedAsArray = 212,
|
||||
hipErrorNotMappedAsPointer = 213,
|
||||
hipErrorECCNotCorrectable = 214,
|
||||
hipErrorUnsupportedLimit = 215,
|
||||
hipErrorContextAlreadyInUse = 216,
|
||||
hipErrorPeerAccessUnsupported = 217,
|
||||
hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
|
||||
hipErrorInvalidGraphicsContext = 219,
|
||||
hipErrorInvalidSource = 300,
|
||||
hipErrorFileNotFound = 301,
|
||||
hipErrorSharedObjectSymbolNotFound = 302,
|
||||
hipErrorSharedObjectInitFailed = 303,
|
||||
hipErrorOperatingSystem = 304,
|
||||
hipErrorInvalidHandle = 400,
|
||||
// Deprecated
|
||||
hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid.
|
||||
hipErrorNotFound = 500,
|
||||
hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not
|
||||
///< ready. This is not actually an error, but is used to distinguish
|
||||
///< from hipSuccess (which indicates completion). APIs that return
|
||||
///< this error include hipEventQuery and hipStreamQuery.
|
||||
hipErrorIllegalAddress = 700,
|
||||
hipErrorLaunchOutOfResources = 701, ///< Out of resources error.
|
||||
hipErrorLaunchTimeOut = 702,
|
||||
hipErrorPeerAccessAlreadyEnabled =
|
||||
704, ///< Peer access was already enabled from the current device.
|
||||
hipErrorPeerAccessNotEnabled =
|
||||
705, ///< Peer access was never enabled from the current device.
|
||||
hipErrorSetOnActiveProcess = 708,
|
||||
hipErrorAssert = 710, ///< Produced when the kernel calls assert.
|
||||
hipErrorHostMemoryAlreadyRegistered =
|
||||
712, ///< Produced when trying to lock a page-locked memory.
|
||||
hipErrorHostMemoryNotRegistered =
|
||||
713, ///< Produced when trying to unlock a non-page-locked memory.
|
||||
hipErrorLaunchFailure =
|
||||
719, ///< An exception occurred on the device while executing a kernel.
|
||||
hipErrorCooperativeLaunchTooLarge =
|
||||
720, ///< This error indicates that the number of blocks launched per grid for a kernel
|
||||
///< that was launched via cooperative launch APIs exceeds the maximum number of
|
||||
///< allowed blocks for the current device
|
||||
hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented
|
||||
hipErrorUnknown = 999, //< Unknown error.
|
||||
// HSA Runtime Error Codes start here.
|
||||
hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen
|
||||
///< in production systems.
|
||||
hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically
|
||||
///< not seen in production systems.
|
||||
hipErrorTbd ///< Marker that more error codes are needed.
|
||||
} hipError_t;
|
||||
|
||||
|
||||
typedef struct ihipCtx_t* hipCtx_t;
|
||||
|
||||
// Note many APIs also use integer deviceIds as an alternative to the device pointer:
|
||||
typedef int hipDevice_t;
|
||||
|
||||
typedef enum hipDeviceP2PAttr {
|
||||
hipDevP2PAttrPerformanceRank = 0,
|
||||
hipDevP2PAttrAccessSupported,
|
||||
hipDevP2PAttrNativeAtomicSupported,
|
||||
hipDevP2PAttrHipArrayAccessSupported
|
||||
} hipDeviceP2PAttr;
|
||||
|
||||
typedef struct ihipStream_t* hipStream_t;
|
||||
|
||||
#define hipIpcMemLazyEnablePeerAccess 0
|
||||
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
|
||||
typedef struct hipIpcMemHandle_st {
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
} hipIpcMemHandle_t;
|
||||
|
||||
typedef struct hipIpcEventHandle_st {
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
} hipIpcEventHandle_t;
|
||||
|
||||
typedef struct ihipModule_t* hipModule_t;
|
||||
|
||||
typedef struct ihipModuleSymbol_t* hipFunction_t;
|
||||
|
||||
typedef struct hipFuncAttributes {
|
||||
int binaryVersion;
|
||||
int cacheModeCA;
|
||||
size_t constSizeBytes;
|
||||
size_t localSizeBytes;
|
||||
int maxDynamicSharedSizeBytes;
|
||||
int maxThreadsPerBlock;
|
||||
int numRegs;
|
||||
int preferredShmemCarveout;
|
||||
int ptxVersion;
|
||||
size_t sharedSizeBytes;
|
||||
} hipFuncAttributes;
|
||||
|
||||
typedef struct ihipEvent_t* hipEvent_t;
|
||||
|
||||
/*
|
||||
* @brief hipDeviceAttribute_t
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
typedef enum hipDeviceAttribute_t {
|
||||
hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
|
||||
hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
|
||||
hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
|
||||
hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in
|
||||
///< bytes.
|
||||
hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
|
||||
hipDeviceAttributeWarpSize, ///< Warp size in threads.
|
||||
hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a
|
||||
///< thread block. This number is shared by all thread
|
||||
///< blocks simultaneously resident on a
|
||||
///< multiprocessor.
|
||||
hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
|
||||
hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz.
|
||||
hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits.
|
||||
hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
|
||||
hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
|
||||
hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
|
||||
///< cache.
|
||||
hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per
|
||||
///< multiprocessor.
|
||||
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
|
||||
hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
|
||||
hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels
|
||||
///< concurrently.
|
||||
hipDeviceAttributePciBusId, ///< PCI Bus ID.
|
||||
hipDeviceAttributePciDeviceId, ///< PCI Device ID.
|
||||
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per
|
||||
///< Multiprocessor.
|
||||
hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
|
||||
hipDeviceAttributeIntegrated, ///< iGPU
|
||||
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
|
||||
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
|
||||
hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
|
||||
hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
|
||||
hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimensions height of 3D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimensions depth of 3D images in image elements
|
||||
|
||||
hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
|
||||
hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
|
||||
|
||||
hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
|
||||
hipDeviceAttributeTextureAlignment, ///<Alignment requirement for textures
|
||||
hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
|
||||
hipDeviceAttributeKernelExecTimeout, ///<Run time limit for kernels executed on the device
|
||||
hipDeviceAttributeCanMapHostMemory, ///<Device can map host memory into device address space
|
||||
hipDeviceAttributeEccEnabled, ///<Device has ECC support enabled
|
||||
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched functions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched grid dimensions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched block dimensions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched shared memories
|
||||
hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device
|
||||
hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
|
||||
/// the device without migration
|
||||
hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory
|
||||
/// concurrently with the CPU
|
||||
hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory
|
||||
/// without calling hipHostRegister on it
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
|
||||
/// the host's page tables
|
||||
hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
|
||||
///< hipStreamWaitValue64() , '0' otherwise.
|
||||
|
||||
} hipDeviceAttribute_t;
|
||||
|
||||
typedef void* hipDeviceptr_t;
|
||||
|
||||
/*
|
||||
* @brief hipJitOption
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
typedef enum hipJitOption {
|
||||
hipJitOptionMaxRegisters = 0,
|
||||
hipJitOptionThreadsPerBlock,
|
||||
hipJitOptionWallTime,
|
||||
hipJitOptionInfoLogBuffer,
|
||||
hipJitOptionInfoLogBufferSizeBytes,
|
||||
hipJitOptionErrorLogBuffer,
|
||||
hipJitOptionErrorLogBufferSizeBytes,
|
||||
hipJitOptionOptimizationLevel,
|
||||
hipJitOptionTargetFromContext,
|
||||
hipJitOptionTarget,
|
||||
hipJitOptionFallbackStrategy,
|
||||
hipJitOptionGenerateDebugInfo,
|
||||
hipJitOptionLogVerbose,
|
||||
hipJitOptionGenerateLineInfo,
|
||||
hipJitOptionCacheMode,
|
||||
hipJitOptionSm3xOpt,
|
||||
hipJitOptionFastCompile,
|
||||
hipJitOptionNumOptions
|
||||
} hipJitOption;
|
||||
|
||||
/**
|
||||
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
|
||||
*/
|
||||
typedef enum hipFuncAttribute {
|
||||
hipFuncAttributeMaxDynamicSharedMemorySize = 8,
|
||||
hipFuncAttributePreferredSharedMemoryCarveout = 9,
|
||||
hipFuncAttributeMax
|
||||
} hipFuncAttribute;
|
||||
|
||||
/**
|
||||
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
|
||||
*/
|
||||
typedef enum hipFuncCache_t {
|
||||
hipFuncCachePreferNone, ///< no preference for shared memory or L1 (default)
|
||||
hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache
|
||||
hipFuncCachePreferL1, ///< prefer larger L1 cache and smaller shared memory
|
||||
hipFuncCachePreferEqual, ///< prefer equal size L1 cache and shared memory
|
||||
} hipFuncCache_t;
|
||||
|
||||
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
|
||||
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
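
For orientation, the sketch below shows how a driver-style runtime typically consumes these declarations, which is the kind of usage this PR targets (running an empty kernel on an AMD device). It is illustrative only: hipDeviceGetAttribute, hipModuleLoad, hipModuleGetFunction, hipModuleLaunchKernel and hipDeviceSynchronize come from the full HIP runtime rather than from this trimmed header, and the code-object name "my_kernel.hsaco" and kernel name "empty" are made up.

#include <cstdio>
#include <hip/hip_runtime.h>  // assumption: the real HIP runtime headers, not the excerpt above

// Minimal error check built on hipError_t / hipSuccess from the enum above.
#define HIP_CHECK(call)                                                              \
  do {                                                                               \
    hipError_t err_ = (call);                                                        \
    if (err_ != hipSuccess) {                                                        \
      std::fprintf(stderr, "HIP error %d at %s:%d\n", (int)err_, __FILE__, __LINE__); \
      return 1;                                                                      \
    }                                                                                \
  } while (0)

int main() {
  // Query a device attribute using the hipDeviceAttribute_t enum declared above.
  int shared_per_block = 0;
  HIP_CHECK(hipDeviceGetAttribute(&shared_per_block,
                                  hipDeviceAttributeMaxSharedMemoryPerBlock, /*deviceId=*/0));

  // Load a pre-compiled code object and launch a kernel through the module API,
  // packing arguments with the HIP_LAUNCH_PARAM_* markers defined above.
  hipModule_t mod;
  hipFunction_t fn;
  HIP_CHECK(hipModuleLoad(&mod, "my_kernel.hsaco"));   // hypothetical file name
  HIP_CHECK(hipModuleGetFunction(&fn, mod, "empty"));  // hypothetical kernel name

  char args[1];                      // an empty kernel takes no real arguments
  size_t args_size = sizeof(args);
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, args,
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                    HIP_LAUNCH_PARAM_END};
  HIP_CHECK(hipModuleLaunchKernel(fn, 1, 1, 1, 64, 1, 1, /*sharedMemBytes=*/0,
                                  /*stream=*/nullptr, /*kernelParams=*/nullptr,
                                  /*extra=*/config));
  HIP_CHECK(hipDeviceSynchronize());
  return 0;
}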
|
@@ -13,45 +13,40 @@
#include "triton/codegen/transform/peephole.h"
#include "triton/codegen/transform/pipeline.h"
#include "triton/codegen/transform/prefetch.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/module.h"
#include "triton/ir/function.h"
#include "triton/ir/module.h"
#include "triton/ir/print.h"
#include "llvm/IR/Module.h"

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
namespace triton {
namespace codegen {

// TODO:
// There should be a proper pass manager there!
void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache,
                            driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) {
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
                                                     int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
  // generate llvm code
  llvm::LLVMContext ctx;
  std::string name = ir.get_function_list()[0]->get_name();
  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
  // optimizations
  std::unique_ptr<codegen::target> target = dev->make_target();
  bool cts_use_async = target->as_nvidia()->sm() >= 80;
  bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
  // create passes
  codegen::analysis::align align;
  codegen::analysis::axes axes;
  codegen::transform::cts cts(cts_use_async);
  codegen::transform::pipeline pipeline(cts_use_async, num_stages);
  codegen::transform::disassociate disassociate;
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get());
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
  codegen::analysis::liveness liveness(&layouts);
  codegen::analysis::swizzle swizzle(&layouts, target.get());
  codegen::analysis::swizzle swizzle(&layouts, target);
  codegen::analysis::allocation allocation(&liveness);
  codegen::transform::dce dce;
  codegen::transform::peephole peephole(target.get(), &layouts);
  // codegen::transform::reassociate reassociate;
  codegen::transform::peephole peephole(target, &layouts);
  codegen::transform::coalesce coalesce(&align, &layouts);
  codegen::transform::prefetch prefetch_s(target.get());
  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get());
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache);
  codegen::transform::prefetch prefetch_s(target);
  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
  // run passes
  dce.run(ir);
  peephole.run(ir);
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  layouts.run(ir);
  coalesce.run(ir);
  dce.run(ir);
  // exit(1);

  align.run(ir);
  dce.run(ir);
  if (target->is_gpu())
    cts.run(ir);
  dce.run(ir);
  align.run(ir);
  // ir::print(ir, std::cout);
  axes.run(ir);
  layouts.run(ir);
  peephole.run(ir);
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  allocation.run(ir);
  prefetch_s.run(ir);
  barriers.run(ir);
  // ir.print(std::cout);
  isel.visit(ir, *llvm);
  mod = driver::module::create(dev, std::move(llvm));
  ker = driver::kernel::create(&*mod, name.c_str());
  shared_mem = allocation.allocated_size();
  shared_static = allocation.allocated_size();
  return llvm;
}

} // namespace codegen
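
With this change, add_passes_to_emit_bin no longer talks to the removed driver layer: it only lowers Triton IR to an llvm::Module and reports the statically allocated shared memory. A hypothetical caller-side sketch follows; `ir` (a populated triton::ir::module) and `target` (a codegen::target* obtained from the surrounding runtime, NVIDIA or AMD) are assumptions, not part of this diff.

// Sketch only, not the actual call site in this PR.
llvm::LLVMContext ctx;
int shared_static = 0;
std::unique_ptr<llvm::Module> llvm_mod = triton::codegen::add_passes_to_emit_bin(
    ir, ctx, target, /*cc=*/80, /*num_warps=*/4, /*num_stages=*/3,
    /*force_nc_cache=*/false, shared_static);
// The caller is now responsible for turning `llvm_mod` into PTX or AMDGPU code and for
// loading and launching the resulting binary; driver::module / driver::kernel creation
// no longer happens inside this function.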
@@ -1,231 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/backend.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Platforms ------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::platforms::init() {
|
||||
if(!cache_.empty())
|
||||
return;
|
||||
//if CUDA is here
|
||||
if(dispatch::cuinit()){
|
||||
cache_.push_back(new cu_platform());
|
||||
}
|
||||
//if host should be added
|
||||
bool host_visible = true;
|
||||
if(host_visible){
|
||||
cache_.push_back(new host_platform());
|
||||
}
|
||||
|
||||
// //if OpenCL is here
|
||||
// if(dispatch::clinit()){
|
||||
// cl_uint num_platforms;
|
||||
// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
|
||||
// std::vector<cl_platform_id> ids(num_platforms);
|
||||
// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
|
||||
// for(cl_platform_id id: ids)
|
||||
// cache_.push_back(new cl_platform(id));
|
||||
// }
|
||||
|
||||
if(cache_.empty())
|
||||
throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
|
||||
}
|
||||
|
||||
void backend::platforms::get(std::vector<platform *> &results) {
|
||||
std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
|
||||
}
|
||||
|
||||
std::vector<driver::platform*> backend::platforms::cache_;
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Devices --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::devices::init(std::vector<platform*> const & platforms) {
|
||||
if(!cache_.empty())
|
||||
return;
|
||||
for(driver::platform* pf: platforms)
|
||||
pf->devices(cache_);
|
||||
if(cache_.empty())
|
||||
throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
|
||||
}
|
||||
|
||||
void backend::devices::get(std::vector<device*> &devs) {
|
||||
std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
|
||||
}
|
||||
|
||||
std::vector<driver::device*> backend::devices::cache_;
|
||||
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//---------- Modules ----------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::modules::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x.second;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
std::map<std::tuple<driver::stream*, std::string>, driver::module*> backend::modules::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Kernels --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::kernels::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x.second;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
|
||||
std::tuple<driver::module*, std::string> key(mod, name);
|
||||
if(cache_.find(key)==cache_.end()){
|
||||
return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
|
||||
}
|
||||
return cache_.at(key);
|
||||
}
|
||||
|
||||
std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ Queues --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::streams::init(std::list<driver::context*> const & contexts){
|
||||
for(driver::context* ctx : contexts)
|
||||
if(cache_.find(ctx)==cache_.end())
|
||||
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
|
||||
}
|
||||
|
||||
void backend::streams::release(){
|
||||
for(auto & x: cache_)
|
||||
for(auto & y: x.second)
|
||||
delete y;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::stream* backend::streams::get_default()
|
||||
{ return get(contexts::get_default(), 0); }
|
||||
|
||||
driver::stream* backend::streams::get(driver::context* context, unsigned int id){
|
||||
init(std::list<driver::context*>(1,context));
|
||||
for(auto & x : cache_)
|
||||
if(x.first==context)
|
||||
return x.second[id];
|
||||
throw;
|
||||
}
|
||||
|
||||
void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
|
||||
init(std::list<driver::context*>(1,context));
|
||||
queues = cache_.at(context);
|
||||
}
|
||||
|
||||
std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ Contexts ------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::contexts::init(std::vector<driver::device*> const & devices){
|
||||
for(driver::device* dvc: devices)
|
||||
cache_.push_back(driver::context::create(dvc));
|
||||
}
|
||||
|
||||
void backend::contexts::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::context* backend::contexts::get_default(){
|
||||
backend::init();
|
||||
auto it = cache_.begin();
|
||||
std::advance(it, default_device);
|
||||
return *it;
|
||||
}
|
||||
|
||||
void backend::contexts::get(std::list<driver::context*> & contexts){
|
||||
backend::init();
|
||||
contexts = cache_;
|
||||
}
|
||||
|
||||
std::list<driver::context*> backend::contexts::cache_;
|
||||
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ General -------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::synchronize(driver::context* context){
|
||||
for(driver::stream * queue: streams::cache_.at(context))
|
||||
queue->synchronize();
|
||||
}
|
||||
|
||||
|
||||
void backend::release(){
|
||||
backend::kernels::release();
|
||||
// backend::programs::release();
|
||||
backend::streams::release();
|
||||
backend::contexts::release();
|
||||
}
|
||||
|
||||
|
||||
void backend::init(){
|
||||
if(!contexts::cache_.empty())
|
||||
return;
|
||||
// initialize platforms
|
||||
backend::platforms::init();
|
||||
// initialize devices
|
||||
backend::devices::init(platforms::cache_);
|
||||
// initialize contexts
|
||||
backend::contexts::init(devices::cache_);
|
||||
// initialize streams
|
||||
streams::init(contexts::cache_);
|
||||
}
|
||||
|
||||
unsigned int backend::default_device = 0;
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,90 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/dispatch.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
//
|
||||
|
||||
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||
: polymorphic_resource(cu, take_ownership), size_(size) { }
|
||||
|
||||
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
|
||||
: polymorphic_resource(hst, take_ownership), size_(size) { }
|
||||
|
||||
size_t buffer::size() {
|
||||
return size_;
|
||||
}
|
||||
|
||||
uintptr_t buffer::addr_as_uintptr_t() {
|
||||
switch(backend_){
|
||||
case CUDA: return *cu_;
|
||||
case Host: return (uintptr_t)hst_->data;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
buffer* buffer::create(driver::context* ctx, size_t size) {
|
||||
switch(ctx->backend()){
|
||||
case CUDA: return new cu_buffer(size);
|
||||
case Host: return new host_buffer(size);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
host_buffer::host_buffer(size_t size)
|
||||
: buffer(size, host_buffer_t(), true){
|
||||
hst_->data = new char[size];
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
|
||||
cu_buffer::cu_buffer(size_t size)
|
||||
: buffer(size, CUdeviceptr(), true) {
|
||||
dispatch::cuMemAlloc(&*cu_, size);
|
||||
}
|
||||
|
||||
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||
: buffer(size, cu, take_ownership){
|
||||
}
|
||||
|
||||
void cu_buffer::set_zero(driver::stream* queue, size_t size){
|
||||
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,118 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// BASE //
|
||||
/* ------------------------ */
|
||||
|
||||
context::context(driver::device *dev, CUcontext cu, bool take_ownership):
|
||||
polymorphic_resource(cu, take_ownership),
|
||||
dev_(dev), cache_path_(get_cache_path()) {
|
||||
}
|
||||
|
||||
context::context(driver::device *dev, host_context_t hst, bool take_ownership):
|
||||
polymorphic_resource(hst, take_ownership),
|
||||
dev_(dev), cache_path_(get_cache_path()){
|
||||
}
|
||||
|
||||
context* context::create(driver::device *dev){
|
||||
switch(dev->backend()){
|
||||
case CUDA: return new cu_context(dev);
|
||||
case Host: return new host_context(dev);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
driver::device* context::device() const {
|
||||
return dev_;
|
||||
}
|
||||
|
||||
std::string context::get_cache_path(){
|
||||
//user-specified cache path
|
||||
std::string result = tools::getenv("TRITON_CACHE_PATH");
|
||||
if(!result.empty()){
|
||||
if(tools::mkpath(result)==0)
|
||||
return result;
|
||||
}
|
||||
//create in home
|
||||
result = tools::getenv("HOME");
|
||||
if(!result.empty())
|
||||
{
|
||||
result = result + "/.triton/cache/";
|
||||
if(tools::mkpath(result)==0)
|
||||
return result;
|
||||
}
|
||||
//couldn't find a directory
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string const & context::cache_path() const{
|
||||
return cache_path_;
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
|
||||
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
// import CUdevice
|
||||
CUdevice cu_context::get_device_of(CUcontext context){
|
||||
dispatch::cuCtxPushCurrent_v2(context);
|
||||
CUdevice res;
|
||||
dispatch::cuCtxGetDevice(&res);
|
||||
dispatch::cuCtxPopCurrent_v2(NULL);
|
||||
return res;
|
||||
}
|
||||
|
||||
// wrapper for cuda context
|
||||
cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
|
||||
context, take_ownership) {
|
||||
}
|
||||
|
||||
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
|
||||
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
|
||||
// dispatch::cuCtxPopCurrent_v2(NULL);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,192 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/codegen/target.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
std::unique_ptr<codegen::target> host_device::make_target() const {
|
||||
return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
// information query
|
||||
template<CUdevice_attribute attr>
|
||||
int cu_device::cuGetInfo() const{
|
||||
int res;
|
||||
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
|
||||
return res;
|
||||
}
|
||||
|
||||
// convert to nvml
|
||||
nvmlDevice_t cu_device::nvml_device() const{
|
||||
std::map<std::string, nvmlDevice_t> map;
|
||||
std::string key = pci_bus_id();
|
||||
if(map.find(key)==map.end()){
|
||||
nvmlDevice_t device;
|
||||
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
|
||||
return map.insert(std::make_pair(key, device)).first->second;
|
||||
}
|
||||
return map.at(key);
|
||||
}
|
||||
|
||||
// number of address bits
|
||||
size_t cu_device::address_bits() const{
|
||||
return sizeof(size_t)*8;
|
||||
}
|
||||
|
||||
// name
|
||||
std::string cu_device::name() const {
|
||||
char tmp[128];
|
||||
dispatch::cuDeviceGetName(tmp, 128, *cu_);
|
||||
return std::string(tmp);
|
||||
}
|
||||
|
||||
// PCI bus ID
|
||||
std::string cu_device::pci_bus_id() const{
|
||||
char tmp[128];
|
||||
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
|
||||
return std::string(tmp);
|
||||
}
|
||||
|
||||
// force the device to be interpreted as a particular cc
|
||||
void cu_device::interpret_as(int cc){
|
||||
interpreted_as_ = std::make_shared<int>(cc);
|
||||
}
|
||||
|
||||
// compute capability
|
||||
int cu_device::compute_capability() const {
|
||||
if(interpreted_as_)
|
||||
return *interpreted_as_;
|
||||
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
|
||||
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
|
||||
return major*10 + minor;
|
||||
}
|
||||
|
||||
// maximum number of threads per block
|
||||
size_t cu_device::max_threads_per_block() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
|
||||
}
|
||||
|
||||
// maximum amount of shared memory per block
|
||||
size_t cu_device::max_shared_memory() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
|
||||
}
|
||||
|
||||
// warp size
|
||||
size_t cu_device::warp_size() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
|
||||
}
|
||||
|
||||
|
||||
// maximum block dimensions
|
||||
std::vector<size_t> cu_device::max_block_dim() const {
|
||||
std::vector<size_t> result(3);
|
||||
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
|
||||
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
|
||||
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
|
||||
return result;
|
||||
}
|
||||
|
||||
// current SM clock
|
||||
size_t cu_device::current_sm_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// max SM clock
|
||||
size_t cu_device::max_sm_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// current memory clock
|
||||
size_t cu_device::current_mem_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// max memory clock
|
||||
size_t cu_device::max_mem_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// set application clocks to their maximum values
|
||||
void cu_device::set_max_clock() {
|
||||
dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
|
||||
}
|
||||
|
||||
void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
|
||||
CUcontext context;
|
||||
dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
|
||||
try {
|
||||
dispatch::cuCtxEnablePeerAccess(context, 0);
|
||||
} catch (exception::cuda::peer_access_already_enabled) {}
|
||||
}
|
||||
|
||||
// print infos
|
||||
std::string cu_device::infos() const{
|
||||
std::ostringstream oss;
|
||||
std::vector<size_t> max_wi_sizes = max_block_dim();
|
||||
oss << "Platform: CUDA" << std::endl;
|
||||
oss << "Name: " << name() << std::endl;
|
||||
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
|
||||
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
|
||||
oss << "Local memory size: " << max_shared_memory() << std::endl;
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// target
|
||||
std::unique_ptr<codegen::target> cu_device::make_target() const {
|
||||
return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -21,7 +21,6 @@
|
||||
*/
|
||||
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
|
||||
namespace triton
|
||||
@@ -31,65 +30,65 @@ namespace driver
|
||||
|
||||
//Helpers for function definition
|
||||
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
//Specialized helpers for CUDA
|
||||
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
|
||||
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
|
||||
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
|
||||
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
|
||||
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
|
||||
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
|
||||
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
|
||||
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
|
||||
|
||||
/* ------------------- *
|
||||
* CUDA
|
||||
* ------------------- */
|
||||
|
||||
bool dispatch::cuinit(){
|
||||
if(cuda_==nullptr){
|
||||
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
|
||||
return true;
|
||||
}
|
||||
|
||||
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
|
||||
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
|
||||
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
|
||||
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
|
||||
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
// context management
|
||||
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
|
||||
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
|
||||
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
|
||||
// device management
|
||||
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
|
||||
|
||||
// link management
|
||||
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
|
||||
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
|
||||
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
|
||||
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
|
||||
// module management
|
||||
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
|
||||
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
|
||||
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
|
||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||
// stream management
|
||||
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
|
||||
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
|
||||
// function management
|
||||
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
|
||||
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
|
||||
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
|
||||
// memory management
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
|
||||
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
|
||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
|
||||
// event management
|
||||
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
|
||||
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
|
||||
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
||||
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* NVML
|
||||
* ------------------- */
|
||||
bool dispatch::nvmlinit(){
|
||||
if(nvml_==nullptr)
|
||||
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
|
||||
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
|
||||
return res;
|
||||
}
|
||||
|
||||
//CUDA
|
||||
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
|
||||
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
|
||||
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
|
||||
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
|
||||
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
|
||||
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
|
||||
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
|
||||
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
|
||||
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
|
||||
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
|
||||
|
||||
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
|
||||
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
|
||||
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
|
||||
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
|
||||
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
|
||||
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
|
||||
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
|
||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
|
||||
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
|
||||
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
|
||||
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
|
||||
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
|
||||
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
|
||||
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
|
||||
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
|
||||
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
|
||||
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
|
||||
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
|
||||
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
|
||||
|
||||
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
bool dispatch::hipinit(){
|
||||
if(hip_==nullptr)
|
||||
hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
|
||||
if(hip_ == nullptr)
|
||||
return false;
|
||||
hipError_t (*fptr)();
|
||||
hipInit_ = dlsym(hip_, "hipInit");
|
||||
*reinterpret_cast<void **>(&fptr) = hipInit_;
|
||||
hipError_t res = (*fptr)();
|
||||
check(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
|
||||
#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
|
||||
#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
|
||||
#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
|
||||
#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
// context management
|
||||
HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
|
||||
HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
|
||||
HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
|
||||
HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
|
||||
HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
|
||||
HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipInit, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
|
||||
// device management
|
||||
HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
|
||||
HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
|
||||
// module management
|
||||
HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
|
||||
HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
|
||||
HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
|
||||
HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
|
||||
HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
|
||||
HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
|
||||
// stream management
|
||||
HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
|
||||
HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
|
||||
HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
|
||||
// function management
|
||||
HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
|
||||
HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
|
||||
// memory management
|
||||
HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
|
||||
HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
|
||||
HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
|
||||
HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
|
||||
HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
|
||||
// event management
|
||||
HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
|
||||
HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
|
||||
HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
|
||||
HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* COMMON
|
||||
* ------------------- */
|
||||
|
||||
// Release
|
||||
void dispatch::release(){
|
||||
@@ -190,61 +291,9 @@ void dispatch::release(){
|
||||
|
||||
void* dispatch::cuda_;
|
||||
void* dispatch::nvml_;
|
||||
|
||||
//CUDA
|
||||
void* dispatch::cuCtxGetCurrent_;
|
||||
void* dispatch::cuCtxSetCurrent_;
|
||||
void* dispatch::cuCtxDestroy_v2_;
|
||||
void* dispatch::cuEventCreate_;
|
||||
void* dispatch::cuDeviceGet_;
|
||||
void* dispatch::cuMemcpyDtoH_v2_;
|
||||
void* dispatch::cuStreamCreate_;
|
||||
void* dispatch::cuEventElapsedTime_;
|
||||
void* dispatch::cuMemFree_v2_;
|
||||
void* dispatch::cuMemcpyDtoHAsync_v2_;
|
||||
void* dispatch::cuDriverGetVersion_;
|
||||
void* dispatch::cuDeviceGetName_;
|
||||
void* dispatch::cuDeviceGetPCIBusId_;
|
||||
void* dispatch::cuModuleGetGlobal_v2_;
|
||||
|
||||
void* dispatch::cuLinkAddData_v2_;
|
||||
void* dispatch::cuLinkCreate_v2_;
|
||||
void* dispatch::cuLinkDestroy_;
|
||||
void* dispatch::cuModuleLoadData_;
|
||||
void* dispatch::cuLinkComplete_;
|
||||
|
||||
void* dispatch::cuMemcpyHtoDAsync_v2_;
|
||||
void* dispatch::cuModuleLoad_;
|
||||
void* dispatch::cuLaunchKernel_;
|
||||
void* dispatch::cuModuleUnload_;
|
||||
void* dispatch::cuModuleLoadDataEx_;
|
||||
void* dispatch::cuDeviceGetAttribute_;
|
||||
void* dispatch::cuDeviceGetCount_;
|
||||
void* dispatch::cuMemcpyHtoD_v2_;
|
||||
void* dispatch::cuInit_;
|
||||
void* dispatch::cuEventRecord_;
|
||||
void* dispatch::cuCtxCreate_v2_;
|
||||
void* dispatch::cuModuleGetFunction_;
|
||||
void* dispatch::cuStreamSynchronize_;
|
||||
void* dispatch::cuStreamDestroy_v2_;
|
||||
void* dispatch::cuStreamGetCtx_;
|
||||
void* dispatch::cuEventDestroy_v2_;
|
||||
void* dispatch::cuMemAlloc_v2_;
|
||||
void* dispatch::cuPointerGetAttribute_;
|
||||
void* dispatch::cuCtxGetDevice_;
|
||||
void* dispatch::cuMemsetD8Async_;
|
||||
void* dispatch::cuCtxPushCurrent_v2_;
|
||||
void* dispatch::cuCtxPopCurrent_v2_;
|
||||
void* dispatch::cuFuncGetAttribute_;
|
||||
void* dispatch::cuFuncSetAttribute_;
|
||||
void* dispatch::cuFuncSetCacheConfig_;
|
||||
void* dispatch::cuCtxEnablePeerAccess_;
|
||||
|
||||
void* dispatch::nvmlInit_v2_;
|
||||
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
|
||||
void* dispatch::nvmlDeviceGetClockInfo_;
|
||||
void* dispatch::nvmlDeviceGetMaxClockInfo_;
|
||||
void* dispatch::nvmlDeviceSetApplicationsClocks_;
|
||||
void* dispatch::hip_;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -94,6 +94,73 @@ void check(CUresult err)
|
||||
}
|
||||
}
|
||||
|
||||
void check(hipError_t error) {
|
||||
using namespace exception::hip;
|
||||
switch(error)
|
||||
{
|
||||
case hipSuccess : break;
|
||||
case hipErrorInvalidValue : throw invalid_value();
|
||||
case hipErrorMemoryAllocation : throw out_of_memory();
|
||||
case hipErrorNotInitialized : throw not_initialized();
|
||||
case hipErrorDeinitialized : throw deinitialized();
|
||||
case hipErrorProfilerDisabled : throw profiler_disabled();
|
||||
case hipErrorProfilerNotInitialized : throw profiler_not_initialized();
|
||||
case hipErrorProfilerAlreadyStarted : throw profiler_already_started();
|
||||
case hipErrorProfilerAlreadyStopped : throw profiler_already_stopped();
|
||||
case hipErrorNoDevice : throw no_device();
|
||||
case hipErrorInvalidSymbol : throw invalid_symbol();
|
||||
case hipErrorInvalidDevice : throw invalid_device();
|
||||
case hipErrorInvalidImage : throw invalid_image();
|
||||
case hipErrorInvalidContext : throw invalid_context();
|
||||
case hipErrorContextAlreadyCurrent : throw context_already_current();
|
||||
case hipErrorMapFailed : throw map_failed();
|
||||
case hipErrorUnmapFailed : throw unmap_failed();
|
||||
case hipErrorArrayIsMapped : throw array_is_mapped();
|
||||
case hipErrorAlreadyMapped : throw already_mapped();
|
||||
case hipErrorNoBinaryForGpu : throw no_binary_for_gpu();
|
||||
case hipErrorAlreadyAcquired : throw already_acquired();
|
||||
case hipErrorNotMapped : throw not_mapped();
|
||||
case hipErrorNotMappedAsArray : throw not_mapped_as_array();
|
||||
case hipErrorNotMappedAsPointer : throw not_mapped_as_pointer();
|
||||
case hipErrorECCNotCorrectable : throw ecc_uncorrectable();
|
||||
case hipErrorUnsupportedLimit : throw unsupported_limit();
|
||||
case hipErrorContextAlreadyInUse : throw context_already_in_use();
|
||||
case hipErrorPeerAccessUnsupported : throw peer_access_unsupported();
|
||||
case hipErrorInvalidKernelFile : throw invalid_ptx();
|
||||
case hipErrorInvalidGraphicsContext : throw invalid_graphics_context();
|
||||
case hipErrorInvalidSource : throw invalid_source();
|
||||
case hipErrorFileNotFound : throw file_not_found();
|
||||
case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
|
||||
case hipErrorSharedObjectInitFailed : throw shared_object_init_failed();
|
||||
case hipErrorOperatingSystem : throw operating_system();
|
||||
case hipErrorInvalidResourceHandle : throw invalid_handle();
|
||||
case hipErrorNotFound : throw not_found();
|
||||
case hipErrorNotReady : throw not_ready();
|
||||
case hipErrorIllegalAddress : throw illegal_address();
|
||||
case hipErrorLaunchOutOfResources : throw launch_out_of_resources();
|
||||
case hipErrorLaunchTimeOut : throw launch_timeout();
|
||||
// case hipErrorLaunchIncompatibleTexturing : throw launch_incompatible_texturing();
|
||||
case hipErrorPeerAccessAlreadyEnabled : throw peer_access_already_enabled();
|
||||
case hipErrorPeerAccessNotEnabled : throw peer_access_not_enabled();
|
||||
// case hipErrorPrimaryContextActive : throw primary_context_active();
|
||||
// case hipErrorContextIsDestroyed : throw context_is_destroyed();
|
||||
case hipErrorAssert : throw assert_error();
|
||||
// case hipErrorTooManyPeers : throw too_many_peers();
|
||||
case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
|
||||
case hipErrorHostMemoryNotRegistered : throw host_memory_not_registered();
|
||||
// case hipErrorHardwareStackError : throw hardware_stack_error();
|
||||
// case hipErrorIllegalInstruction : throw illegal_instruction();
|
||||
// case hipErrorMisalignedAddress : throw misaligned_address();
|
||||
// case hipErrorInvalidAddressSpace : throw invalid_address_space();
|
||||
// case hipErrorInvalidPc : throw invalid_pc();
|
||||
case hipErrorLaunchFailure : throw launch_failed();
|
||||
// case hipErrorNotPermitted : throw not_permitted();
|
||||
case hipErrorNotSupported : throw not_supported();
|
||||
case hipErrorUnknown : throw unknown();
|
||||
default : throw unknown();
|
||||
}
|
||||
}
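Call sites can route every dispatched HIP return code through this overload so failures surface as the typed exceptions declared in error.h, mirroring the CUresult overload above. A hypothetical example:

  hipDeviceptr_t ptr;
  triton::driver::check(triton::driver::dispatch::hipMalloc(&ptr, 4096));  // throws exception::hip::out_of_memory on allocation failure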
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,91 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/error.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
//Host
|
||||
inline void _delete(host_platform_t) { }
|
||||
inline void _delete(host_device_t) { }
|
||||
inline void _delete(host_context_t) { }
|
||||
inline void _delete(host_module_t) { }
|
||||
inline void _delete(host_stream_t) { }
|
||||
inline void _delete(host_buffer_t x) { if(x.data) delete[] x.data; }
|
||||
inline void _delete(host_function_t) { }
|
||||
|
||||
//CUDA
|
||||
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
|
||||
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
|
||||
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
|
||||
inline void _delete(CUdevice) { }
|
||||
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
|
||||
inline void _delete(CUfunction) { }
|
||||
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
|
||||
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
|
||||
inline void _delete(CUPlatform){}
|
||||
|
||||
//Constructor
|
||||
template<class T>
|
||||
handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
|
||||
{ }
|
||||
|
||||
template<class T>
|
||||
handle<T>::handle(): has_ownership_(false){ }
|
||||
|
||||
|
||||
template<class T>
|
||||
handle<T>::~handle(){
|
||||
try{
|
||||
if(has_ownership_ && h_ && h_.unique())
|
||||
_delete(*h_);
|
||||
}catch(const exception::cuda::base&){
|
||||
// order of destruction for global variables
|
||||
// is not guaranteed
|
||||
}
|
||||
}
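Release is reference counted through the shared_ptr member: the wrapped object is destroyed exactly once, by the last remaining copy, and only if the handle was built with take_ownership=true. A hypothetical illustration (`stream` is an existing CUstream):

  {
    triton::driver::handle<CUstream> owner(stream, /*take_ownership=*/true);
    triton::driver::handle<CUstream> alias = owner;  // copies share the same shared_ptr and ownership flag
  } // cuStreamDestroy runs once here, when the last copy goes out of scope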
|
||||
|
||||
template class handle<CUdeviceptr>;
|
||||
template class handle<CUstream>;
|
||||
template class handle<CUcontext>;
|
||||
template class handle<CUdevice>;
|
||||
template class handle<cu_event_t>;
|
||||
template class handle<CUfunction>;
|
||||
template class handle<CUmodule>;
|
||||
template class handle<CUPlatform>;
|
||||
|
||||
template class handle<host_platform_t>;
|
||||
template class handle<host_device_t>;
|
||||
template class handle<host_context_t>;
|
||||
template class handle<host_module_t>;
|
||||
template class handle<host_stream_t>;
|
||||
template class handle<host_buffer_t>;
|
||||
template class handle<host_function_t>;
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,94 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
|
||||
polymorphic_resource(fn, has_ownership), program_(program){
|
||||
}
|
||||
|
||||
|
||||
kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
|
||||
polymorphic_resource(fn, has_ownership), program_(program){
|
||||
}
|
||||
|
||||
kernel* kernel::create(driver::module* program, const char* name) {
|
||||
switch(program->backend()){
|
||||
case CUDA: return new cu_kernel(program, name);
|
||||
case Host: return new host_kernel(program, name);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
driver::module* kernel::module() {
|
||||
return program_;
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) {
|
||||
hst_->fn = program->hst()->functions.at(name);
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) {
|
||||
dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
|
||||
dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);
|
||||
// properties
|
||||
int shared_total, shared_optin, shared_static;
|
||||
int n_spills, n_reg;
|
||||
CUdevice dev;
|
||||
dispatch::cuCtxGetDevice(&dev);
|
||||
dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
|
||||
dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
|
||||
dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
|
||||
dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, *cu_);
|
||||
dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
|
||||
// std::cout << n_reg << std::endl;
|
||||
if (shared_optin > 49152){
|
||||
// std::cout << "dynamic shared memory " << shared_optin << " " << shared_static << std::endl;
|
||||
dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
|
||||
}
|
||||
}
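The 49152-byte threshold is the 48 KiB static shared-memory limit; when the device's opt-in limit is larger, the constructor exposes the remainder as dynamic shared memory. A worked example with assumed numbers (illustration only, not queried from a real device):

  int shared_optin  = 166912;  // device allowing roughly 163 KiB per block via opt-in
  int shared_static = 2048;    // shared memory the kernel already declares statically
  // shared_optin > 49152, so the opt-in branch runs:
  int max_dynamic = shared_optin - shared_static;  // 164864 bytes usable as dynamic shared memory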
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
lib/driver/llvm.cc (new file, 324 lines)
@@ -0,0 +1,324 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include "triton/driver/llvm.h"
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/tools/sha1.hpp"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
#include "triton/tools/sys/exec.hpp"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IR/IRPrintingPasses.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/CodeGen.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
|
||||
// begin AMD stuff
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/FormattedStream.h"
|
||||
#include "llvm/Support/Program.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
// end AMD stuff
|
||||
|
||||
namespace triton{
|
||||
namespace driver{
|
||||
|
||||
void init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
LLVMInitializeAMDGPUTargetInfo();
|
||||
LLVMInitializeAMDGPUTarget();
|
||||
LLVMInitializeAMDGPUTargetMC();
|
||||
LLVMInitializeAMDGPUAsmPrinter();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
|
||||
size_t start_replace = str.find(begin);
|
||||
size_t end_replace = str.find(end, start_replace);
|
||||
if(start_replace == std::string::npos)
|
||||
return false;
|
||||
str.replace(start_replace, end_replace + 1 - start_replace, target);
|
||||
return true;
|
||||
}
|
||||
|
||||
int vptx(int version){
|
||||
if(version >= 11030) return 73;
|
||||
if(version >= 11020) return 72;
|
||||
if(version >= 11010) return 71;
|
||||
if(version >= 11000) return 70;
|
||||
if(version >= 10020) return 65;
|
||||
if(version >= 10010) return 64;
|
||||
if(version >= 10000) return 63;
|
||||
throw std::runtime_error("Triton requires CUDA 10+");
|
||||
}
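vptx() maps the driver-reported CUDA version (e.g. 11020 for CUDA 11.2) to the newest PTX ISA that toolchain understands; llir_to_ptx then splits the result into the digits of the .version directive. For instance:

  int isa = vptx(11020);     // 72
  int ptx_major = isa / 10;  // 7
  int ptx_minor = isa % 10;  // 2  -> ".version 7.2" in the emitted PTX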
|
||||
|
||||
std::string llir_to_ptx(llvm::Module* module, int cc, int version){
|
||||
// LLVM version in use may not officially support target hardware
|
||||
int max_nvvm_cc = 75;
|
||||
int max_nvvm_ptx = 64;
|
||||
// options
|
||||
auto options = llvm::cl::getRegisteredOptions();
|
||||
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
|
||||
assert(short_ptr);
|
||||
short_ptr->setValue(true);
|
||||
// compute capability
|
||||
std::string sm = "sm_" + std::to_string(cc);
|
||||
// max PTX version
|
||||
int ptx = vptx(version);
|
||||
int ptx_major = ptx / 10;
|
||||
int ptx_minor = ptx % 10;
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "nvptx64-nvidia-cuda";
|
||||
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
|
||||
std::string layout = "";
|
||||
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
|
||||
init_llvm();
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
|
||||
pass.run(*module);
|
||||
|
||||
// post-process
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
|
||||
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
|
||||
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
|
||||
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
|
||||
// JIT compile source-code
|
||||
try{
|
||||
// use ptxas if present in PATH. Otherwise, use JIT from the driver
|
||||
std::string ptxas = "ptxas";
|
||||
std::string version;
|
||||
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
|
||||
|
||||
// Use PTXAS via system call
|
||||
if(use_system_ptxas){
|
||||
// compile ptx with ptxas
|
||||
char _fsrc[] = "/tmp/triton_k_XXXXXX";
|
||||
char _flog[] = "/tmp/triton_l_XXXXXX";
|
||||
mkstemp(_fsrc);
|
||||
mkstemp(_flog);
|
||||
std::string fsrc = _fsrc;
|
||||
std::string flog = _flog;
|
||||
std::ofstream ofs(fsrc);
|
||||
ofs << ptx;
|
||||
ofs.close();
|
||||
std::string cmd;
|
||||
int err;
|
||||
cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
|
||||
err = system(cmd.c_str());
|
||||
CUmodule ret;
|
||||
dispatch::cuModuleLoad(&ret, (fsrc + ".o").c_str());
|
||||
unlink(_fsrc);
|
||||
unlink(_flog);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Use PTXAS included in driver
|
||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_LOG_VERBOSE};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
|
||||
CUmodule ret;
|
||||
dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
|
||||
return ret;
|
||||
}
|
||||
catch(exception::cuda::invalid_ptx const &){
|
||||
std::cout << ptx << std::endl;
|
||||
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
||||
throw;
|
||||
}
|
||||
}
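Putting the two NVIDIA steps together, a hypothetical caller compiles the LLVM module to PTX and then loads it as a CUmodule (the surrounding names are assumed for illustration):

  // `mod` is a std::unique_ptr<llvm::Module> produced by Triton's codegen passes.
  int cc = 80;                                                     // e.g. sm_80
  std::string ptx = triton::driver::llir_to_ptx(mod.get(), cc, 11020);
  CUmodule cumod  = triton::driver::ptx_to_cumodule(ptx, cc);      // uses ptxas from PATH if available, driver JIT otherwise
  CUfunction fn;
  triton::driver::dispatch::cuModuleGetFunction(&fn, cumod, "kernel0");  // "kernel0" is a placeholder name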
|
||||
|
||||
/* ------------------------ */
|
||||
// HIP //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
|
||||
init_llvm();
|
||||
|
||||
// proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
|
||||
// features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
|
||||
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "amdgcn-amd-amdhsa";
|
||||
std::string layout = "";
|
||||
std::string features;
|
||||
std::string proc = "gfx908";
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None,
|
||||
llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
|
||||
// create dump files
|
||||
std::string module_name = module->getModuleIdentifier();
|
||||
std::error_code ec;
|
||||
|
||||
// Save GCN ISA binary.
|
||||
std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
|
||||
std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
|
||||
new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
|
||||
if (ec)
|
||||
{
|
||||
std::cout << isabin_path << " was not created. error code: " << ec << std::endl;
|
||||
}
|
||||
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile);
|
||||
pass.run(*module);
|
||||
// Save GCN ISA.
|
||||
std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
std::ofstream amdgcn(amdgcn_path);
|
||||
amdgcn << result;
|
||||
amdgcn.close();
|
||||
|
||||
// generate HASCO file
|
||||
std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
|
||||
std::string error_message;
|
||||
int lld_result =
|
||||
llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
|
||||
{"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
|
||||
llvm::None, {}, 0, 0, &error_message);
|
||||
if (lld_result)
|
||||
{
|
||||
std::cout << "ld.lld execute fail: " << std::endl;
|
||||
std::cout << error_message << std::endl;
|
||||
std::cout << lld_result << std::endl;
|
||||
}
|
||||
|
||||
return hsaco_path;
|
||||
}
|
||||
|
||||
|
||||
hipModule_t amdgpu_to_hipmodule(const std::string& path) {
|
||||
// Read HSACO.
|
||||
std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
|
||||
std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
|
||||
|
||||
std::vector<unsigned char> hsaco(hsaco_file_size);
|
||||
hsaco_file.seekg(0, std::ios::beg);
|
||||
hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
|
||||
hsaco_file.close();
|
||||
hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
|
||||
hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
|
||||
hipJitOptionLogVerbose};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize,
|
||||
(void*)_err, (void*)(uintptr_t)logbufsize,
|
||||
(void*)_log, (void*)1};
|
||||
hipModule_t ret;
|
||||
dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
|
||||
return ret;
|
||||
}
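The AMD path mirrors the CUDA one: the module is lowered to a GCN object, linked into an hsaco with ld.lld, and loaded through the HIP dispatcher. A hypothetical end-to-end sketch (the gfx target is currently hard-coded to gfx908 above):

  // `mod` is a std::unique_ptr<llvm::Module> produced by Triton's codegen passes.
  std::string hsaco_path = triton::driver::llir_to_amdgpu(mod.get(), "gfx908");
  hipModule_t hmod = triton::driver::amdgpu_to_hipmodule(hsaco_path);
  hipFunction_t hfn;
  triton::driver::dispatch::hipModuleGetFunction(&hfn, hmod, "kernel0");  // "kernel0" is a placeholder name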
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -1,375 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/tools/sha1.hpp"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
#include "triton/tools/sys/exec.hpp"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IR/IRPrintingPasses.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/CodeGen.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
|
||||
std::string exec(const char* cmd) {
|
||||
std::array<char, 128> buffer;
|
||||
std::string result;
|
||||
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
|
||||
if (!pipe) {
|
||||
throw std::runtime_error("popen() failed!");
|
||||
}
|
||||
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
|
||||
result += buffer.data();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void LLVMInitializeNVPTXTargetInfo();
|
||||
void LLVMInitializeNVPTXTarget();
|
||||
void LLVMInitializeNVPTXTargetMC();
|
||||
void LLVMInitializeNVPTXAsmPrinter();
|
||||
void LLVMInitializeNVPTXAsmParser();
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
|
||||
void module::init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
module::module(CUmodule mod, bool has_ownership)
|
||||
: polymorphic_resource(mod, has_ownership), spilled_(0) {
|
||||
}
|
||||
|
||||
module::module(host_module_t mod, bool has_ownership)
|
||||
: polymorphic_resource(mod, has_ownership), spilled_(0) {
|
||||
}
|
||||
|
||||
|
||||
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
|
||||
switch(device->backend()){
|
||||
case CUDA: return new cu_module(device, std::move(src));
|
||||
case Host: return new host_module(std::move(src));
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
||||
const std::string &proc, std::string layout,
|
||||
llvm::SmallVectorImpl<char> &buffer,
|
||||
const std::string& features,
|
||||
file_type_t ft) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
|
||||
throw std::runtime_error("CPU unsupported");
|
||||
// init_llvm();
|
||||
// // create kernel wrapper
|
||||
// llvm::LLVMContext &ctx = src->getContext();
|
||||
// llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
|
||||
// llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
|
||||
// llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
|
||||
// std::vector<llvm::Type*> tys = {args_ty, int32_ty, int32_ty, int32_ty};
|
||||
// llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, tys, false);
|
||||
// llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "_main", &*src);
|
||||
// llvm::Function* fn = &*src->getFunctionList().begin();
|
||||
// llvm::FunctionType *fn_ty = fn->getFunctionType();
|
||||
// std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
|
||||
// std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
|
||||
// llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
|
||||
// llvm::IRBuilder<> ir_builder(ctx);
|
||||
// ir_builder.SetInsertPoint(entry);
|
||||
// auto get_size = [](llvm::Type* ty) { return ty->isPointerTy() ? sizeof(char*) : ty->getPrimitiveSizeInBits() / 8; };
|
||||
// llvm::Value* base = main->arg_begin();
|
||||
// llvm::Value* args_base = ir_builder.CreateBitCast(base, base->getType()->getPointerElementType());
|
||||
|
||||
// size_t offset = 0;
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++){
|
||||
// ptrs[i] = ir_builder.CreateGEP(args_base, ir_builder.getInt32(offset));
|
||||
// size_t nbytes = get_size(fn_ty->getParamType(i));
|
||||
// offset += nbytes;
|
||||
// if(i < ptrs.size() - 1){
|
||||
// size_t np1bytes = get_size(fn_ty->getParamType(i+1));
|
||||
// offset = (offset + np1bytes - 1) / np1bytes * np1bytes;
|
||||
// }
|
||||
// }
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++)
|
||||
// ptrs[i] = ir_builder.CreateBitCast(ptrs[i], fn_ty->getParamType(i)->getPointerTo());
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++)
|
||||
// fn_args[i] = ir_builder.CreateLoad(ptrs[i]);
|
||||
|
||||
// fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
|
||||
// fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
|
||||
// fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
|
||||
// ir_builder.CreateCall(fn, fn_args);
|
||||
// ir_builder.CreateRetVoid();
|
||||
|
||||
//// llvm::legacy::PassManager pm;
|
||||
//// pm.add(llvm::createPrintModulePass(llvm::outs()));
|
||||
//// pm.add(llvm::createVerifierPass());
|
||||
//// pm.run(*src);
|
||||
|
||||
//// create execution engine
|
||||
// for(llvm::Function& fn: src->functions())
|
||||
// hst_->functions[fn.getName().str()] = &fn;
|
||||
|
||||
//// llvm::orc::JITTargetMachineBuilder JTMB = *llvm::orc::JITTargetMachineBuilder::detectHost();
|
||||
//// auto DL = JTMB.getDefaultDataLayoutForTarget();
|
||||
//// auto CIRC = std::unique_ptr<llvm::orc::ConcurrentIRCompiler>(new llvm::orc::ConcurrentIRCompiler(JTMB));
|
||||
//// hst_->ES = new llvm::orc::ExecutionSession();
|
||||
//// hst_->ObjectLayer = new llvm::orc::RTDyldObjectLinkingLayer(*hst_->ES, []() { return std::unique_ptr<llvm::SectionMemoryManager>(new llvm::SectionMemoryManager()); });
|
||||
//// hst_->CompileLayer = new llvm::orc::IRCompileLayer(*hst_->ES, *hst_->ObjectLayer, *CIRC);
|
||||
//// hst_->DL = new llvm::DataLayout(std::move(*DL));
|
||||
//// hst_->Mangle = new llvm::orc::MangleAndInterner(*hst_->ES, *hst_->DL);
|
||||
//// hst_->Ctx = new llvm::orc::ThreadSafeContext(std::unique_ptr<llvm::LLVMContext>(new llvm::LLVMContext()));
|
||||
//// hst_->MainJD = &hst_->ES->createJITDylib("<main>");
|
||||
//// hst_->MainJD->setGenerator(llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
|
||||
//// hst_->DL->getGlobalPrefix())));
|
||||
//// llvm::cantFail(hst_->CompileLayer->add(*hst_->MainJD, llvm::orc::ThreadSafeModule(std::move(src), *hst_->Ctx)));
|
||||
//// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->ES->lookup({hst_->MainJD}, (*hst_->Mangle)("_main"))->getAddress());
|
||||
|
||||
|
||||
|
||||
// llvm::EngineBuilder builder(std::move(src));
|
||||
// builder.setErrorStr(&hst_->error);
|
||||
// builder.setMCJITMemoryManager(std::make_unique<llvm::SectionMemoryManager>());
|
||||
// builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
|
||||
// builder.setEngineKind(llvm::EngineKind::JIT);
|
||||
// hst_->engine = builder.create();
|
||||
// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->engine->getFunctionAddress("_main"));
|
||||
}
|
||||
|
||||
std::unique_ptr<buffer> host_module::symbol(const char *name) const {
|
||||
throw std::runtime_error("not implemented");
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
|
||||
size_t start_replace = str.find(begin);
|
||||
size_t end_replace = str.find(end, start_replace);
|
||||
if(start_replace == std::string::npos)
|
||||
return false;
|
||||
str.replace(start_replace, end_replace + 1 - start_replace, target);
|
||||
return true;
|
||||
}
|
||||
|
||||
//static std::map<int, int> vptx = {
|
||||
// {10000, 63},
|
||||
// {10010, 64},
|
||||
// {10020, 65},
|
||||
// {11000, 70},
|
||||
// {11010, 71},
|
||||
// {11020, 72},
|
||||
// {11030, 73},
|
||||
// {11040, 73}
|
||||
//};
|
||||
|
||||
int vptx(int version){
|
||||
if(version >= 11030) return 73;
|
||||
if(version >= 11020) return 72;
|
||||
if(version >= 11010) return 71;
|
||||
if(version >= 11000) return 70;
|
||||
if(version >= 10020) return 65;
|
||||
if(version >= 10010) return 64;
|
||||
if(version >= 10000) return 63;
|
||||
throw std::runtime_error("Triton requires CUDA 10+");
|
||||
}
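The mapping above ties the installed CUDA driver version to the newest PTX ISA that driver can ingest. Purely as an illustration of the same table (this mirrors vptx(), it adds nothing new):

def ptx_isa_for_cuda(driver_version: int) -> int:
    # newest PTX ISA accepted by a given CUDA driver version, same table as vptx()
    table = [(11030, 73), (11020, 72), (11010, 71), (11000, 70),
             (10020, 65), (10010, 64), (10000, 63)]
    for min_version, isa in table:
        if driver_version >= min_version:
            return isa
    raise RuntimeError("Triton requires CUDA 10+")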
|
||||
|
||||
std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
|
||||
// LLVM version in use may not officially support target hardware
|
||||
int max_nvvm_cc = 75;
|
||||
int max_nvvm_ptx = 64;
|
||||
// options
|
||||
auto options = llvm::cl::getRegisteredOptions();
|
||||
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
|
||||
assert(short_ptr);
|
||||
short_ptr->setValue(true);
|
||||
// compute capability
|
||||
int cc = ((driver::cu_device*)device)->compute_capability();
|
||||
std::string sm = "sm_" + std::to_string(cc);
|
||||
// driver version
|
||||
int version;
|
||||
dispatch::cuDriverGetVersion(&version);
|
||||
int ptx = vptx(version);
|
||||
int ptx_major = ptx / 10;
|
||||
int ptx_minor = ptx % 10;
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "nvptx64-nvidia-cuda";
|
||||
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
|
||||
std::string layout = "";
|
||||
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
|
||||
init_llvm();
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
|
||||
pass.run(*module);
|
||||
|
||||
// post-process
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
|
||||
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
|
||||
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
|
||||
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
|
||||
return result;
|
||||
}
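The post-processing at the end of compile_llvm_module only rewrites the .version/.target directives that LLVM emitted and strips the inline-asm marker comments. A hedged Python equivalent of that string surgery, for illustration only (the exact directive formats are assumptions about LLVM's PTX output):

import re

def patch_ptx(ptx: str, ptx_major: int, ptx_minor: int, sm: str) -> str:
    # retarget the PTX header at the ISA/arch actually supported by the driver
    ptx = re.sub(r"\.version \d+\.\d+", f".version {ptx_major}.{ptx_minor}", ptx, count=1)
    ptx = re.sub(r"\.target \S+", f".target {sm}", ptx, count=1)
    # drop the "// begin/end inline asm" marker lines
    ptx = re.sub(r"[ \t]*// (?:begin|end) inline asm\n", "", ptx)
    return ptx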
|
||||
|
||||
void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
|
||||
// JIT compile source-code
|
||||
try{
|
||||
// use ptxas if present in PATH. Otherwise, use JIT from the driver
|
||||
std::string ptxas = "ptxas";
|
||||
std::string version;
|
||||
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
|
||||
|
||||
// Use PTXAS via system call
|
||||
if(use_system_ptxas){
|
||||
// compile ptx with ptxas
|
||||
char _fsrc[] = "/tmp/triton_k_XXXXXX";
|
||||
char _flog[] = "/tmp/triton_l_XXXXXX";
|
||||
mkstemp(_fsrc);
|
||||
mkstemp(_flog);
|
||||
std::string fsrc = _fsrc;
|
||||
std::string flog = _flog;
|
||||
std::ofstream ofs(fsrc);
|
||||
ofs << ptx;
|
||||
ofs.close();
|
||||
std::string cmd;
|
||||
int err;
|
||||
std::string cc = std::to_string(device->compute_capability());
|
||||
cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
|
||||
err = system(cmd.c_str());
|
||||
dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
|
||||
unlink(_fsrc);
|
||||
unlink(_flog);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use PTXAS included in driver
|
||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_LOG_VERBOSE};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
|
||||
dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
|
||||
}
|
||||
catch(exception::cuda::invalid_ptx const &){
|
||||
//#ifdef TRITON_LOG_PTX_ERROR
|
||||
std::cout << ptx << std::endl;
|
||||
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
||||
// exit(1);
|
||||
//#endif
|
||||
throw;
|
||||
}
|
||||
}
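When a system ptxas is found, the path above amounts to "write the PTX to a temp file, shell out to ptxas, load the resulting cubin". A rough Python sketch of that flow (illustrative only; the flags are copied from the command line built in init_from_ptx above):

import os
import subprocess
import tempfile

def assemble_with_ptxas(ptx: str, compute_capability: int) -> str:
    fd, src = tempfile.mkstemp(suffix=".ptx")
    with os.fdopen(fd, "w") as f:
        f.write(ptx)
    out = src + ".o"
    # same invocation as the cmd string assembled in init_from_ptx
    subprocess.check_call(["ptxas", "-v", f"--gpu-name=sm_{compute_capability}", src, "-o", out])
    return out  # cubin path, subsequently loaded with cuModuleLoad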
|
||||
|
||||
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
|
||||
llvm::raw_string_ostream oss(llir_);
|
||||
oss << *ll_module;
|
||||
oss.flush();
|
||||
ptx_ = compile_llvm_module(ll_module.get(), device);
|
||||
init_from_ptx(ptx_, (driver::cu_device*)device);
|
||||
}
|
||||
|
||||
cu_module::cu_module(driver::device* device, std::string const & source) : module(CUmodule(), true), ptx_(source){
|
||||
init_from_ptx(ptx_, (driver::cu_device*)device);
|
||||
}
|
||||
|
||||
std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
|
||||
CUdeviceptr handle;
|
||||
size_t size;
|
||||
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
|
||||
std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
|
||||
return std::move(res);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -1,68 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include "triton/driver/platform.h"
|
||||
#include "triton/driver/device.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string cu_platform::version() const{
|
||||
int version;
|
||||
dispatch::cuDriverGetVersion(&version);
|
||||
return std::to_string(version);
|
||||
}
|
||||
|
||||
void cu_platform::devices(std::vector<device *> &devices) const{
|
||||
int N;
|
||||
dispatch::cuDeviceGetCount(&N);
|
||||
for(int i = 0 ; i < N ; ++i){
|
||||
CUdevice dvc;
|
||||
dispatch::cuDeviceGet(&dvc, i);
|
||||
devices.push_back(new driver::cu_device(dvc));
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string host_platform::version() const {
|
||||
return "1.0";
|
||||
}
|
||||
|
||||
void host_platform::devices(std::vector<driver::device*> &devices) const {
|
||||
devices.push_back(new driver::host_device());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,142 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <unistd.h>
|
||||
#include <array>
|
||||
#include "triton/driver/backend.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/GenericValue.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
stream::stream(CUstream cu, bool has_ownership)
|
||||
: polymorphic_resource(cu, has_ownership) {
|
||||
}
|
||||
|
||||
|
||||
stream::stream(host_stream_t cl, bool has_ownership)
|
||||
: polymorphic_resource(cl, has_ownership) {
|
||||
}
|
||||
|
||||
driver::stream* stream::create(backend_t backend) {
|
||||
switch(backend){
|
||||
case CUDA: return new cu_stream();
|
||||
case Host: return new host_stream();
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_stream::host_stream(): stream(host_stream_t(), true) {
|
||||
hst_->pool.reset(new ThreadPool(1));
|
||||
hst_->futures.reset(new std::vector<std::future<void>>());
|
||||
}
|
||||
|
||||
void host_stream::synchronize() {
|
||||
for(auto& x: *hst_->futures)
|
||||
x.wait();
|
||||
hst_->futures->clear();
|
||||
hst_->args.clear();
|
||||
}
|
||||
|
||||
void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
|
||||
auto hst = kernel->module()->hst();
|
||||
hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
|
||||
char* params = new char[args_size];
|
||||
std::memcpy((void*)params, (void*)args, args_size);
|
||||
for(size_t i = 0; i < grid[0]; i++)
|
||||
for(size_t j = 0; j < grid[1]; j++)
|
||||
for(size_t k = 0; k < grid[2]; k++)
|
||||
hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
|
||||
}
|
||||
|
||||
void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
||||
std::memcpy((void*)buffer->hst()->data, ptr, size);
|
||||
}
|
||||
|
||||
void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
||||
std::memcpy(ptr, (const void*)buffer->hst()->data, size);
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
|
||||
cu_stream::cu_stream(CUstream str, bool take_ownership):
|
||||
stream(str, take_ownership) {
|
||||
}
|
||||
|
||||
cu_stream::cu_stream(): stream(CUstream(), true) {
|
||||
dispatch::cuStreamCreate(&*cu_, 0);
|
||||
}
|
||||
|
||||
void cu_stream::synchronize() {
|
||||
dispatch::cuStreamSynchronize(*cu_);
|
||||
}
|
||||
|
||||
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
|
||||
void *config[] = {
|
||||
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
CU_LAUNCH_PARAM_END
|
||||
};
|
||||
dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem, *cu_, nullptr, config);
|
||||
}
|
||||
|
||||
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
||||
if(blocking)
|
||||
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
|
||||
else
|
||||
dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
|
||||
}
|
||||
|
||||
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
||||
if(blocking)
|
||||
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
|
||||
else
|
||||
dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,7 +1,7 @@
|
||||
#include "triton/codegen/pass.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/codegen/target.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/driver/llvm.h"
|
||||
#include "triton/ir/builder.h"
|
||||
#include "triton/ir/dispatch.h"
|
||||
#include "triton/ir/enums.h"
|
||||
@@ -15,7 +15,9 @@
|
||||
#include <pybind11/stl.h>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
namespace ir = triton::ir;
|
||||
@@ -24,72 +26,213 @@ namespace drv = triton::driver;
|
||||
/*****************************************************************************/
|
||||
/* Python bindings for triton::driver */
|
||||
/*****************************************************************************/
|
||||
// information query
|
||||
template<CUdevice_attribute attr>
|
||||
int cuGetInfo(CUdevice device) {
|
||||
int res;
|
||||
drv::dispatch::cuDeviceGetAttribute(&res, attr, device);
|
||||
return res;
|
||||
}
|
||||
|
||||
void init_triton_driver(py::module &&m) {
|
||||
// base device
|
||||
py::class_<drv::device>(m, "device");
|
||||
// cuda device
|
||||
py::class_<drv::cu_device, drv::device>(m, "cu_device")
|
||||
.def(py::init([](int dev_id, bool take_ownership) {
|
||||
CUdevice handle;
|
||||
drv::dispatch::cuDeviceGet(&handle, dev_id);
|
||||
return new drv::cu_device(handle, take_ownership);
|
||||
}))
|
||||
.def("max_shared_memory", [](drv::cu_device *self) {
|
||||
return self->max_shared_memory();
|
||||
})
|
||||
.def("enable_peer_access", [](drv::cu_device *self, unsigned long long int peer_mem_ptr) {
|
||||
self->enable_peer_access(peer_mem_ptr);
|
||||
});
|
||||
// host device
|
||||
py::class_<drv::host_device, drv::device>(m, "host_device")
|
||||
.def(py::init<>());
|
||||
template<hipDeviceAttribute_t attr>
|
||||
int hipGetInfo(hipDevice_t device) {
|
||||
int res;
|
||||
drv::dispatch::hipDeviceGetAttribute(&res, attr, device);
|
||||
return res;
|
||||
}
|
||||
|
||||
// base stream
|
||||
py::class_<drv::stream>(m, "stream");
|
||||
// host stream
|
||||
py::class_<drv::host_stream, drv::stream>(m, "host_stream")
|
||||
.def(py::init<>());
|
||||
// cuda stream
|
||||
py::class_<drv::cu_stream, drv::stream>(m, "cu_stream")
|
||||
// py doesn't support opaque pointer (e.g., CUstream) so
|
||||
// we assume it has been converted to uint64_t
|
||||
.def(py::init([](uint64_t handle, bool take_ownership) {
|
||||
return std::unique_ptr<drv::cu_stream>(new drv::cu_stream((CUstream)handle, take_ownership));
|
||||
}))
|
||||
.def("enqueue", [](drv::cu_stream *self, drv::kernel *kernel,
|
||||
size_t grid_0, size_t grid_1, size_t grid_2,
|
||||
size_t block_0, size_t block_1, size_t block_2,
|
||||
const std::string &args,
|
||||
size_t shared_mem) {
|
||||
return self->enqueue(kernel, {grid_0, grid_1, grid_2}, {block_0, block_1, block_2},
|
||||
(void *)args.data(), args.size(), shared_mem);
|
||||
enum backend_t {
|
||||
HOST,
|
||||
CUDA,
|
||||
ROCM,
|
||||
};
|
||||
|
||||
void cu_enable_peer_access(uint64_t peer_ptr){
|
||||
CUcontext context;
|
||||
drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_ptr);
|
||||
try {
|
||||
drv::dispatch::cuCtxEnablePeerAccess(context, 0);
|
||||
} catch (drv::exception::cuda::peer_access_already_enabled) {}
|
||||
}
|
||||
|
||||
void host_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem){
|
||||
throw std::runtime_error("unsupported");
|
||||
// auto hst = kernel->module()->hst();
|
||||
// hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
|
||||
// char* params = new char[args_size];
|
||||
// std::memcpy((void*)params, (void*)args, args_size);
|
||||
// for(size_t i = 0; i < grid[0]; i++)
|
||||
// for(size_t j = 0; j < grid[1]; j++)
|
||||
// for(size_t k = 0; k < grid[2]; k++)
|
||||
// hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
|
||||
}
|
||||
|
||||
void cu_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem){
|
||||
void *config[] = {
|
||||
CU_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
CU_LAUNCH_PARAM_END
|
||||
};
|
||||
drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
|
||||
block_0, block_1, block_2,
|
||||
shared_mem, (CUstream)stream, nullptr, config);
|
||||
}
|
||||
|
||||
void hip_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem) {
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
drv::dispatch::hipModuleLaunchKernel((hipFunction_t)kernel, grid_0, grid_1, grid_2,
|
||||
block_0, block_1, block_2,
|
||||
shared_mem, (hipStream_t)stream, nullptr, config);
|
||||
|
||||
}
|
||||
|
||||
void init_triton_runtime(py::module &&m) {
|
||||
|
||||
// wrap backend_t
|
||||
py::enum_<backend_t>(m, "backend")
|
||||
.value("HOST", HOST)
|
||||
.value("CUDA", CUDA)
|
||||
.value("ROCM", ROCM)
|
||||
.export_values();
|
||||
|
||||
// enable peer-to-peer
|
||||
m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
|
||||
if (backend != CUDA)
|
||||
throw std::runtime_error("P2P only supported on CUDA devices!");
|
||||
cu_enable_peer_access(peer_ptr);
|
||||
}
|
||||
);
|
||||
|
||||
// query maximum shared memory
|
||||
m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
|
||||
if (backend == HOST)
|
||||
return 0;
|
||||
if(backend == CUDA)
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(device);
|
||||
if(backend == ROCM)
|
||||
return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
|
||||
return -1;
|
||||
});
|
||||
|
||||
py::class_<drv::module>(m, "module");
|
||||
// enqueue
|
||||
m.def("enqueue", [](backend_t backend, uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
const std::string &args, int64_t shared_mem){
|
||||
void* args_ptr = (void*)args.data();
|
||||
size_t args_size = args.size();
|
||||
if(backend == HOST)
|
||||
host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
if(backend == CUDA)
|
||||
cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
if(backend == ROCM)
|
||||
hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
});
|
||||
|
||||
py::class_<drv::cu_module, drv::module>(m, "cu_module")
|
||||
.def("ptx", &drv::cu_module::ptx)
|
||||
.def("cubin", [](drv::cu_module *self) { return py::bytes(self->cubin()); })
|
||||
.def("llir", &drv::cu_module::llir);
|
||||
|
||||
py::class_<drv::kernel>(m, "kernel");
|
||||
}
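Seen from Python, this runtime submodule is deliberately low-level: streams and kernel handles travel as plain integers, and scalar arguments arrive as a single packed byte string. A minimal usage sketch under stated assumptions (the import path and the two-argument kernel signature are hypothetical; the real packing and backend selection live in code_gen.py further down):

import struct

import torch
import triton._C.libtriton.triton as _triton  # import path assumed from the Python package


def launch(fun, x: torch.Tensor, n_elements: int, grid_0: int,
           num_warps: int = 4, shared_mem: int = 0):
    # pick the backend the same way code_gen.py does below
    backend = (_triton.runtime.backend.CUDA if torch.version.hip is None
               else _triton.runtime.backend.ROCM)
    # hypothetical kernel taking (pointer, int32); real kernels pack every scalar argument
    args = struct.pack("Pi", x.data_ptr(), n_elements)
    stream = torch.cuda.current_stream().cuda_stream
    _triton.runtime.enqueue(backend, stream, fun,
                            grid_0, 1, 1,           # grid
                            num_warps * 32, 1, 1,   # block
                            args, shared_mem)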
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Python bindings for triton::codegen */
|
||||
/*****************************************************************************/
|
||||
typedef std::map<std::string, std::string> asm_map_t;
|
||||
|
||||
|
||||
std::tuple<uint64_t, uint64_t> cu_compile_llir(const std::string& name, size_t n_shared_bytes, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map, int cc, int version){
|
||||
// LLVM-IR -> PTX
|
||||
std::string ptx = drv::llir_to_ptx(llvm, cc, version);
|
||||
asm_map["ptx"] = ptx;
|
||||
// PTX -> Binary
|
||||
CUmodule mod = drv::ptx_to_cumodule(ptx, cc);
|
||||
// Handle to the kernel
|
||||
CUfunction fun;
|
||||
drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
|
||||
// Dynamic shared memory
|
||||
int shared_optin;
|
||||
drv::dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
|
||||
if(n_shared_bytes > 49152 && shared_optin > 49152){
|
||||
drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
|
||||
int shared_total, shared_static;
|
||||
int n_spills, n_reg;
|
||||
drv::dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
|
||||
drv::dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
|
||||
drv::dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
|
||||
drv::dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, fun);
|
||||
drv::dispatch::cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
|
||||
}
|
||||
|
||||
// record asm
|
||||
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
|
||||
}
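The shared-memory handling above is the subtle part: the PREFER_SHARED cache config and the larger dynamic limit are only requested when the kernel actually needs more than the default 48KB, since opting in unconditionally can cost L1 capacity. Restated as a small Python sketch of the decision rule (illustrative only, not part of the patch):

DEFAULT_SMEM = 49152  # 48KB: per-block shared memory available without opting in

def dynamic_smem_limit(n_shared_bytes: int, shared_optin: int, shared_static: int):
    # mirror of the condition in cu_compile_llir: opt in only when both the kernel
    # and the device exceed the 48KB default
    if n_shared_bytes > DEFAULT_SMEM and shared_optin > DEFAULT_SMEM:
        # request CU_FUNC_CACHE_PREFER_SHARED and raise the dynamic limit
        return shared_optin - shared_static
    return None  # otherwise leave the driver defaults untouched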
|
||||
|
||||
std::tuple<uint64_t, uint64_t> hip_compile_llir(const std::string& name, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map){
|
||||
// LLVM-IR -> HSA-CO
|
||||
std::string path = drv::llir_to_amdgpu(llvm, "gfx908");
|
||||
// HSA-CO -> hipModule
|
||||
hipModule_t mod = drv::amdgpu_to_hipmodule(path);
|
||||
// Handle to the kernel
|
||||
hipFunction_t fun;
|
||||
drv::dispatch::hipModuleGetFunction(&fun, mod, name.c_str());
|
||||
// record asm
|
||||
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
|
||||
}
|
||||
|
||||
void init_triton_codegen(py::module &&m) {
|
||||
m.def(
|
||||
"add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps, int num_stages, bool force_nc_cache) {
|
||||
drv::module *mod;
|
||||
drv::kernel *ker;
|
||||
size_t shared_mem;
|
||||
triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, num_stages, force_nc_cache, mod, ker, shared_mem);
|
||||
std::stringstream ss;
|
||||
ir::print(ir, ss);
|
||||
return std::make_tuple(mod, ker, shared_mem, ss.str());
|
||||
"compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages, bool force_nc_cache) {
|
||||
std::string name = ir.get_function_list()[0]->get_name();
|
||||
// record asm as we generate
|
||||
asm_map_t asm_map;
|
||||
std::ostringstream ttir;
|
||||
ir::print(ir, ttir);
|
||||
asm_map["ttir"] = ttir.str();
|
||||
llvm::LLVMContext ctx;
|
||||
if(backend == CUDA){
|
||||
// device properties
|
||||
CUdevice dev = (CUdevice)device;
|
||||
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
|
||||
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
|
||||
size_t cc = major*10 + minor;
|
||||
int version;
|
||||
drv::dispatch::cuDriverGetVersion(&version);
|
||||
// Triton-IR -> NVPTX LLVM-IR
|
||||
triton::codegen::nvidia_cu_target target(cc);
|
||||
int n_shared_bytes;
|
||||
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc, num_warps, num_stages, force_nc_cache, n_shared_bytes);
|
||||
llvm::raw_string_ostream llir(asm_map["llir"]);
|
||||
llir << *llvm;
|
||||
llir.flush();
|
||||
// LLVM-IR -> Bin
|
||||
uint64_t mod, fun;
|
||||
std::tie(mod, fun) = cu_compile_llir(name, n_shared_bytes, &*llvm, device, asm_map, cc, version);
|
||||
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
|
||||
}
|
||||
if(backend == ROCM){
|
||||
// Triton-IR -> AMDGPU LLVM-IR
|
||||
triton::codegen::amd_cl_target target;
|
||||
int n_shared_bytes;
|
||||
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70, num_warps, num_stages, force_nc_cache, n_shared_bytes);
|
||||
llvm::raw_string_ostream llir(asm_map["llir"]);
|
||||
llir << *llvm;
|
||||
llir.flush();
|
||||
// LLVM-IR -> Bin
|
||||
uint64_t mod, fun;
|
||||
std::tie(mod, fun) = hip_compile_llir(name, &*llvm, device, asm_map);
|
||||
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
|
||||
}
|
||||
},
|
||||
py::return_value_policy::take_ownership);
|
||||
}
|
||||
@@ -302,7 +445,7 @@ void init_triton_ir(py::module &&m) {
|
||||
void init_triton(py::module &m) {
|
||||
py::module subm = m.def_submodule("triton");
|
||||
init_triton_codegen(std::move(subm.def_submodule("code_gen")));
|
||||
init_triton_driver(std::move(subm.def_submodule("driver")));
|
||||
init_triton_runtime(std::move(subm.def_submodule("runtime")));
|
||||
init_triton_ir(std::move(subm.def_submodule("ir")));
|
||||
init_triton_frontend(std::move(subm.def_submodule("frontend")));
|
||||
}
|
||||
|
@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
|
||||
return kernel
|
||||
|
||||
|
||||
|
||||
|
||||
# generic test functions
|
||||
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
|
||||
SIZE = 128
|
||||
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
|
||||
# compare
|
||||
triton.testing.assert_almost_equal(z_tri, z_ref)
|
||||
# parse ptx to make sure ld/st are vectorized
|
||||
ptx = pgm.asm('ptx')
|
||||
ptx = pgm.asm['ptx']
|
||||
assert 'ld.global.v4' in ptx
|
||||
assert 'st.global.v4' in ptx
|
||||
|
||||
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
|
||||
z_ref += z[0,:][None, :]
|
||||
z_ref = z_ref.to(torch.float16)
|
||||
# compare
|
||||
ptx = pgm.asm('ptx')
|
||||
ptx = pgm.asm['ptx']
|
||||
# print(ptx)
|
||||
triton.testing.assert_almost_equal(z_tri, z_ref)
|
||||
# make sure ld/st are vectorized
|
||||
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
|
||||
# ---------------
|
||||
# test while
|
||||
# ---------------
|
||||
|
||||
# ---------------
# test noop
# ---------------
def test_noop(device='cuda'):
    @triton.jit
    def kernel(**meta):
        pass
    x = triton.testing.random((1,), dtype=torch.int32, device=device)
    kernel[(1, )](x)
|
@@ -411,9 +411,9 @@ class CodeGenerator(ast.NodeVisitor):
|
||||
|
||||
|
||||
class Binary:
|
||||
def __init__(self, module, kernel, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm):
|
||||
def __init__(self, backend, module, kernel, asm, num_warps, num_stages, force_nc_cache, shared_mem):
|
||||
# cache ir asm
|
||||
self.ir_asm = ir_asm
|
||||
self.asm = asm
|
||||
self.module = module
|
||||
self.kernel = kernel
|
||||
self.shared_mem = shared_mem
|
||||
@@ -421,29 +421,13 @@ class Binary:
|
||||
self.num_stages = num_stages
|
||||
self.force_nc_cache = force_nc_cache
|
||||
self.sass = None
|
||||
|
||||
def asm(self, mode):
|
||||
if mode == 'ttir':
|
||||
return self.ir_asm
|
||||
if mode == 'ptx':
|
||||
return self.module.ptx()
|
||||
if mode == 'sass':
|
||||
if self.sass is None:
|
||||
cubin = self.module.cubin()
|
||||
# get a temporary file name
|
||||
fd, path = tempfile.mkstemp(suffix='.cubin')
|
||||
f = open(path, 'wb')
|
||||
f.write(cubin)
|
||||
f.close()
|
||||
# extract SASS from cubin
|
||||
self.sass = extract(path, None)
|
||||
return self.sass
|
||||
if mode == 'llir':
|
||||
return self.module.llir()
|
||||
raise ValueError('Unsupported mode ' + mode)
|
||||
self.backend = backend
|
||||
|
||||
def __call__(self, stream, args, grid_0, grid_1=1, grid_2=1):
|
||||
stream.enqueue(self.kernel, grid_0, grid_1, grid_2, self.num_warps * 32, 1, 1, args, self.shared_mem)
|
||||
_triton.runtime.enqueue(self.backend, stream, self.kernel,
|
||||
grid_0, grid_1, grid_2,
|
||||
self.num_warps * 32, 1, 1,
|
||||
args, self.shared_mem)
|
||||
|
||||
|
||||
class CompilationError(Exception):
|
||||
@@ -548,10 +532,15 @@ class Kernel:
|
||||
raise e
|
||||
raise CompilationError(self.fn.src, node, e)
|
||||
# Compile to machine code
|
||||
mod, ker, shared_mem, ir_asm = _triton.code_gen.add_passes_to_emit_bin(generator.module, device, num_warps, num_stages, force_nc_cache)
|
||||
if shared_mem > device.max_shared_memory():
|
||||
raise OutOfResources(shared_mem, device.max_shared_memory(), "shared memory")
|
||||
return Binary(mod, ker, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm)
|
||||
if torch.version.hip is None:
|
||||
backend = _triton.runtime.backend.CUDA
|
||||
else:
|
||||
backend = _triton.runtime.backend.ROCM
|
||||
mod, ker, asm, shared_mem = _triton.code_gen.compile_ttir(backend, generator.module, device, num_warps, num_stages, force_nc_cache)
|
||||
max_shared_memory = _triton.runtime.max_shared_memory(backend, device)
|
||||
if shared_mem > max_shared_memory:
|
||||
raise OutOfResources(shared_mem, max_shared_memory, "shared memory")
|
||||
return Binary(backend, mod, ker, asm, num_warps, num_stages, force_nc_cache, shared_mem)
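With the asm dict stored on the Binary, callers now read binary.asm['ttir'], ['llir'] or ['ptx'] directly instead of calling the removed asm(mode) method, and SASS is no longer extracted lazily here. A small illustrative helper, assuming `pgm` is the Binary returned by a kernel launch as in the tests above:

def dump_asm(pgm):
    # `pgm` is the Binary returned by a launch, e.g. pgm = kernel[grid](x, ...)
    print(pgm.asm["ttir"])      # Triton-IR, recorded for every backend
    print(pgm.asm["llir"])      # LLVM-IR after the codegen passes
    if "ptx" in pgm.asm:        # only recorded on the CUDA path
        print(pgm.asm["ptx"])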
|
||||
|
||||
def __call__(self, *wargs, grid, num_warps=4, num_stages=2, force_nc_cache=False, **meta):
|
||||
# device inference
|
||||
@@ -571,19 +560,20 @@ class Kernel:
|
||||
" Only CUDA is supported at the moment")
|
||||
|
||||
device = torch.device('cuda', torch.cuda.current_device())
|
||||
tt_device = _triton.driver.cu_device(device.index, False)
|
||||
if len(set(device_ids)) != 1 or device_ids[0] != device.index:
|
||||
device_ty = device.type
|
||||
device_idx = device.index
|
||||
if len(set(device_ids)) != 1 or device_ids[0] != device_idx:
|
||||
# try to enable P2P communication
|
||||
for arg_idx, dst_idx in zip(tensor_idxs, device_ids):
|
||||
if dst_idx != device.index:
|
||||
if dst_idx != device_idx:
|
||||
try:
|
||||
tt_device.enable_peer_access(wargs[arg_idx].data_ptr())
|
||||
_triton.runtime.enable_peer_access(self.backend, wargs[arg_idx].data_ptr())
|
||||
except RuntimeError as e:
|
||||
raise RuntimeError("Cannot enable P2P access from device {} to device {}: {}"
|
||||
.format(device.index, dst_idx, str(e)))
|
||||
.format(device_idx, dst_idx, str(e)))
|
||||
|
||||
# enqueue kernel on the current device
|
||||
torch.cuda.set_device(device.index)
|
||||
torch.cuda.set_device(device_idx)
|
||||
# attributes
|
||||
args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
|
||||
attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) if isinstance(a, int)}
|
||||
@@ -594,12 +584,12 @@ class Kernel:
|
||||
attr_key = frozenset(attributes.items())
|
||||
meta_key = frozenset(meta.items())
|
||||
const_key = frozenset(constants.items())
|
||||
key = (device.type, device.index, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
|
||||
key = (device_ty, device_idx, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
|
||||
cache = self.fn.cache
|
||||
if key not in cache:
|
||||
# compile and cache configuration if necessary
|
||||
cache[key] = self._compile(
|
||||
*wargs, device=tt_device, attributes=attributes,
|
||||
*wargs, device=device_idx, attributes=attributes,
|
||||
num_warps=num_warps, num_stages=num_stages, force_nc_cache=force_nc_cache,
|
||||
constants=constants, **meta
|
||||
)
|
||||
@@ -608,8 +598,7 @@ class Kernel:
|
||||
params = struct.pack(fmt, *args)
|
||||
# enqueue cached function into stream
|
||||
binary = cache[key]
|
||||
cu_stream = torch.cuda.current_stream(device.index).cuda_stream
|
||||
stream = _triton.driver.cu_stream(cu_stream, False)
|
||||
stream = torch.cuda.current_stream(device_idx).cuda_stream
|
||||
grid = grid(meta) if hasattr(grid, '__call__') else grid
|
||||
binary(stream, params, *grid)
|
||||
return binary
|
||||
|
@@ -64,7 +64,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
|
||||
# - each torch.tensor object is implicitly converted into a pointer to its first element.
|
||||
# - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
# - don't forget to pass meta-parameters as keyword arguments
|
||||
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
|
||||
pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
|
||||
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
|
||||
# running asynchronously at this point.
|
||||
return output
|
||||
@@ -85,6 +85,7 @@ print(
|
||||
f'The maximum difference between torch and triton is '
|
||||
f'{torch.max(torch.abs(output_torch - output_triton))}'
|
||||
)
|
||||
exit()
|
||||
|
||||
# %%
|
||||
# Seems like we're good to go!
|