[DRIVER] Simplified Driver API by substantially removing reliance on driver::context
This commit is contained in:
@@ -16,15 +16,14 @@ class stream;
|
|||||||
// Base
|
// Base
|
||||||
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
|
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
|
||||||
public:
|
public:
|
||||||
buffer(driver::context* ctx, size_t size, CUdeviceptr cl, bool take_ownership);
|
buffer(size_t size, CUdeviceptr cl, bool take_ownership);
|
||||||
buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership);
|
buffer(size_t size, host_buffer_t hst, bool take_ownership);
|
||||||
uintptr_t addr_as_uintptr_t();
|
uintptr_t addr_as_uintptr_t();
|
||||||
static buffer* create(driver::context* ctx, size_t size);
|
static buffer* create(driver::context* ctx, size_t size);
|
||||||
driver::context* context();
|
driver::context* context();
|
||||||
size_t size();
|
size_t size();
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
driver::context* context_;
|
|
||||||
size_t size_;
|
size_t size_;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -32,15 +31,15 @@ protected:
|
|||||||
class host_buffer: public buffer
|
class host_buffer: public buffer
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
host_buffer(driver::context* context, size_t size);
|
host_buffer(size_t size);
|
||||||
};
|
};
|
||||||
|
|
||||||
// CUDA
|
// CUDA
|
||||||
class cu_buffer: public buffer
|
class cu_buffer: public buffer
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
cu_buffer(driver::context* context, size_t size);
|
cu_buffer(size_t size);
|
||||||
cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership);
|
cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
|
||||||
void set_zero(triton::driver::stream *queue, size_t size);
|
void set_zero(triton::driver::stream *queue, size_t size);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@@ -93,6 +93,7 @@ public:
|
|||||||
static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
|
static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
|
||||||
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
|
static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
|
||||||
static CUresult cuStreamSynchronize(CUstream hStream);
|
static CUresult cuStreamSynchronize(CUstream hStream);
|
||||||
|
static CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx);
|
||||||
static CUresult cuStreamDestroy_v2(CUstream hStream);
|
static CUresult cuStreamDestroy_v2(CUstream hStream);
|
||||||
static CUresult cuEventDestroy_v2(CUevent hEvent);
|
static CUresult cuEventDestroy_v2(CUevent hEvent);
|
||||||
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
|
static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
|
||||||
@@ -154,6 +155,7 @@ private:
|
|||||||
static void* cuModuleGetFunction_;
|
static void* cuModuleGetFunction_;
|
||||||
static void* cuStreamSynchronize_;
|
static void* cuStreamSynchronize_;
|
||||||
static void* cuStreamDestroy_v2_;
|
static void* cuStreamDestroy_v2_;
|
||||||
|
static void* cuStreamGetCtx_;
|
||||||
static void* cuEventDestroy_v2_;
|
static void* cuEventDestroy_v2_;
|
||||||
static void* cuMemAlloc_v2_;
|
static void* cuMemAlloc_v2_;
|
||||||
static void* cuPointerGetAttribute_;
|
static void* cuPointerGetAttribute_;
|
||||||
|
@@ -35,26 +35,21 @@ protected:
|
|||||||
};
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
module(driver::context* ctx, CUmodule mod, bool has_ownership);
|
module(CUmodule mod, bool has_ownership);
|
||||||
module(driver::context* ctx, host_module_t mod, bool has_ownership);
|
module(host_module_t mod, bool has_ownership);
|
||||||
static module* create(driver::context* ctx, std::unique_ptr<llvm::Module> src);
|
static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);
|
||||||
driver::context* context() const;
|
|
||||||
void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
||||||
const std::string &proc, std::string layout,
|
const std::string &proc, std::string layout,
|
||||||
llvm::SmallVectorImpl<char> &buffer,
|
llvm::SmallVectorImpl<char> &buffer,
|
||||||
const std::string &features,
|
const std::string &features,
|
||||||
file_type_t file_type);
|
file_type_t file_type);
|
||||||
virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
|
virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
|
||||||
|
|
||||||
|
|
||||||
protected:
|
|
||||||
driver::context* ctx_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// CPU
|
// CPU
|
||||||
class host_module: public module{
|
class host_module: public module{
|
||||||
public:
|
public:
|
||||||
host_module(driver::context* context, std::unique_ptr<llvm::Module> module);
|
host_module(std::unique_ptr<llvm::Module> module);
|
||||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -63,8 +58,8 @@ class cu_module: public module {
|
|||||||
std::string compile_llvm_module(std::unique_ptr<llvm::Module> module, driver::device* device);
|
std::string compile_llvm_module(std::unique_ptr<llvm::Module> module, driver::device* device);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
cu_module(driver::context* context, std::unique_ptr<llvm::Module> module);
|
cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);
|
||||||
cu_module(driver::context* context, const std::string& source);
|
cu_module(const std::string& source);
|
||||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||||
const std::string& source() const { return source_; }
|
const std::string& source() const { return source_; }
|
||||||
|
|
||||||
|
@@ -23,10 +23,10 @@ class cu_buffer;
|
|||||||
// Base
|
// Base
|
||||||
class stream: public polymorphic_resource<CUstream, host_stream_t> {
|
class stream: public polymorphic_resource<CUstream, host_stream_t> {
|
||||||
public:
|
public:
|
||||||
stream(driver::context *ctx, CUstream, bool has_ownership);
|
stream(CUstream, bool has_ownership);
|
||||||
stream(driver::context *ctx, host_stream_t, bool has_ownership);
|
stream(host_stream_t, bool has_ownership);
|
||||||
// factory
|
// factory
|
||||||
static driver::stream* create(driver::context* ctx);
|
static driver::stream* create(backend_t backend);
|
||||||
// accessors
|
// accessors
|
||||||
driver::context* context() const;
|
driver::context* context() const;
|
||||||
// methods
|
// methods
|
||||||
@@ -39,16 +39,13 @@ public:
|
|||||||
{ write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
{ write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||||
template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
|
template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
|
||||||
{ read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
{ read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||||
|
|
||||||
protected:
|
|
||||||
driver::context *ctx_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
// Host
|
// Host
|
||||||
class host_stream: public stream {
|
class host_stream: public stream {
|
||||||
public:
|
public:
|
||||||
// Constructors
|
// Constructors
|
||||||
host_stream(driver::context *ctx);
|
host_stream();
|
||||||
|
|
||||||
// Overridden
|
// Overridden
|
||||||
void synchronize();
|
void synchronize();
|
||||||
@@ -62,7 +59,7 @@ class cu_stream: public stream {
|
|||||||
public:
|
public:
|
||||||
// Constructors
|
// Constructors
|
||||||
cu_stream(CUstream str, bool take_ownership);
|
cu_stream(CUstream str, bool take_ownership);
|
||||||
cu_stream(driver::context* context);
|
cu_stream();
|
||||||
|
|
||||||
// Overridden
|
// Overridden
|
||||||
void synchronize();
|
void synchronize();
|
||||||
|
@@ -87,11 +87,11 @@ private:
|
|||||||
class caller {
|
class caller {
|
||||||
public:
|
public:
|
||||||
// constructors
|
// constructors
|
||||||
caller(driver::context* ctx, std::ifstream& ifs, const options_t& opt);
|
caller(std::ifstream& ifs, const options_t& opt);
|
||||||
caller(ir::function *ir, std::shared_ptr<driver::module> program, const options_t& opt);
|
caller(ir::function *ir, std::shared_ptr<driver::module> program, const options_t& opt);
|
||||||
// serialization
|
// serialization
|
||||||
void write(std::ofstream& ofs);
|
void write(std::ofstream& ofs);
|
||||||
void read(driver::context* ctx, std::ifstream& ifs);
|
void read(std::ifstream& ifs);
|
||||||
// accessors
|
// accessors
|
||||||
const options_t opt() const { return opt_; }
|
const options_t opt() const { return opt_; }
|
||||||
const driver::module* parent() const { return &*parent_; }
|
const driver::module* parent() const { return &*parent_; }
|
||||||
@@ -101,7 +101,7 @@ private:
|
|||||||
|
|
||||||
std::vector<int> retune() const { return retune_; }
|
std::vector<int> retune() const { return retune_; }
|
||||||
// entry points
|
// entry points
|
||||||
void operator()(driver::stream *stream, const grid_t& grid, void **args, size_t args_size) const;
|
void operator()(driver::stream *stream, const grid_t& grid, void **args, size_t args_size, const std::map<std::string, std::vector<char>>& = {}) const;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::shared_ptr<driver::kernel> bin_;
|
std::shared_ptr<driver::kernel> bin_;
|
||||||
@@ -121,9 +121,9 @@ private:
|
|||||||
// make
|
// make
|
||||||
triton::lang::translation_unit *make_ast(const std::string &src);
|
triton::lang::translation_unit *make_ast(const std::string &src);
|
||||||
std::unique_ptr<ir::module> make_ir(Parser &parser);
|
std::unique_ptr<ir::module> make_ir(Parser &parser);
|
||||||
std::unique_ptr<driver::module> make_bin(ir::module &function, driver::context *context, const options_t &opt);
|
std::unique_ptr<driver::module> make_bin(ir::module &function, driver::device *device, const options_t &opt);
|
||||||
void make(driver::stream *stream, options_t opt);
|
void make(driver::device *device, options_t opt);
|
||||||
void precompile(driver::stream *stream, const options_space_t& tuning_space);
|
void precompile(driver::device *device, const options_space_t& tuning_space);
|
||||||
// autotune
|
// autotune
|
||||||
caller* autotune(driver::stream *stream, const grid_fn_ty& grid, void **args, size_t args_size);
|
caller* autotune(driver::stream *stream, const grid_fn_ty& grid, void **args, size_t args_size);
|
||||||
|
|
||||||
@@ -132,10 +132,10 @@ public:
|
|||||||
|
|
||||||
public:
|
public:
|
||||||
function(const std::string& src, const options_space_t& opt, const std::string &cache_ref = "");
|
function(const std::string& src, const options_space_t& opt, const std::string &cache_ref = "");
|
||||||
void operator()(void** args, size_t args_size, const grid_t& grid, driver::stream* stream);
|
void operator()(void** args, size_t args_size, const grid_t& grid, driver::stream* stream, driver::device* device);
|
||||||
void operator()(void** args, size_t args_size, const grid_fn_ty& grid, driver::stream *stream);
|
void operator()(void** args, size_t args_size, const grid_fn_ty& grid, driver::stream *stream, driver::device* device);
|
||||||
void set_cst(const char* name, void* data, size_t n_bytes);
|
void set_cst(const char* name, void* data, size_t n_bytes);
|
||||||
std::string ptx(driver::stream *stream, const options_t& opt);
|
std::string ptx(driver::device *device, const options_t& opt);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
std::map<std::string, std::vector<char>> cst_;
|
std::map<std::string, std::vector<char>> cst_;
|
||||||
|
@@ -134,7 +134,7 @@ std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::ker
|
|||||||
void backend::streams::init(std::list<driver::context*> const & contexts){
|
void backend::streams::init(std::list<driver::context*> const & contexts){
|
||||||
for(driver::context* ctx : contexts)
|
for(driver::context* ctx : contexts)
|
||||||
if(cache_.find(ctx)==cache_.end())
|
if(cache_.find(ctx)==cache_.end())
|
||||||
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx)}));
|
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
|
||||||
}
|
}
|
||||||
|
|
||||||
void backend::streams::release(){
|
void backend::streams::release(){
|
||||||
|
@@ -35,16 +35,11 @@ namespace driver
|
|||||||
|
|
||||||
//
|
//
|
||||||
|
|
||||||
buffer::buffer(driver::context* ctx, size_t size, CUdeviceptr cu, bool take_ownership)
|
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||||
: polymorphic_resource(cu, take_ownership), context_(ctx), size_(size) { }
|
: polymorphic_resource(cu, take_ownership), size_(size) { }
|
||||||
|
|
||||||
buffer::buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership)
|
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
|
||||||
: polymorphic_resource(hst, take_ownership), context_(ctx), size_(size) { }
|
: polymorphic_resource(hst, take_ownership), size_(size) { }
|
||||||
|
|
||||||
|
|
||||||
driver::context* buffer::context() {
|
|
||||||
return context_;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t buffer::size() {
|
size_t buffer::size() {
|
||||||
return size_;
|
return size_;
|
||||||
@@ -61,35 +56,32 @@ uintptr_t buffer::addr_as_uintptr_t() {
|
|||||||
|
|
||||||
buffer* buffer::create(driver::context* ctx, size_t size) {
|
buffer* buffer::create(driver::context* ctx, size_t size) {
|
||||||
switch(ctx->backend()){
|
switch(ctx->backend()){
|
||||||
case CUDA: return new cu_buffer(ctx, size);
|
case CUDA: return new cu_buffer(size);
|
||||||
case Host: return new host_buffer(ctx, size);
|
case Host: return new host_buffer(size);
|
||||||
default: throw std::runtime_error("unknown backend");
|
default: throw std::runtime_error("unknown backend");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
||||||
host_buffer::host_buffer(driver::context *context, size_t size)
|
host_buffer::host_buffer(size_t size)
|
||||||
: buffer(context, size, host_buffer_t(), true){
|
: buffer(size, host_buffer_t(), true){
|
||||||
hst_->data = new char[size];
|
hst_->data = new char[size];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
|
|
||||||
cu_buffer::cu_buffer(driver::context* context, size_t size)
|
cu_buffer::cu_buffer(size_t size)
|
||||||
: buffer(context, size, CUdeviceptr(), true) {
|
: buffer(size, CUdeviceptr(), true) {
|
||||||
cu_context::context_switcher ctx_switch(*context_);
|
|
||||||
dispatch::cuMemAlloc(&*cu_, size);
|
dispatch::cuMemAlloc(&*cu_, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
cu_buffer::cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership)
|
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||||
: buffer(context, size, cu, take_ownership){
|
: buffer(size, cu, take_ownership){
|
||||||
}
|
}
|
||||||
|
|
||||||
void cu_buffer::set_zero(driver::stream* queue, size_t size)
|
void cu_buffer::set_zero(driver::stream* queue, size_t size){
|
||||||
{
|
|
||||||
cu_context::context_switcher ctx_switch(*context_);
|
|
||||||
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
|
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -121,7 +121,7 @@ cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(
|
|||||||
|
|
||||||
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
|
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
|
||||||
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
|
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
|
||||||
dispatch::cuCtxPopCurrent_v2(NULL);
|
// dispatch::cuCtxPopCurrent_v2(NULL);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -154,6 +154,7 @@ CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
|
|||||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
||||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
||||||
|
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
|
||||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
||||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
||||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
||||||
@@ -223,6 +224,7 @@ void* dispatch::cuCtxCreate_v2_;
|
|||||||
void* dispatch::cuModuleGetFunction_;
|
void* dispatch::cuModuleGetFunction_;
|
||||||
void* dispatch::cuStreamSynchronize_;
|
void* dispatch::cuStreamSynchronize_;
|
||||||
void* dispatch::cuStreamDestroy_v2_;
|
void* dispatch::cuStreamDestroy_v2_;
|
||||||
|
void* dispatch::cuStreamGetCtx_;
|
||||||
void* dispatch::cuEventDestroy_v2_;
|
void* dispatch::cuEventDestroy_v2_;
|
||||||
void* dispatch::cuMemAlloc_v2_;
|
void* dispatch::cuMemAlloc_v2_;
|
||||||
void* dispatch::cuPointerGetAttribute_;
|
void* dispatch::cuPointerGetAttribute_;
|
||||||
|
@@ -62,22 +62,19 @@ void module::init_llvm() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
module::module(driver::context* ctx, CUmodule mod, bool has_ownership)
|
module::module(CUmodule mod, bool has_ownership)
|
||||||
: polymorphic_resource(mod, has_ownership), ctx_(ctx) {
|
: polymorphic_resource(mod, has_ownership) {
|
||||||
}
|
}
|
||||||
|
|
||||||
module::module(driver::context* ctx, host_module_t mod, bool has_ownership)
|
module::module(host_module_t mod, bool has_ownership)
|
||||||
: polymorphic_resource(mod, has_ownership), ctx_(ctx) {
|
: polymorphic_resource(mod, has_ownership) {
|
||||||
}
|
}
|
||||||
|
|
||||||
driver::context* module::context() const {
|
|
||||||
return ctx_;
|
|
||||||
}
|
|
||||||
|
|
||||||
module* module::create(driver::context* ctx, std::unique_ptr<llvm::Module> src) {
|
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
|
||||||
switch(ctx->backend()){
|
switch(device->backend()){
|
||||||
case CUDA: return new cu_module(ctx, std::move(src));
|
case CUDA: return new cu_module(device, std::move(src));
|
||||||
case Host: return new host_module(ctx, std::move(src));
|
case Host: return new host_module(std::move(src));
|
||||||
default: throw std::runtime_error("unknown backend");
|
default: throw std::runtime_error("unknown backend");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -130,7 +127,7 @@ void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std
|
|||||||
// Host //
|
// Host //
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
|
|
||||||
host_module::host_module(driver::context * context, std::unique_ptr<llvm::Module> src): module(context, host_module_t(), true) {
|
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
|
||||||
init_llvm();
|
init_llvm();
|
||||||
// create kernel wrapper
|
// create kernel wrapper
|
||||||
llvm::LLVMContext &ctx = src->getContext();
|
llvm::LLVMContext &ctx = src->getContext();
|
||||||
@@ -269,10 +266,9 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { }
|
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): cu_module(compile_llvm_module(std::move(ll_module), device)) { }
|
||||||
|
|
||||||
cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){
|
cu_module::cu_module(std::string const & source) : module(CUmodule(), true), source_(source){
|
||||||
cu_context::context_switcher ctx(*context);
|
|
||||||
// JIT compile source-code
|
// JIT compile source-code
|
||||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
|
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
|
||||||
unsigned int errbufsize = 8096;
|
unsigned int errbufsize = 8096;
|
||||||
@@ -285,6 +281,7 @@ cu_module::cu_module(driver::context * context, std::string const & source) : mo
|
|||||||
std::cout << source << std::endl;
|
std::cout << source << std::endl;
|
||||||
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
||||||
std::cerr << errbuf << std::endl;
|
std::cerr << errbuf << std::endl;
|
||||||
|
// exit(1);
|
||||||
//#endif
|
//#endif
|
||||||
throw;
|
throw;
|
||||||
}
|
}
|
||||||
@@ -294,7 +291,7 @@ std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
|
|||||||
CUdeviceptr handle;
|
CUdeviceptr handle;
|
||||||
size_t size;
|
size_t size;
|
||||||
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
|
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
|
||||||
std::unique_ptr<buffer> res(new cu_buffer(ctx_, size, handle, false));
|
std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
|
||||||
return std::move(res);
|
return std::move(res);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -43,32 +43,29 @@ namespace driver
|
|||||||
// Base //
|
// Base //
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
|
|
||||||
stream::stream(driver::context *ctx, CUstream cu, bool has_ownership)
|
stream::stream(CUstream cu, bool has_ownership)
|
||||||
: polymorphic_resource(cu, has_ownership), ctx_(ctx) {
|
: polymorphic_resource(cu, has_ownership) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
stream::stream(driver::context *ctx, host_stream_t cl, bool has_ownership)
|
stream::stream(host_stream_t cl, bool has_ownership)
|
||||||
: polymorphic_resource(cl, has_ownership), ctx_(ctx) {
|
: polymorphic_resource(cl, has_ownership) {
|
||||||
}
|
}
|
||||||
|
|
||||||
driver::stream* stream::create(driver::context* ctx) {
|
driver::stream* stream::create(backend_t backend) {
|
||||||
switch(ctx->backend()){
|
switch(backend){
|
||||||
case CUDA: return new cu_stream(ctx);
|
case CUDA: return new cu_stream();
|
||||||
case Host: return new host_stream(ctx);
|
case Host: return new host_stream();
|
||||||
default: throw std::runtime_error("unknown backend");
|
default: throw std::runtime_error("unknown backend");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
driver::context* stream::context() const {
|
|
||||||
return ctx_;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
// Host //
|
// Host //
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
|
|
||||||
host_stream::host_stream(driver::context *ctx): stream(ctx, host_stream_t(), true) {
|
host_stream::host_stream(): stream(host_stream_t(), true) {
|
||||||
hst_->pool.reset(new ThreadPool(1));
|
hst_->pool.reset(new ThreadPool(1));
|
||||||
hst_->futures.reset(new std::vector<std::future<void>>());
|
hst_->futures.reset(new std::vector<std::future<void>>());
|
||||||
}
|
}
|
||||||
@@ -104,28 +101,20 @@ void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset
|
|||||||
// CUDA //
|
// CUDA //
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
|
|
||||||
inline CUcontext get_context() {
|
|
||||||
CUcontext result;
|
|
||||||
dispatch::cuCtxGetCurrent(&result);
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
cu_stream::cu_stream(CUstream str, bool take_ownership):
|
cu_stream::cu_stream(CUstream str, bool take_ownership):
|
||||||
stream(backend::contexts::import(get_context()), str, take_ownership) {
|
stream(str, take_ownership) {
|
||||||
}
|
}
|
||||||
|
|
||||||
cu_stream::cu_stream(driver::context *context): stream((driver::cu_context*)context, CUstream(), true) {
|
cu_stream::cu_stream(): stream(CUstream(), true) {
|
||||||
cu_context::context_switcher ctx_switch(*ctx_);
|
|
||||||
dispatch::cuStreamCreate(&*cu_, 0);
|
dispatch::cuStreamCreate(&*cu_, 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cu_stream::synchronize() {
|
void cu_stream::synchronize() {
|
||||||
cu_context::context_switcher ctx_switch(*ctx_);
|
|
||||||
dispatch::cuStreamSynchronize(*cu_);
|
dispatch::cuStreamSynchronize(*cu_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<event> const *, event* event, void** args, size_t args_size) {
|
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<event> const *, event* event, void** args, size_t args_size) {
|
||||||
cu_context::context_switcher ctx_switch(*ctx_);
|
|
||||||
void *config[] = {
|
void *config[] = {
|
||||||
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
|
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
|
||||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||||
@@ -139,7 +128,6 @@ void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std:
|
|||||||
}
|
}
|
||||||
|
|
||||||
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
||||||
cu_context::context_switcher ctx_switch(*ctx_);
|
|
||||||
if(blocking)
|
if(blocking)
|
||||||
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
|
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
|
||||||
else
|
else
|
||||||
@@ -147,7 +135,6 @@ void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset,
|
|||||||
}
|
}
|
||||||
|
|
||||||
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
||||||
cu_context::context_switcher ctx_switch(*ctx_);
|
|
||||||
if(blocking)
|
if(blocking)
|
||||||
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
|
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
|
||||||
else
|
else
|
||||||
|
@@ -122,7 +122,7 @@ void function::caller::write(std::ofstream &ofs) {
|
|||||||
ofs << source;
|
ofs << source;
|
||||||
}
|
}
|
||||||
|
|
||||||
void function::caller::read(driver::context* ctx, std::ifstream &ifs) {
|
void function::caller::read(std::ifstream &ifs) {
|
||||||
// read name
|
// read name
|
||||||
std::getline(ifs, name_);
|
std::getline(ifs, name_);
|
||||||
// read signature
|
// read signature
|
||||||
@@ -136,14 +136,14 @@ void function::caller::read(driver::context* ctx, std::ifstream &ifs) {
|
|||||||
// read module
|
// read module
|
||||||
std::string src((std::istreambuf_iterator<char>(ifs)),
|
std::string src((std::istreambuf_iterator<char>(ifs)),
|
||||||
std::istreambuf_iterator<char>());
|
std::istreambuf_iterator<char>());
|
||||||
parent_.reset(new driver::cu_module(ctx, src));
|
parent_.reset(new driver::cu_module(src));
|
||||||
bin_.reset(driver::kernel::create(&*parent_, name_.c_str()));
|
bin_.reset(driver::kernel::create(&*parent_, name_.c_str()));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function::caller::caller(driver::context* ctx, std::ifstream &ifs, const options_t& opt)
|
function::caller::caller(std::ifstream &ifs, const options_t& opt)
|
||||||
: opt_(opt) {
|
: opt_(opt) {
|
||||||
read(ctx, ifs);
|
read(ifs);
|
||||||
}
|
}
|
||||||
|
|
||||||
function::caller::caller(ir::function *ir,
|
function::caller::caller(ir::function *ir,
|
||||||
@@ -163,7 +163,12 @@ function::caller::caller(ir::function *ir,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void function::caller::operator ()(driver::stream *stream, const grid_t& _grid, void** args, size_t args_size) const {
|
void function::caller::operator ()(driver::stream *stream, const grid_t& _grid, void** args, size_t args_size, const std::map<std::string, std::vector<char>>& csts) const {
|
||||||
|
// copy constants
|
||||||
|
for(const auto& cst: csts){
|
||||||
|
std::unique_ptr<driver::buffer> buffer = parent()->symbol(cst.first.c_str());
|
||||||
|
stream->write(&*buffer, true, 0, cst.second);
|
||||||
|
}
|
||||||
// set grid
|
// set grid
|
||||||
if(_grid.size() > 3)
|
if(_grid.size() > 3)
|
||||||
throw std::runtime_error("grid size must be no greater than 3");
|
throw std::runtime_error("grid size must be no greater than 3");
|
||||||
@@ -188,10 +193,8 @@ std::unique_ptr<ir::module> function::make_ir(Parser& parser) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// create Binary from Triton-IR
|
// create Binary from Triton-IR
|
||||||
std::unique_ptr<driver::module> function::make_bin(ir::module &module,
|
std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::device* device, const options_t& opt) {
|
||||||
driver::context *context,
|
std::unique_ptr<codegen::target> target = device->make_target();
|
||||||
const options_t& opt) {
|
|
||||||
std::unique_ptr<codegen::target> target = context->device()->make_target();
|
|
||||||
// generate llvm code
|
// generate llvm code
|
||||||
llvm::LLVMContext ctx;
|
llvm::LLVMContext ctx;
|
||||||
std::unique_ptr<llvm::Module> llvm(new llvm::Module(module.get_name(), ctx));
|
std::unique_ptr<llvm::Module> llvm(new llvm::Module(module.get_name(), ctx));
|
||||||
@@ -236,17 +239,17 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module,
|
|||||||
layouts.run(module);
|
layouts.run(module);
|
||||||
liveness.run(module);
|
liveness.run(module);
|
||||||
allocation.run(module);
|
allocation.run(module);
|
||||||
if(allocation.allocated_size() > context->device()->max_shared_memory())
|
if(allocation.allocated_size() > device->max_shared_memory())
|
||||||
throw std::runtime_error("using too much shared memory");
|
throw std::runtime_error("using too much shared memory");
|
||||||
barriers.run(module);
|
barriers.run(module);
|
||||||
isel.visit(module, *llvm);
|
isel.visit(module, *llvm);
|
||||||
std::unique_ptr<driver::module> res(driver::module::create(context, std::move(llvm)));
|
std::unique_ptr<driver::module> res(driver::module::create(device, std::move(llvm)));
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
// create Binary from options
|
// create Binary from options
|
||||||
void function::make(driver::stream *stream, options_t opt) {
|
void function::make(driver::device *device, options_t opt) {
|
||||||
if(callers_.find(opt) != callers_.end())
|
if(callers_.find(opt) != callers_.end())
|
||||||
return;
|
return;
|
||||||
// pre-process
|
// pre-process
|
||||||
@@ -263,25 +266,17 @@ void function::make(driver::stream *stream, options_t opt) {
|
|||||||
// triton-ir -> binary
|
// triton-ir -> binary
|
||||||
std::unique_ptr<driver::module> bin;
|
std::unique_ptr<driver::module> bin;
|
||||||
// try{
|
// try{
|
||||||
bin = make_bin(*ir, stream->context(), opt);
|
bin = make_bin(*ir, device, opt);
|
||||||
// }catch(const std::runtime_error&){
|
// }catch(const std::runtime_error&){
|
||||||
// return nullptr;
|
// return nullptr;
|
||||||
// }
|
// }
|
||||||
// create callable
|
// create callable
|
||||||
ir::function *tmp = ir->get_function_list()[0];
|
ir::function *tmp = ir->get_function_list()[0];
|
||||||
callers_[opt].reset(new caller(tmp, std::move(bin), opt));
|
callers_[opt].reset(new caller(tmp, std::move(bin), opt));
|
||||||
auto& call = callers_[opt];
|
|
||||||
// copy constants
|
|
||||||
if(call)
|
|
||||||
for(const auto& cst: cst_){
|
|
||||||
std::unique_ptr<driver::buffer> buffer = call->parent()->symbol(cst.first.c_str());
|
|
||||||
stream->write(&*buffer, true, 0, cst.second);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// precompile all kernels spanned by given options space
|
// precompile all kernels spanned by given options space
|
||||||
void function::precompile(driver::stream* stream,
|
void function::precompile(driver::device* device, const options_space_t& space) {
|
||||||
const options_space_t& space) {
|
|
||||||
// all ranges
|
// all ranges
|
||||||
std::vector<size_t> ranges;
|
std::vector<size_t> ranges;
|
||||||
ranges.push_back(space.num_warps.size());
|
ranges.push_back(space.num_warps.size());
|
||||||
@@ -296,7 +291,7 @@ void function::precompile(driver::stream* stream,
|
|||||||
for(auto D: space.defines)
|
for(auto D: space.defines)
|
||||||
opt.defines[D.first] = D.second[params[i++]];
|
opt.defines[D.first] = D.second[params[i++]];
|
||||||
// compile
|
// compile
|
||||||
make(stream, opt);
|
make(device, opt);
|
||||||
};
|
};
|
||||||
// multi-threaded compilation
|
// multi-threaded compilation
|
||||||
_loop_nest(ranges, do_make);
|
_loop_nest(ranges, do_make);
|
||||||
@@ -304,8 +299,8 @@ void function::precompile(driver::stream* stream,
|
|||||||
throw std::runtime_error("could not compile kernel");
|
throw std::runtime_error("could not compile kernel");
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string function::ptx(driver::stream* stream, const options_t& opt) {
|
std::string function::ptx(driver::device* device, const options_t& opt) {
|
||||||
make(stream, opt);
|
make(device, opt);
|
||||||
const auto& fn = callers_.at(opt);
|
const auto& fn = callers_.at(opt);
|
||||||
if(!fn)
|
if(!fn)
|
||||||
return "";
|
return "";
|
||||||
@@ -325,7 +320,7 @@ function::caller* function::autotune(driver::stream* stream, const grid_fn_ty& g
|
|||||||
if(x.second == nullptr)
|
if(x.second == nullptr)
|
||||||
throw std::runtime_error("configuration not compiled");
|
throw std::runtime_error("configuration not compiled");
|
||||||
caller* current = &*x.second;
|
caller* current = &*x.second;
|
||||||
double ts = tools::bench([&]() { (*current)(stream, grid_fn(x.first), args, args_size); },
|
double ts = tools::bench([&]() { (*current)(stream, grid_fn(x.first), args, args_size, cst_); },
|
||||||
stream, true);
|
stream, true);
|
||||||
ret = (ts < best_ts) ? current : ret;
|
ret = (ts < best_ts) ? current : ret;
|
||||||
best_ts = std::min(ts, best_ts);
|
best_ts = std::min(ts, best_ts);
|
||||||
@@ -422,14 +417,14 @@ function::function(const std::string &src,
|
|||||||
src_ = preheader() + src_;
|
src_ = preheader() + src_;
|
||||||
}
|
}
|
||||||
|
|
||||||
void function::operator()(void** args, size_t args_size, const grid_fn_ty& grid_fn, driver::stream *stream) {
|
void function::operator()(void** args, size_t args_size, const grid_fn_ty& grid_fn, driver::stream *stream, driver::device *device) {
|
||||||
// pre-compile kernels
|
// pre-compile kernels
|
||||||
if(callers_.empty()){
|
if(callers_.empty()){
|
||||||
precompile(stream, opt_);
|
precompile(device, opt_);
|
||||||
}
|
}
|
||||||
// re-tuning key
|
// re-tuning key
|
||||||
cache_key_t key;
|
cache_key_t key;
|
||||||
key.first = stream->context()->device();
|
key.first = device;
|
||||||
key.second = callers_.begin()->second->retune();
|
key.second = callers_.begin()->second->retune();
|
||||||
// auto-tune if necessary
|
// auto-tune if necessary
|
||||||
auto it = cache_.find(key);
|
auto it = cache_.find(key);
|
||||||
@@ -438,14 +433,14 @@ void function::operator()(void** args, size_t args_size, const grid_fn_ty& grid_
|
|||||||
it = cache_.insert({key, best}).first;
|
it = cache_.insert({key, best}).first;
|
||||||
}
|
}
|
||||||
// run
|
// run
|
||||||
(*it->second)(stream, grid_fn(it->second->opt()), args, args_size);
|
(*it->second)(stream, grid_fn(it->second->opt()), args, args_size, cst_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void function::operator()(void** args,
|
void function::operator()(void** args,
|
||||||
size_t args_size,
|
size_t args_size,
|
||||||
const grid_t& grid,
|
const grid_t& grid,
|
||||||
driver::stream *stream) {
|
driver::stream* stream, driver::device *device) {
|
||||||
return this->operator()(args, args_size, [&grid](const options_t&){ return grid; }, stream);
|
return this->operator()(args, args_size, [&grid](const options_t&){ return grid; }, stream, device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -21,6 +21,7 @@ std::map<map_key_t, std::shared_ptr<rt::function::grid_fn_ty>> id_grid_map;
|
|||||||
std::map<map_key_t, std::shared_ptr<rt::function>> id_fn_map;
|
std::map<map_key_t, std::shared_ptr<rt::function>> id_fn_map;
|
||||||
|
|
||||||
CUstream torch_get_cuda_stream(int64_t dev_id);
|
CUstream torch_get_cuda_stream(int64_t dev_id);
|
||||||
|
CUdevice torch_get_cuda_device(int64_t dev_id);
|
||||||
|
|
||||||
/* Grid utilities */
|
/* Grid utilities */
|
||||||
|
|
||||||
@@ -47,8 +48,8 @@ void delete_fn(const map_key_t& key) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
std::string get_fn_ptx(const map_key_t& key, const rt::function::options_t& opt) {
|
std::string get_fn_ptx(const map_key_t& key, const rt::function::options_t& opt) {
|
||||||
triton::driver::cu_stream stream(torch_get_cuda_stream(key.second), false);
|
triton::driver::cu_device device(torch_get_cuda_device(key.second), false);
|
||||||
return id_fn_map[key]->ptx(&stream, opt);
|
return id_fn_map[key]->ptx(&device, opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
void cleanup() {
|
void cleanup() {
|
||||||
|
@@ -31,12 +31,18 @@ void init_host_stream() {
|
|||||||
if(!host_stream){
|
if(!host_stream){
|
||||||
host_device.reset(new drv::host_device());
|
host_device.reset(new drv::host_device());
|
||||||
host_context.reset(drv::context::create(&*host_device));
|
host_context.reset(drv::context::create(&*host_device));
|
||||||
host_stream.reset(drv::stream::create(&*host_context));
|
host_stream.reset(drv::stream::create(host_context->backend()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
CUstream torch_get_cuda_stream(int64_t dev_id) {
|
CUstream torch_get_cuda_stream(int64_t dev_id) {
|
||||||
return (CUstream)at::cuda::getCurrentCUDAStream(dev_id).stream();
|
return (CUstream)c10::cuda::getCurrentCUDAStream(dev_id).stream();
|
||||||
|
}
|
||||||
|
|
||||||
|
CUdeviceptr torch_get_cuda_device(int64_t dev_id) {
|
||||||
|
CUdevice ret;
|
||||||
|
triton::driver::dispatch::cuDeviceGet(&ret, dev_id);
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void synchronize(int64_t dev_id) {
|
void synchronize(int64_t dev_id) {
|
||||||
@@ -60,12 +66,12 @@ void launch_kernel(int64_t op_id, int64_t dev_id, const std::string& args,
|
|||||||
}
|
}
|
||||||
if(dev_id == -1){
|
if(dev_id == -1){
|
||||||
init_host_stream();
|
init_host_stream();
|
||||||
(*fn)((void**)args.c_str(), args.size(), *id_grid_map.at({op_id, dev_id}), &*host_stream);
|
(*fn)((void**)args.c_str(), args.size(), *id_grid_map.at({op_id, dev_id}), &*host_stream, &*host_device);
|
||||||
}
|
}
|
||||||
else{
|
else{
|
||||||
triton::driver::cu_stream stream(torch_get_cuda_stream(dev_id), false);
|
triton::driver::cu_stream stream(torch_get_cuda_stream(dev_id), false);
|
||||||
triton::driver::context* ctx = stream.context();
|
triton::driver::cu_device device(torch_get_cuda_device(dev_id), false);
|
||||||
(*fn)((void**)args.c_str(), args.size(), *id_grid_map.at({op_id, dev_id}), &stream);
|
(*fn)((void**)args.c_str(), args.size(), *id_grid_map.at({op_id, dev_id}), &stream, &device);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -5,7 +5,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to benchmark
|
// shapes to benchmark
|
||||||
typedef std::tuple<int, int, int, int, int, int, int, int, int, int, int> config_t;
|
typedef std::tuple<int, int, int, int, int, int, int, int, int, int, int> config_t;
|
||||||
std::vector<config_t> configs = {
|
std::vector<config_t> configs = {
|
||||||
@@ -32,7 +32,7 @@ int main() {
|
|||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w) = c;
|
std::tie(Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w) = c;
|
||||||
std::cout << "// " << c ;
|
std::cout << "// " << c ;
|
||||||
for(auto perf: bench_conv(stream, HALF, Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w))
|
for(auto perf: bench_conv(context, stream, HALF, Z, H, W, CO, CI, R, S, pad_h, pad_w, stride_h, stride_w))
|
||||||
std::cout << ", " << perf << std::flush;
|
std::cout << ", " << perf << std::flush;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
@@ -7,7 +7,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to benchmark
|
// shapes to benchmark
|
||||||
typedef std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> config_t;
|
typedef std::tuple<std::vector<int>, std::vector<int>, std::vector<int>> config_t;
|
||||||
std::vector<config_t> configs = {
|
std::vector<config_t> configs = {
|
||||||
@@ -29,7 +29,7 @@ int main() {
|
|||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(shape, ord_x, ord_y) = c;
|
std::tie(shape, ord_x, ord_y) = c;
|
||||||
std::cout << "// " << c << std::flush;
|
std::cout << "// " << c << std::flush;
|
||||||
for(auto perf: bench_copy_nd(stream, HALF, shape, ord_x, ord_y))
|
for(auto perf: bench_copy_nd(context, stream, HALF, shape, ord_x, ord_y))
|
||||||
std::cout << ", " << perf << std::flush;
|
std::cout << ", " << perf << std::flush;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
@@ -5,7 +5,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to benchmark
|
// shapes to benchmark
|
||||||
typedef std::tuple<std::vector<int>, bool, bool, int, int, int> config_t;
|
typedef std::tuple<std::vector<int>, bool, bool, int, int, int> config_t;
|
||||||
std::vector<config_t> configs;
|
std::vector<config_t> configs;
|
||||||
@@ -65,7 +65,7 @@ int main() {
|
|||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(ord, AT, BT, M, N, K) = c;
|
std::tie(ord, AT, BT, M, N, K) = c;
|
||||||
std::cout << "// " << c ;
|
std::cout << "// " << c ;
|
||||||
for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord))
|
for(auto perf: bench_dot(context, stream, HALF, AT, BT, M, N, K, ord, ord))
|
||||||
std::cout << ", " << perf << std::flush;
|
std::cout << ", " << perf << std::flush;
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
}
|
}
|
||||||
|
@@ -66,13 +66,13 @@ template<> struct to_string<double>{
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
void triton_conv(drv::stream* stream,
|
void triton_conv(drv::context* context, drv::stream* stream,
|
||||||
int Z, int CI, int H, int W, int CO, int R, int S,
|
int Z, int CI, int H, int W, int CO, int R, int S,
|
||||||
int pad_h, int pad_w, int stride_h, int stride_w,
|
int pad_h, int pad_w, int stride_h, int stride_w,
|
||||||
run_mode_t mode, std::vector<double>& bench, bool &test){
|
run_mode_t mode, std::vector<double>& bench, bool &test){
|
||||||
std::string ty = to_string<T>::value;
|
std::string ty = to_string<T>::value;
|
||||||
size_t dt_nbytes = sizeof(T);
|
size_t dt_nbytes = sizeof(T);
|
||||||
drv::context* context = stream->context();
|
drv::device* device = context->device();
|
||||||
|
|
||||||
int P = (H + 2*pad_h - R)/stride_h + 1;
|
int P = (H + 2*pad_h - R)/stride_h + 1;
|
||||||
int Q = (W + 2*pad_w - S)/stride_w + 1;
|
int Q = (W + 2*pad_w - S)/stride_w + 1;
|
||||||
@@ -131,19 +131,19 @@ void triton_conv(drv::stream* stream,
|
|||||||
(size_t)x.D<int>("TZ")};
|
(size_t)x.D<int>("TZ")};
|
||||||
};
|
};
|
||||||
auto tflops = [&](double nanosec) { return 2.*Z*P*Q*CI*CO*R*S / nanosec * 1e-3; };
|
auto tflops = [&](double nanosec) { return 2.*Z*P*Q*CI*CO*R*S / nanosec * 1e-3; };
|
||||||
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream);}, stream);
|
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream, device);}, stream);
|
||||||
bench.push_back(tflops(triton_ns));
|
bench.push_back(tflops(triton_ns));
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<double> bench_conv(drv::stream* stream, dtype_t dtype,
|
std::vector<double> bench_conv(drv::context* context, drv::stream* stream, dtype_t dtype,
|
||||||
int32_t Z, int32_t H, int32_t W, int32_t CO, int32_t CI, int32_t R, int32_t S,
|
int32_t Z, int32_t H, int32_t W, int32_t CO, int32_t CI, int32_t R, int32_t S,
|
||||||
int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w) {
|
int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w) {
|
||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
bool test;
|
bool test;
|
||||||
switch(dtype){
|
switch(dtype){
|
||||||
case HALF: triton_conv<half_float::half>(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
case HALF: triton_conv<half_float::half>(context, stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
||||||
case FLOAT: triton_conv<float>(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
case FLOAT: triton_conv<float>(context, stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
||||||
case DOUBLE: triton_conv<double>(stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
case DOUBLE: triton_conv<double>(context, stream, Z, CI, H, W, CO, R, S, pad_h, pad_w, stride_h, stride_w, BENCH, bench, test); break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
return bench;
|
return bench;
|
||||||
|
@@ -79,13 +79,13 @@ template<> struct to_string<double>{
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
|
void triton_copy_nd(drv::context* context, drv::stream* stream, const std::vector<int32_t>& shape,
|
||||||
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
||||||
std::vector<std::vector<std::string>> TS,
|
std::vector<std::vector<std::string>> TS,
|
||||||
run_mode_t mode, std::vector<double>& bench, bool &test) {
|
run_mode_t mode, std::vector<double>& bench, bool &test) {
|
||||||
std::string ty = to_string<T>::value;
|
std::string ty = to_string<T>::value;
|
||||||
size_t dtsize = sizeof(T);
|
size_t dtsize = sizeof(T);
|
||||||
drv::context* context = stream->context();
|
drv::device* device = context->device();
|
||||||
|
|
||||||
// rank
|
// rank
|
||||||
size_t rank = shape.size();
|
size_t rank = shape.size();
|
||||||
@@ -133,7 +133,7 @@ void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
|
|||||||
// metrics
|
// metrics
|
||||||
if(mode == BENCH){
|
if(mode == BENCH){
|
||||||
auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
|
auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
|
||||||
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream);}, stream);
|
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream, device);}, stream);
|
||||||
bench.push_back(gbps(triton_ns));
|
bench.push_back(gbps(triton_ns));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -145,7 +145,7 @@ void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
|
|||||||
for(size_t i = 0; i < hx.size(); i++)
|
for(size_t i = 0; i < hx.size(); i++)
|
||||||
hx[i] = static_cast<T>((float)rand()/RAND_MAX);
|
hx[i] = static_cast<T>((float)rand()/RAND_MAX);
|
||||||
stream->write(&*dx, true, 0, hx);
|
stream->write(&*dx, true, 0, hx);
|
||||||
function((void**)&args, sizeof(args), grid, stream);
|
function((void**)&args, sizeof(args), grid, stream, device);
|
||||||
stream->synchronize();
|
stream->synchronize();
|
||||||
stream->read(&*dy, true, 0, hy);
|
stream->read(&*dy, true, 0, hy);
|
||||||
cc_copy_nd(hx, ry, shape, x_order, y_order);
|
cc_copy_nd(hx, ry, shape, x_order, y_order);
|
||||||
@@ -153,23 +153,23 @@ void triton_copy_nd(drv::stream* stream, const std::vector<int32_t>& shape,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<double> bench_copy_nd(drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
|
std::vector<double> bench_copy_nd(drv::context* context, drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
|
||||||
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
|
||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
bool test;
|
bool test;
|
||||||
switch(dtype){
|
switch(dtype){
|
||||||
case HALF:
|
case HALF:
|
||||||
triton_copy_nd<half_float::half>(stream, shape, x_order, y_order, {}, BENCH, bench, test);
|
triton_copy_nd<half_float::half>(context, stream, shape, x_order, y_order, {}, BENCH, bench, test);
|
||||||
break;
|
break;
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
triton_copy_nd<float>(stream, shape, x_order, y_order, {}, BENCH, bench, test);
|
triton_copy_nd<float>(context, stream, shape, x_order, y_order, {}, BENCH, bench, test);
|
||||||
break;
|
break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
return bench;
|
return bench;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool test_copy_nd(drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
|
bool test_copy_nd(drv::context* context, drv::stream* stream, dtype_t dtype, const std::vector<int32_t>& shape,
|
||||||
const std::vector<int32_t>& TS,
|
const std::vector<int32_t>& TS,
|
||||||
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order) {
|
||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
@@ -179,10 +179,10 @@ bool test_copy_nd(drv::stream* stream, dtype_t dtype, const std::vector<int32_t>
|
|||||||
TSS.push_back({std::to_string(d)});
|
TSS.push_back({std::to_string(d)});
|
||||||
switch(dtype){
|
switch(dtype){
|
||||||
case HALF:
|
case HALF:
|
||||||
triton_copy_nd<half_float::half>(stream, shape, x_order, y_order, TSS, TEST, bench, test);
|
triton_copy_nd<half_float::half>(context, stream, shape, x_order, y_order, TSS, TEST, bench, test);
|
||||||
break;
|
break;
|
||||||
case FLOAT:
|
case FLOAT:
|
||||||
triton_copy_nd<float>(stream, shape, x_order, y_order, TSS, TEST, bench, test);
|
triton_copy_nd<float>(context, stream, shape, x_order, y_order, TSS, TEST, bench, test);
|
||||||
break;
|
break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
|
@@ -79,14 +79,14 @@ template<> struct to_string<double>{
|
|||||||
};
|
};
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
void triton_dot(drv::stream* stream, bool AT, bool BT,
|
void triton_dot(drv::context* context, drv::stream* stream, bool AT, bool BT,
|
||||||
int32_t M, int32_t N, int32_t K,
|
int32_t M, int32_t N, int32_t K,
|
||||||
int32_t TM, int32_t TN, int32_t TK, int32_t nwarp,
|
int32_t TM, int32_t TN, int32_t TK, int32_t nwarp,
|
||||||
const std::vector<int>& a_order, const std::vector<int>& b_order,
|
const std::vector<int>& a_order, const std::vector<int>& b_order,
|
||||||
run_mode_t mode, std::vector<double>& bench, bool &test){
|
run_mode_t mode, std::vector<double>& bench, bool &test){
|
||||||
std::string ty = to_string<T>::value;
|
std::string ty = to_string<T>::value;
|
||||||
size_t dt_nbytes = sizeof(T);
|
size_t dt_nbytes = sizeof(T);
|
||||||
drv::context* context = stream->context();
|
drv::device* device = context->device();
|
||||||
int32_t lda = (AT ^ a_order[0]==1) ? K : M;
|
int32_t lda = (AT ^ a_order[0]==1) ? K : M;
|
||||||
int32_t ldb = (BT ^ b_order[0]==1) ? N : K;
|
int32_t ldb = (BT ^ b_order[0]==1) ? N : K;
|
||||||
int32_t ldc = N;
|
int32_t ldc = N;
|
||||||
@@ -148,20 +148,20 @@ void triton_dot(drv::stream* stream, bool AT, bool BT,
|
|||||||
// metrics
|
// metrics
|
||||||
if(mode == BENCH){
|
if(mode == BENCH){
|
||||||
auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; };
|
auto tflops = [&](double nanosec) { return 2.*M*N*K / nanosec * 1e-3; };
|
||||||
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream);}, stream);
|
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream, device);}, stream);
|
||||||
bench.push_back(tflops(triton_ns));
|
bench.push_back(tflops(triton_ns));
|
||||||
|
|
||||||
// cublas
|
// // cublas
|
||||||
if(cublas::cublasinit()){
|
// if(cublas::cublasinit()){
|
||||||
T alpha(static_cast<double>(1));
|
// T alpha(static_cast<double>(1));
|
||||||
T beta(static_cast<double>(0));
|
// T beta(static_cast<double>(0));
|
||||||
cublasGemmAlgo_t fastest;
|
// cublasGemmAlgo_t fastest;
|
||||||
cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest);
|
// cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K, &alpha, &*da, lda, &*db, ldb, &beta, &*dc, ldc, &fastest);
|
||||||
double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K,
|
// double cublas_ms = triton::tools::bench([&]() { cublasGemm(CUDA_R_16F, stream, AT, BT, M, N, K,
|
||||||
&alpha, &*da, lda, &*db, ldb, &beta, &*dc,
|
// &alpha, &*da, lda, &*db, ldb, &beta, &*dc,
|
||||||
ldc, nullptr, fastest); }, stream);
|
// ldc, nullptr, fastest); }, stream);
|
||||||
bench.push_back(tflops(cublas_ms));
|
// bench.push_back(tflops(cublas_ms));
|
||||||
}
|
// }
|
||||||
}
|
}
|
||||||
|
|
||||||
// test triton
|
// test triton
|
||||||
@@ -179,7 +179,7 @@ void triton_dot(drv::stream* stream, bool AT, bool BT,
|
|||||||
stream->write(&*da, true, 0, ha);
|
stream->write(&*da, true, 0, ha);
|
||||||
stream->write(&*db, true, 0, hb);
|
stream->write(&*db, true, 0, hb);
|
||||||
// run kernel
|
// run kernel
|
||||||
function((void**)&args, sizeof(args), grid, stream);
|
function((void**)&args, sizeof(args), grid, stream, device);
|
||||||
// write back
|
// write back
|
||||||
stream->synchronize();
|
stream->synchronize();
|
||||||
// compare with CPU
|
// compare with CPU
|
||||||
@@ -190,21 +190,21 @@ void triton_dot(drv::stream* stream, bool AT, bool BT,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<double> bench_dot(drv::stream* stream,
|
std::vector<double> bench_dot(drv::context* context, drv::stream* stream,
|
||||||
dtype_t dtype, bool AT, bool BT,
|
dtype_t dtype, bool AT, bool BT,
|
||||||
int32_t M, int32_t N, int32_t K,
|
int32_t M, int32_t N, int32_t K,
|
||||||
const std::vector<int>& a_order, const std::vector<int>& b_order) {
|
const std::vector<int>& a_order, const std::vector<int>& b_order) {
|
||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
bool test;
|
bool test;
|
||||||
switch(dtype){
|
switch(dtype){
|
||||||
case HALF: triton_dot<half_float::half>(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
case HALF: triton_dot<half_float::half>(context, stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
||||||
case FLOAT: triton_dot<float>(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
case FLOAT: triton_dot<float>(context, stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
||||||
case DOUBLE: triton_dot<double>(stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
case DOUBLE: triton_dot<double>(context, stream, AT, BT, M, N, K, 0, 0, 0, 0, a_order, b_order, BENCH, bench, test); break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
return bench;
|
return bench;
|
||||||
}
|
}
|
||||||
bool test_dot(drv::stream* stream,
|
bool test_dot(drv::context* context, drv::stream* stream,
|
||||||
dtype_t dtype, bool AT, bool BT,
|
dtype_t dtype, bool AT, bool BT,
|
||||||
int32_t M, int32_t N, int32_t K,
|
int32_t M, int32_t N, int32_t K,
|
||||||
const std::vector<int>& a_order, const std::vector<int>& b_order,
|
const std::vector<int>& a_order, const std::vector<int>& b_order,
|
||||||
@@ -212,9 +212,9 @@ bool test_dot(drv::stream* stream,
|
|||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
bool test = false;
|
bool test = false;
|
||||||
switch(dtype){
|
switch(dtype){
|
||||||
case HALF: triton_dot<half_float::half>(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
case HALF: triton_dot<half_float::half>(context, stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
||||||
case FLOAT: triton_dot<float>(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
case FLOAT: triton_dot<float>(context, stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
||||||
case DOUBLE: triton_dot<double>(stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
case DOUBLE: triton_dot<double>(context, stream, AT, BT, M, N, K, TM, TN, TK, nwarp, a_order, b_order, TEST, bench, test); break;
|
||||||
default: break;
|
default: break;
|
||||||
}
|
}
|
||||||
return test;
|
return test;
|
||||||
|
@@ -53,7 +53,7 @@ enum run_mode_t {
|
|||||||
TEST
|
TEST
|
||||||
};
|
};
|
||||||
|
|
||||||
void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape_x,
|
void triton_reduce_nd(drv::context* context, drv::stream* stream, const std::vector<int32_t>& shape_x,
|
||||||
int axis, reduce_op_t op,
|
int axis, reduce_op_t op,
|
||||||
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
||||||
std::vector<std::vector<std::string>> TS,
|
std::vector<std::vector<std::string>> TS,
|
||||||
@@ -61,7 +61,7 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape_x,
|
|||||||
typedef float NumericT;
|
typedef float NumericT;
|
||||||
std::string ty = "float";
|
std::string ty = "float";
|
||||||
size_t dtsize = sizeof(NumericT);
|
size_t dtsize = sizeof(NumericT);
|
||||||
drv::context* context = stream->context();
|
drv::device* device = context->device();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -141,7 +141,7 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape_x,
|
|||||||
// metrics
|
// metrics
|
||||||
if(mode == BENCH){
|
if(mode == BENCH){
|
||||||
auto gbps = [&](double ns) { return 2 * size_x * dtsize / (ns * 1e-9) * 1e-9; };
|
auto gbps = [&](double ns) { return 2 * size_x * dtsize / (ns * 1e-9) * 1e-9; };
|
||||||
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream);}, stream);
|
double triton_ns = triton::tools::bench([&]() { function((void**)&args, sizeof(args), grid, stream, device);}, stream);
|
||||||
bench.push_back(gbps(triton_ns));
|
bench.push_back(gbps(triton_ns));
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -153,7 +153,7 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape_x,
|
|||||||
init_zeros(hy);
|
init_zeros(hy);
|
||||||
init_rand(hx);
|
init_rand(hx);
|
||||||
stream->write(&*dx, true, 0, hx);
|
stream->write(&*dx, true, 0, hx);
|
||||||
function((void**)&args, sizeof(args), grid, stream);
|
function((void**)&args, sizeof(args), grid, stream, device);
|
||||||
stream->synchronize();
|
stream->synchronize();
|
||||||
stream->read(&*dy, true, 0, hy);
|
stream->read(&*dy, true, 0, hy);
|
||||||
cc_reduce_nd(ry, hx, op, axis, shape_x);
|
cc_reduce_nd(ry, hx, op, axis, shape_x);
|
||||||
@@ -161,12 +161,12 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape_x,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
|
bool do_test(drv::context* context, drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
|
||||||
std::vector<double> bench;
|
std::vector<double> bench;
|
||||||
bool test;
|
bool test;
|
||||||
std::vector<std::vector<std::string>> TSS;
|
std::vector<std::vector<std::string>> TSS;
|
||||||
for(int32_t d: shape)
|
for(int32_t d: shape)
|
||||||
TSS.push_back({std::to_string(d)});
|
TSS.push_back({std::to_string(d)});
|
||||||
triton_reduce_nd(stream, shape, axis, op, {0, 1, 2}, {0, 1, 2}, TSS, TEST, bench, test);
|
triton_reduce_nd(context, stream, shape, axis, op, {0, 1, 2}, {0, 1, 2}, TSS, TEST, bench, test);
|
||||||
return test;
|
return test;
|
||||||
}
|
}
|
||||||
|
@@ -8,7 +8,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to benchmark
|
// shapes to benchmark
|
||||||
typedef std::tuple<std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>> config_t;
|
typedef std::tuple<std::vector<int>, std::vector<int>, std::vector<int>, std::vector<int>> config_t;
|
||||||
std::vector<config_t> configs;
|
std::vector<config_t> configs;
|
||||||
@@ -50,7 +50,7 @@ int main() {
|
|||||||
bool result = true;
|
bool result = true;
|
||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(shape, tile, ord_x, ord_y) = c;
|
std::tie(shape, tile, ord_x, ord_y) = c;
|
||||||
bool pass = test_copy_nd(stream, FLOAT, shape, tile, ord_x, ord_y);
|
bool pass = test_copy_nd(context, stream, FLOAT, shape, tile, ord_x, ord_y);
|
||||||
result = result && pass;
|
result = result && pass;
|
||||||
std::cout << "// " << c << ", " << pass << std::endl;
|
std::cout << "// " << c << ", " << pass << std::endl;
|
||||||
}
|
}
|
||||||
|
@@ -6,7 +6,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to test
|
// shapes to test
|
||||||
typedef std::tuple<dtype_t, bool, bool, int, int, int, int, int, int, int> config_t;
|
typedef std::tuple<dtype_t, bool, bool, int, int, int, int, int, int, int> config_t;
|
||||||
std::vector<config_t> configs;
|
std::vector<config_t> configs;
|
||||||
@@ -25,7 +25,7 @@ int main() {
|
|||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(dtype, AT, BT, M, N, K, TM, TN, TK, nwarp) = c;
|
std::tie(dtype, AT, BT, M, N, K, TM, TN, TK, nwarp) = c;
|
||||||
std::cout << "Testing " << c << " ... " << std::flush;
|
std::cout << "Testing " << c << " ... " << std::flush;
|
||||||
if(test_dot(stream, dtype, AT, BT, M, N, K, {0, 1}, {0, 1}, TM, TN, TK, (size_t)nwarp))
|
if(test_dot(context, stream, dtype, AT, BT, M, N, K, {0, 1}, {0, 1}, TM, TN, TK, (size_t)nwarp))
|
||||||
std::cout << " Pass! " << std::endl;
|
std::cout << " Pass! " << std::endl;
|
||||||
else{
|
else{
|
||||||
std::cout << " Fail! " << std::endl;
|
std::cout << " Fail! " << std::endl;
|
||||||
|
@@ -16,7 +16,7 @@
|
|||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
auto context = triton::driver::backend::contexts::get_default();
|
auto context = triton::driver::backend::contexts::get_default();
|
||||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
triton::driver::stream* stream = triton::driver::stream::create(context->backend());
|
||||||
// shapes to benchmark
|
// shapes to benchmark
|
||||||
typedef std::tuple<std::vector<int>, int, reduce_op_t> config_t;
|
typedef std::tuple<std::vector<int>, int, reduce_op_t> config_t;
|
||||||
std::vector<config_t> configs = {
|
std::vector<config_t> configs = {
|
||||||
@@ -34,7 +34,7 @@ int main() {
|
|||||||
for(const auto& c: configs){
|
for(const auto& c: configs){
|
||||||
std::tie(shape, axis, op) = c;
|
std::tie(shape, axis, op) = c;
|
||||||
std::cout << "Testing " << c << " ... " << std::flush;
|
std::cout << "Testing " << c << " ... " << std::flush;
|
||||||
if(do_test(stream, shape, axis, op, 1))
|
if(do_test(context, stream, shape, axis, op, 1))
|
||||||
std::cout << " Pass! " << std::endl;
|
std::cout << " Pass! " << std::endl;
|
||||||
else
|
else
|
||||||
std::cout << " Fail! " << std::endl;
|
std::cout << " Fail! " << std::endl;
|
||||||
|
Reference in New Issue
Block a user