[driver] added more genericity for opencl support
This commit is contained in:
@@ -135,7 +135,7 @@ int main() {
|
||||
|
||||
|
||||
// benchmark a given matrix multiplication kernel
|
||||
auto benchmark = [&](triton::driver::cu_kernel kernel,
|
||||
auto benchmark = [&](triton::driver::kernel* kernel,
|
||||
triton::jit::launch_information info) {
|
||||
// launch info
|
||||
unsigned TM = info.global_range_size[0];
|
||||
@@ -153,20 +153,20 @@ int main() {
|
||||
unsigned last_safe_b = (BT==true)?(N*K - 1 - lastj)/N - lastk : N*K - 1 - lastj*K - lastk;
|
||||
int32_t bound = std::max<unsigned>(1, std::max(K - last_safe_a, K - last_safe_b));
|
||||
// set argument
|
||||
kernel.setArg(0, da);
|
||||
kernel.setArg(1, db);
|
||||
kernel.setArg(2, dc);
|
||||
kernel.setArg(3, M);
|
||||
kernel.setArg(4, N);
|
||||
kernel.setArg(5, K);
|
||||
kernel.setArg(6, bound);
|
||||
kernel->setArg(0, da);
|
||||
kernel->setArg(1, db);
|
||||
kernel->setArg(2, dc);
|
||||
kernel->setArg(3, M);
|
||||
kernel->setArg(4, N);
|
||||
kernel->setArg(5, K);
|
||||
kernel->setArg(6, bound);
|
||||
// dry run
|
||||
stream.enqueue(kernel, grid, {nthreads, 1, 1});
|
||||
stream.synchronize();
|
||||
// benchmark
|
||||
double ts = bench([&](){stream.enqueue(kernel, grid, {nthreads, 1, 1});},
|
||||
[&](){ stream.synchronize(); },
|
||||
context->device());
|
||||
(triton::driver::cu_device&)*context->device());
|
||||
ts = ts * 1e-9;
|
||||
double tflops = 2*M*N*K / ts * 1e-12;
|
||||
return tflops;
|
||||
@@ -186,7 +186,7 @@ int main() {
|
||||
jit.add_module(src, params);
|
||||
triton::driver::cu_kernel kernel = jit.get_function("matmul");
|
||||
triton::jit::launch_information info = jit.get_launch_info("matmul");
|
||||
std::cout << benchmark(kernel, info) << std::endl;
|
||||
std::cout << benchmark(&kernel, info) << std::endl;
|
||||
stream.read(dc, true, 0, hc);
|
||||
simple_gemm(rc, ha, hb, M, N, K);
|
||||
for(size_t i = 0; i < M*N; i++)
|
||||
|
@@ -28,6 +28,10 @@
|
||||
#include <vector>
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
@@ -81,7 +85,7 @@ struct backend
|
||||
|
||||
public:
|
||||
static void release();
|
||||
static driver::module* get(driver::stream* stream, std::string const & name, std::string const &src);
|
||||
static driver::module* get(driver::stream* stream, std::string const & name, llvm::Module *src);
|
||||
|
||||
private:
|
||||
static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
|
||||
|
@@ -40,6 +40,8 @@ public:
|
||||
context(driver::device *dev, cl_context cl, bool take_ownership);
|
||||
driver::device* device() const;
|
||||
std::string const & cache_path() const;
|
||||
// factory methods
|
||||
static context* create(driver::device *dev);
|
||||
|
||||
protected:
|
||||
driver::device* dev_;
|
||||
|
@@ -32,6 +32,8 @@ namespace triton
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class context;
|
||||
|
||||
// Base device
|
||||
class device: public polymorphic_resource<CUdevice, cl_device_id>{
|
||||
public:
|
||||
|
@@ -81,6 +81,7 @@ class polymorphic_resource {
|
||||
public:
|
||||
polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership){}
|
||||
polymorphic_resource(CLType cl, bool take_ownership): cl_(cl, take_ownership){}
|
||||
virtual ~polymorphic_resource() { }
|
||||
|
||||
handle<CUType> cu() { return cu_; }
|
||||
handle<CLType> cl() { return cl_; }
|
||||
|
@@ -41,14 +41,27 @@ class kernel: public polymorphic_resource<CUfunction, cl_kernel> {
|
||||
public:
|
||||
kernel(driver::module* program, CUfunction fn, bool has_ownership);
|
||||
kernel(driver::module* program, cl_kernel fn, bool has_ownership);
|
||||
// Getters
|
||||
driver::module* module();
|
||||
|
||||
// Factory methods
|
||||
static kernel* create(driver::module* program, const char* name);
|
||||
// Arguments setters
|
||||
virtual void setArg(unsigned int index, std::size_t size, void* ptr) = 0;
|
||||
virtual void setArg(unsigned int index, buffer *) = 0;
|
||||
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
|
||||
private:
|
||||
driver::module* program_;
|
||||
};
|
||||
|
||||
// OpenCL
|
||||
class ocl_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
ocl_kernel(driver::module* program, const char* name);
|
||||
// Arguments setters
|
||||
void setArg(unsigned int index, std::size_t size, void* ptr);
|
||||
void setArg(unsigned int index, driver::buffer* buffer);
|
||||
|
||||
};
|
||||
|
||||
// CUDA
|
||||
@@ -56,10 +69,9 @@ class cu_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
cu_kernel(driver::module* program, const char * name);
|
||||
//Arguments setters
|
||||
// Arguments setters
|
||||
void setArg(unsigned int index, std::size_t size, void* ptr);
|
||||
void setArg(unsigned int index, cu_buffer const &);
|
||||
template<class T> void setArg(unsigned int index, T value) { setArg(index, sizeof(T), (void*)&value); }
|
||||
void setArg(unsigned int index, driver::buffer* buffer);
|
||||
//Arguments getters
|
||||
void* const* cu_params() const;
|
||||
|
||||
|
@@ -31,6 +31,8 @@
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
template<class T>
|
||||
class SmallVectorImpl;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
@@ -42,20 +44,34 @@ namespace driver
|
||||
class cu_context;
|
||||
class cu_device;
|
||||
|
||||
// Base
|
||||
// Backend-agnostic module: wraps either a CUmodule or a cl_program and
// remembers the context it was created for.
class module: public polymorphic_resource<CUmodule, cl_program> {
public:
  // Wrap an existing CUDA module handle.
  module(driver::context* ctx, CUmodule mod, bool has_ownership);
  // Wrap an existing OpenCL program handle.
  module(driver::context* ctx, cl_program mod, bool has_ownership);

  // Factory: picks the concrete module type from the dynamic type of `ctx`.
  static module* create(driver::context* ctx, llvm::Module *src);

  // Context this module belongs to.
  driver::context* context() const;

  // Lowers `module` to target assembly for the given triple / processor and
  // writes the result into `buffer`. An empty `layout` keeps the data layout
  // already attached to `module`.
  void compile_llvm_module(llvm::Module* module, const std::string& triple,
                           const std::string &proc, std::string layout,
                           llvm::SmallVectorImpl<char> &buffer);

protected:
  // One-time registration of the LLVM targets.
  void init_llvm();

protected:
  driver::context* ctx_;
};
|
||||
|
||||
// OpenCL
|
||||
class ocl_module: public module{
|
||||
|
||||
public:
|
||||
ocl_module(driver::context* context, llvm::Module *module);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_module: public module {
|
||||
static std::string header(driver::cu_device const & device);
|
||||
std::string compile_llvm_module(llvm::Module* module);
|
||||
void init_llvm();
|
||||
|
||||
public:
|
||||
cu_module(driver::context* context, llvm::Module *module);
|
||||
|
@@ -35,7 +35,7 @@ namespace triton
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class cu_kernel;
|
||||
class kernel;
|
||||
class Event;
|
||||
class Range;
|
||||
class cu_buffer;
|
||||
@@ -45,6 +45,9 @@ class stream: public polymorphic_resource<CUstream, cl_command_queue> {
|
||||
public:
|
||||
stream(driver::context *ctx, CUstream, bool has_ownership);
|
||||
stream(driver::context *ctx, cl_command_queue, bool has_ownership);
|
||||
// factory
|
||||
static driver::stream* create(driver::context* ctx);
|
||||
// accessors
|
||||
driver::context* context() const;
|
||||
virtual void synchronize() = 0;
|
||||
|
||||
@@ -73,7 +76,7 @@ public:
|
||||
void synchronize();
|
||||
|
||||
//Enqueue
|
||||
void enqueue(cu_kernel const & cu_kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
|
||||
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const * = NULL, Event *event = NULL);
|
||||
|
||||
// Write
|
||||
void write(driver::cu_buffer const & cu_buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
|
||||
|
@@ -99,10 +99,11 @@ void backend::modules::release(){
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::module* backend::modules::get(driver::stream* stream, std::string const & name, std::string const & src){
|
||||
driver::module* backend::modules::get(driver::stream* stream, std::string const & name, llvm::Module* src){
|
||||
std::tuple<driver::stream*, std::string> key(stream, name);
|
||||
if(cache_.find(key)==cache_.end())
|
||||
return &*cache_.insert(std::make_pair(key, new driver::cu_module(((driver::cu_stream*)stream)->context(), src))).first->second;
|
||||
if(cache_.find(key)==cache_.end()){
|
||||
return &*cache_.insert({key, driver::module::create(stream->context(), src)}).first->second;
|
||||
}
|
||||
return &*cache_.at(key);
|
||||
}
|
||||
|
||||
@@ -120,8 +121,9 @@ void backend::kernels::release(){
|
||||
|
||||
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
|
||||
std::tuple<driver::module*, std::string> key(mod, name);
|
||||
if(cache_.find(key)==cache_.end())
|
||||
return &*cache_.insert(std::make_pair(key, new driver::cu_kernel((driver::cu_module*)mod, name.c_str()))).first->second;
|
||||
if(cache_.find(key)==cache_.end()){
|
||||
return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
|
||||
}
|
||||
return cache_.at(key);
|
||||
}
|
||||
|
||||
@@ -134,7 +136,7 @@ std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::ker
|
||||
void backend::streams::init(std::list<driver::context*> const & contexts){
|
||||
for(driver::context* ctx : contexts)
|
||||
if(cache_.find(ctx)==cache_.end())
|
||||
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{new driver::cu_stream(ctx)}));
|
||||
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx)}));
|
||||
}
|
||||
|
||||
void backend::streams::release(){
|
||||
@@ -168,7 +170,7 @@ std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache
|
||||
|
||||
void backend::contexts::init(std::vector<driver::device*> const & devices){
|
||||
for(driver::device* dvc: devices)
|
||||
cache_.push_back(new cu_context(dvc));
|
||||
cache_.push_back(driver::context::create(dvc));
|
||||
}
|
||||
|
||||
void backend::contexts::release(){
|
||||
|
@@ -50,6 +50,15 @@ context::context(driver::device *dev, cl_context cl, bool take_ownership):
|
||||
|
||||
}
|
||||
|
||||
// Factory method: returns a newly allocated context whose concrete type
// matches the dynamic type of `dev` (CUDA is probed first, then OpenCL).
// Throws std::runtime_error when `dev` is neither kind of device.
context* context::create(driver::device *dev){
  const bool is_cuda = dynamic_cast<driver::cu_device*>(dev) != nullptr;
  if(is_cuda){
    return new cu_context(dev);
  }

  const bool is_opencl = dynamic_cast<driver::ocl_device*>(dev) != nullptr;
  if(is_opencl){
    return new ocl_context(dev);
  }

  throw std::runtime_error("unknown context");
}
|
||||
|
||||
|
||||
driver::device* context::device() const {
|
||||
return dev_;
|
||||
}
|
||||
|
@@ -27,6 +27,7 @@
|
||||
#include <memory>
|
||||
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
@@ -35,11 +36,16 @@ namespace driver
|
||||
{
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// OpenCL //
|
||||
/* ------------------------ */
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
// Architecture
|
||||
// architecture
|
||||
cu_device::Architecture cu_device::nv_arch(std::pair<unsigned int, unsigned int> sm) const {
|
||||
switch(sm.first) {
|
||||
case 7:
|
||||
|
@@ -33,6 +33,12 @@ namespace driver
|
||||
//OpenCL
|
||||
inline void _delete(cl_platform_id) { }
|
||||
inline void _delete(cl_device_id x) { dispatch::clReleaseDevice(x); }
|
||||
inline void _delete(cl_context x) { dispatch::clReleaseContext(x); }
|
||||
inline void _delete(cl_program x) { dispatch::clReleaseProgram(x); }
|
||||
inline void _delete(cl_kernel x) { dispatch::clReleaseKernel(x); }
|
||||
inline void _delete(cl_command_queue x) { dispatch::clReleaseCommandQueue(x); }
|
||||
inline void _delete(cl_mem x) { dispatch::clReleaseMemObject(x); }
|
||||
|
||||
//CUDA
|
||||
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
|
||||
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
|
||||
@@ -67,6 +73,11 @@ template class handle<CUPlatform>;
|
||||
|
||||
template class handle<cl_platform_id>;
|
||||
template class handle<cl_device_id>;
|
||||
template class handle<cl_context>;
|
||||
template class handle<cl_program>;
|
||||
template class handle<cl_command_queue>;
|
||||
template class handle<cl_mem>;
|
||||
template class handle<cl_kernel>;
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -45,6 +45,14 @@ kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership):
|
||||
polymorphic_resource(fn, has_ownership), program_(program){
|
||||
}
|
||||
|
||||
// Factory method: instantiates the kernel class that corresponds to the
// dynamic type of `program` and returns it as a newly allocated object.
// Throws std::runtime_error when `program` is neither a CUDA nor an OpenCL module.
kernel* kernel::create(driver::module* program, const char* name) {
  if(dynamic_cast<driver::cu_module*>(program) != nullptr){
    return new cu_kernel(program, name);
  }
  else if(dynamic_cast<driver::ocl_module*>(program) != nullptr){
    return new ocl_kernel(program, name);
  }
  else{
    throw std::runtime_error("unknown program");
  }
}
|
||||
|
||||
driver::module* kernel::module() {
|
||||
return program_;
|
||||
}
|
||||
@@ -53,6 +61,19 @@ driver::module* kernel::module() {
|
||||
// OpenCL //
|
||||
/* ------------------------ */
|
||||
|
||||
// Creates the OpenCL kernel called `name` from `program` and stores it in the
// handle owned by this object (the base class is first initialised with a
// default-constructed cl_kernel and ownership enabled).
ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(program, cl_kernel(), true) {
  // NOTE(review): the status written here is never inspected.
  cl_int status;
  auto created = dispatch::clCreateKernel(*program->cl(), name, &status);
  *cl_ = created;
}
|
||||
|
||||
// Sets kernel argument `index` from the `size` bytes located at `ptr`
// by forwarding directly to clSetKernelArg.
void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr) {
  dispatch::clSetKernelArg(
    *cl_,
    index,
    size,
    ptr
  );
}
|
||||
|
||||
void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) {
|
||||
dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl());
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
@@ -74,8 +95,8 @@ void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){
|
||||
cu_params_[index] = cu_params_store_[index].get();
|
||||
}
|
||||
|
||||
void cu_kernel::setArg(unsigned int index, cu_buffer const & data)
|
||||
{ return setArg(index, data.cu());}
|
||||
void cu_kernel::setArg(unsigned int index, driver::buffer* data)
|
||||
{ return kernel::setArg(index, *data->cu());}
|
||||
|
||||
void* const* cu_kernel::cu_params() const
|
||||
{ return cu_params_.data(); }
|
||||
|
@@ -50,6 +50,18 @@ namespace driver
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
void module::init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
llvm::InitializeAllTargetInfos();
|
||||
llvm::InitializeAllTargets();
|
||||
llvm::InitializeAllTargetMCs();
|
||||
llvm::InitializeAllAsmParsers();
|
||||
llvm::InitializeAllAsmPrinters();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
module::module(driver::context* ctx, CUmodule mod, bool has_ownership)
|
||||
: polymorphic_resource(mod, has_ownership), ctx_(ctx) {
|
||||
}
|
||||
@@ -62,26 +74,56 @@ driver::context* module::context() const {
|
||||
return ctx_;
|
||||
}
|
||||
|
||||
// Factory method: builds the module type that matches the dynamic type of
// `ctx` from the LLVM module `src` and returns it as a newly allocated object.
// Throws std::runtime_error when `ctx` is neither a CUDA nor an OpenCL context.
module* module::create(driver::context* ctx, llvm::Module *src) {
  if(dynamic_cast<driver::cu_context*>(ctx))
    return new cu_module(ctx, src);
  // The OpenCL branch must test for an OpenCL *context*: `ctx` is a context,
  // so a cast to ocl_device could never succeed and every OpenCL context
  // used to fall through to the "unknown context" error below.
  if(dynamic_cast<driver::ocl_context*>(ctx))
    return new ocl_module(ctx, src);
  throw std::runtime_error("unknown context");
}
|
||||
|
||||
// Lowers `module` to textual assembly for the requested target.
//
//   module : LLVM module to compile; its target triple and data layout are
//            overwritten by this call.
//   triple : target triple to compile for.
//   proc   : target processor name handed to the target machine.
//   layout : data layout string; when empty, the layout already attached to
//            `module` is kept.
//   buffer : receives the emitted assembly.
//
// Throws std::runtime_error when the target cannot be found, when no target
// machine can be created for it, or when the target cannot emit assembly.
void module::compile_llvm_module(llvm::Module* module, const std::string& triple,
                                 const std::string &proc, std::string layout,
                                 llvm::SmallVectorImpl<char> &buffer) {
  init_llvm();

  // create machine
  module->setTargetTriple(triple);
  std::string error;
  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
  // lookupTarget reports failure through a null result plus `error`;
  // dereferencing the result unchecked would crash on an unknown triple.
  if(!target)
    throw std::runtime_error("unable to find LLVM target for triple '" + triple + "': " + error);
  // NOTE(review): the target machine is never released in this function.
  llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, "",
                                                             llvm::TargetOptions(), llvm::Reloc::Model(),
                                                             llvm::None, llvm::CodeGenOpt::Aggressive);
  if(!machine)
    throw std::runtime_error("unable to create LLVM target machine for triple '" + triple + "'");

  // set data layout
  if(layout.empty())
    layout = module->getDataLayoutStr();
  module->setDataLayout(layout);

  // emit machine code
  llvm::legacy::PassManager pass;
  llvm::raw_svector_ostream stream(buffer);
  // addPassesToEmitFile returns true when the file type is not supported
  if(machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile))
    throw std::runtime_error("LLVM target for triple '" + triple + "' cannot emit assembly");
  pass.run(*module);
}
|
||||
|
||||
/* ------------------------ */
|
||||
// OpenCL //
|
||||
/* ------------------------ */
|
||||
|
||||
// Compiles `src` to AMDGPU assembly and then unconditionally throws:
// turning the generated assembly into an OpenCL program is not implemented.
ocl_module::ocl_module(driver::context * context, llvm::Module* src): module(context, cl_program(), true) {
  init_llvm();

  llvm::SmallVector<char, 0> assembly;
  module::compile_llvm_module(src,
                              "amdgcn-amd-amdpal",
                              "gfx902",
                              "",
                              assembly);

  throw std::runtime_error("need to implement opencl module creation");
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string cu_module::compile_llvm_module(llvm::Module* module) {
|
||||
init_llvm();
|
||||
// create machine
|
||||
module->setTargetTriple("nvptx64-nvidia-cuda");
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), "sm_52", "",
|
||||
llvm::TargetOptions(), llvm::Reloc::Model(),
|
||||
llvm::None, llvm::CodeGenOpt::Aggressive);
|
||||
|
||||
// set data layout
|
||||
std::string layout = "e";
|
||||
bool is_64bit = true;
|
||||
@@ -91,28 +133,13 @@ std::string cu_module::compile_llvm_module(llvm::Module* module) {
|
||||
else if (use_short_pointers)
|
||||
layout += "-p3:32:32-p4:32:32-p5:32:32";
|
||||
layout += "-i64:64-i128:128-v16:16-v32:32-n16:32:64";
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
llvm::legacy::PassManager pass;
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile);
|
||||
pass.run(*module);
|
||||
// done
|
||||
module::compile_llvm_module(module, "nvptx64-nvidia-cuda", "sm_52", layout, buffer);
|
||||
return std::string(buffer.begin(), buffer.end());
|
||||
}
|
||||
|
||||
void cu_module::init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
llvm::InitializeAllTargetInfos();
|
||||
llvm::InitializeAllTargets();
|
||||
llvm::InitializeAllTargetMCs();
|
||||
llvm::InitializeAllAsmParsers();
|
||||
llvm::InitializeAllAsmPrinters();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
cu_module::cu_module(driver::context * context, llvm::Module* ll_module): cu_module(context, compile_llvm_module(ll_module)) { }
|
||||
|
||||
|
@@ -52,6 +52,15 @@ stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership)
|
||||
|
||||
}
|
||||
|
||||
// Factory method: returns a newly allocated stream whose concrete type matches
// the dynamic type of `ctx` (CUDA is probed first, then OpenCL).
// Throws std::runtime_error when `ctx` is neither kind of context.
driver::stream* stream::create(driver::context* ctx) {
  const bool is_cuda = dynamic_cast<driver::cu_context*>(ctx) != nullptr;
  if(is_cuda){
    return new cu_stream(ctx);
  }

  const bool is_opencl = dynamic_cast<driver::ocl_context*>(ctx) != nullptr;
  if(is_opencl){
    return new cl_stream(ctx);
  }

  throw std::runtime_error("unknown context");
}
|
||||
|
||||
|
||||
driver::context* stream::context() const {
|
||||
return ctx_;
|
||||
}
|
||||
@@ -61,6 +70,10 @@ driver::context* stream::context() const {
|
||||
// OpenCL //
|
||||
/* ------------------------ */
|
||||
|
||||
// Creates an OpenCL command queue on the device of `ctx` (properties = 0) and
// stores it in the handle owned by this object.
cl_stream::cl_stream(driver::context *ctx): stream(ctx, cl_command_queue(), true) {
  // NOTE(review): the status written here is never inspected.
  cl_int status;
  auto queue = dispatch::clCreateCommandQueue(*ctx->cl(), *ctx->device()->cl(), 0, &status);
  *cl_ = queue;
}
|
||||
|
||||
void cl_stream::synchronize() {
|
||||
dispatch::clFinish(*cl_);
|
||||
@@ -91,11 +104,12 @@ void cu_stream::synchronize() {
|
||||
dispatch::cuStreamSynchronize(*cu_);
|
||||
}
|
||||
|
||||
void cu_stream::enqueue(driver::cu_kernel const & kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const *, Event* event) {
|
||||
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<Event> const *, Event* event) {
|
||||
driver::cu_kernel* cu_kernel = (driver::cu_kernel*)kernel;
|
||||
cu_context::context_switcher ctx_switch(*ctx_);
|
||||
if(event)
|
||||
dispatch::cuEventRecord(event->cu()->first, *cu_);
|
||||
dispatch::cuLaunchKernel(*kernel.cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL);
|
||||
dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)cu_kernel->cu_params(), NULL);
|
||||
if(event)
|
||||
dispatch::cuEventRecord(event->cu()->second, *cu_);
|
||||
}
|
||||
|
Reference in New Issue
Block a user