[GENERAL] Removed deprecated driver files and added basic compatibility with rocm (#268)

- Removed driver module -- accelerator runtime is handled by pytorch
- Added basic support for ROCM based on @micmelesse 's PR -- now can execute empty kernel on AMD devices without any compile-time changes
- Now only using PREFER_SHARED for kernels when the size of shared memory is greater than 49k. Otherwise there can be poor L1 performance for broadcast tensors
This commit is contained in:
Philippe Tillet
2021-09-09 00:04:28 -07:00
committed by GitHub
parent 8bedcce9be
commit 94c83d30ce
47 changed files with 1376 additions and 30232 deletions

View File

@@ -13,45 +13,40 @@
#include "triton/codegen/transform/peephole.h"
#include "triton/codegen/transform/pipeline.h"
#include "triton/codegen/transform/prefetch.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/module.h"
#include "triton/ir/function.h"
#include "triton/ir/module.h"
#include "triton/ir/print.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
namespace triton {
namespace codegen {
// TODO:
// There should be a proper pass manager there!
void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache,
driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) {
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
// generate llvm code
llvm::LLVMContext ctx;
std::string name = ir.get_function_list()[0]->get_name();
std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
// optimizations
std::unique_ptr<codegen::target> target = dev->make_target();
bool cts_use_async = target->as_nvidia()->sm() >= 80;
bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
// create passes
codegen::analysis::align align;
codegen::analysis::axes axes;
codegen::transform::cts cts(cts_use_async);
codegen::transform::pipeline pipeline(cts_use_async, num_stages);
codegen::transform::disassociate disassociate;
codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get());
codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
codegen::analysis::liveness liveness(&layouts);
codegen::analysis::swizzle swizzle(&layouts, target.get());
codegen::analysis::swizzle swizzle(&layouts, target);
codegen::analysis::allocation allocation(&liveness);
codegen::transform::dce dce;
codegen::transform::peephole peephole(target.get(), &layouts);
// codegen::transform::reassociate reassociate;
codegen::transform::peephole peephole(target, &layouts);
codegen::transform::coalesce coalesce(&align, &layouts);
codegen::transform::prefetch prefetch_s(target.get());
codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get());
codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache);
codegen::transform::prefetch prefetch_s(target);
codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
// run passes
dce.run(ir);
peephole.run(ir);
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
layouts.run(ir);
coalesce.run(ir);
dce.run(ir);
// exit(1);
align.run(ir);
dce.run(ir);
if (target->is_gpu())
cts.run(ir);
dce.run(ir);
align.run(ir);
// ir::print(ir, std::cout);
axes.run(ir);
layouts.run(ir);
peephole.run(ir);
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
allocation.run(ir);
prefetch_s.run(ir);
barriers.run(ir);
// ir.print(std::cout);
isel.visit(ir, *llvm);
mod = driver::module::create(dev, std::move(llvm));
ker = driver::kernel::create(&*mod, name.c_str());
shared_mem = allocation.allocated_size();
shared_static = allocation.allocated_size();
return llvm;
}
} // namespace codegen

View File

@@ -1,231 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <vector>
#include <stdexcept>
#include "triton/driver/dispatch.h"
#include "triton/driver/backend.h"
#include "triton/driver/buffer.h"
#include "triton/driver/context.h"
#include "triton/driver/stream.h"
#include "triton/driver/kernel.h"
namespace triton
{
namespace driver
{
/*-----------------------------------*/
//----------- Platforms ------------*/
/*-----------------------------------*/
void backend::platforms::init() {
if(!cache_.empty())
return;
//if CUDA is here
if(dispatch::cuinit()){
cache_.push_back(new cu_platform());
}
//if host should be added
bool host_visible = true;
if(host_visible){
cache_.push_back(new host_platform());
}
// //if OpenCL is here
// if(dispatch::clinit()){
// cl_uint num_platforms;
// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
// std::vector<cl_platform_id> ids(num_platforms);
// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
// for(cl_platform_id id: ids)
// cache_.push_back(new cl_platform(id));
// }
if(cache_.empty())
throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
}
void backend::platforms::get(std::vector<platform *> &results) {
std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
}
std::vector<driver::platform*> backend::platforms::cache_;
/*-----------------------------------*/
//----------- Devices --------------*/
/*-----------------------------------*/
void backend::devices::init(std::vector<platform*> const & platforms) {
if(!cache_.empty())
return;
for(driver::platform* pf: platforms)
pf->devices(cache_);
if(cache_.empty())
throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
}
void backend::devices::get(std::vector<device*> &devs) {
std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
}
std::vector<driver::device*> backend::devices::cache_;
/*-----------------------------------*/
//---------- Modules ----------------*/
/*-----------------------------------*/
void backend::modules::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
std::map<std::tuple<driver::stream*, std::string>, driver::module*> backend::modules::cache_;
/*-----------------------------------*/
//----------- Kernels --------------*/
/*-----------------------------------*/
void backend::kernels::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
std::tuple<driver::module*, std::string> key(mod, name);
if(cache_.find(key)==cache_.end()){
return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
}
return cache_.at(key);
}
std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
/*-----------------------------------*/
//------------ Queues --------------*/
/*-----------------------------------*/
void backend::streams::init(std::list<driver::context*> const & contexts){
for(driver::context* ctx : contexts)
if(cache_.find(ctx)==cache_.end())
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
}
void backend::streams::release(){
for(auto & x: cache_)
for(auto & y: x.second)
delete y;
cache_.clear();
}
driver::stream* backend::streams::get_default()
{ return get(contexts::get_default(), 0); }
driver::stream* backend::streams::get(driver::context* context, unsigned int id){
init(std::list<driver::context*>(1,context));
for(auto & x : cache_)
if(x.first==context)
return x.second[id];
throw;
}
void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
init(std::list<driver::context*>(1,context));
queues = cache_.at(context);
}
std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
/*-----------------------------------*/
//------------ Contexts ------------*/
/*-----------------------------------*/
void backend::contexts::init(std::vector<driver::device*> const & devices){
for(driver::device* dvc: devices)
cache_.push_back(driver::context::create(dvc));
}
void backend::contexts::release(){
for(auto & x: cache_)
delete x;
cache_.clear();
}
driver::context* backend::contexts::get_default(){
backend::init();
auto it = cache_.begin();
std::advance(it, default_device);
return *it;
}
void backend::contexts::get(std::list<driver::context*> & contexts){
backend::init();
contexts = cache_;
}
std::list<driver::context*> backend::contexts::cache_;
/*-----------------------------------*/
//------------ General -------------*/
/*-----------------------------------*/
void backend::synchronize(driver::context* context){
for(driver::stream * queue: streams::cache_.at(context))
queue->synchronize();
}
void backend::release(){
backend::kernels::release();
// backend::programs::release();
backend::streams::release();
backend::contexts::release();
}
void backend::init(){
if(!contexts::cache_.empty())
return;
// initialize platforms
backend::platforms::init();
// initialize devices
backend::devices::init(platforms::cache_);
// initialize contexts
backend::contexts::init(devices::cache_);
// initialize streams
streams::init(contexts::cache_);
}
unsigned int backend::default_device = 0;
}
}

View File

@@ -1,90 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "triton/driver/stream.h"
#include "triton/driver/buffer.h"
#include "triton/driver/context.h"
#include "triton/driver/dispatch.h"
namespace triton
{
namespace driver
{
//
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
: polymorphic_resource(cu, take_ownership), size_(size) { }
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
: polymorphic_resource(hst, take_ownership), size_(size) { }
size_t buffer::size() {
return size_;
}
uintptr_t buffer::addr_as_uintptr_t() {
switch(backend_){
case CUDA: return *cu_;
case Host: return (uintptr_t)hst_->data;
default: return 0;
}
}
buffer* buffer::create(driver::context* ctx, size_t size) {
switch(ctx->backend()){
case CUDA: return new cu_buffer(size);
case Host: return new host_buffer(size);
default: throw std::runtime_error("unknown backend");
}
}
//
host_buffer::host_buffer(size_t size)
: buffer(size, host_buffer_t(), true){
hst_->data = new char[size];
}
//
cu_buffer::cu_buffer(size_t size)
: buffer(size, CUdeviceptr(), true) {
dispatch::cuMemAlloc(&*cu_, size);
}
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
: buffer(size, cu, take_ownership){
}
void cu_buffer::set_zero(driver::stream* queue, size_t size){
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
}
}
}

View File

@@ -1,118 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include "triton/driver/context.h"
#include "triton/driver/module.h"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
namespace triton
{
namespace driver
{
/* ------------------------ */
// BASE //
/* ------------------------ */
context::context(driver::device *dev, CUcontext cu, bool take_ownership):
polymorphic_resource(cu, take_ownership),
dev_(dev), cache_path_(get_cache_path()) {
}
context::context(driver::device *dev, host_context_t hst, bool take_ownership):
polymorphic_resource(hst, take_ownership),
dev_(dev), cache_path_(get_cache_path()){
}
context* context::create(driver::device *dev){
switch(dev->backend()){
case CUDA: return new cu_context(dev);
case Host: return new host_context(dev);
default: throw std::runtime_error("unknown backend");
}
}
driver::device* context::device() const {
return dev_;
}
std::string context::get_cache_path(){
//user-specified cache path
std::string result = tools::getenv("TRITON_CACHE_PATH");
if(!result.empty()){
if(tools::mkpath(result)==0)
return result;
}
//create in home
result = tools::getenv("HOME");
if(!result.empty())
{
result = result + "/.triton/cache/";
if(tools::mkpath(result)==0)
return result;
}
//couldn't find a directory
return "";
}
std::string const & context::cache_path() const{
return cache_path_;
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
// import CUdevice
CUdevice cu_context::get_device_of(CUcontext context){
dispatch::cuCtxPushCurrent_v2(context);
CUdevice res;
dispatch::cuCtxGetDevice(&res);
dispatch::cuCtxPopCurrent_v2(NULL);
return res;
}
// wrapper for cuda context
cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
context, take_ownership) {
}
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
// dispatch::cuCtxPopCurrent_v2(NULL);
}
}
}

View File

@@ -1,192 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <memory>
#include "triton/driver/device.h"
#include "triton/driver/context.h"
#include "triton/driver/error.h"
#include "triton/codegen/target.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Host //
/* ------------------------ */
std::unique_ptr<codegen::target> host_device::make_target() const {
return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
// information query
template<CUdevice_attribute attr>
int cu_device::cuGetInfo() const{
int res;
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
return res;
}
// convert to nvml
nvmlDevice_t cu_device::nvml_device() const{
std::map<std::string, nvmlDevice_t> map;
std::string key = pci_bus_id();
if(map.find(key)==map.end()){
nvmlDevice_t device;
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
return map.insert(std::make_pair(key, device)).first->second;
}
return map.at(key);
}
// number of address bits
size_t cu_device::address_bits() const{
return sizeof(size_t)*8;
}
// name
std::string cu_device::name() const {
char tmp[128];
dispatch::cuDeviceGetName(tmp, 128, *cu_);
return std::string(tmp);
}
// PCI bus ID
std::string cu_device::pci_bus_id() const{
char tmp[128];
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
return std::string(tmp);
}
// force the device to be interpreted as a particular cc
void cu_device::interpret_as(int cc){
interpreted_as_ = std::make_shared<int>(cc);
}
// compute capability
int cu_device::compute_capability() const {
if(interpreted_as_)
return *interpreted_as_;
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
return major*10 + minor;
}
// maximum number of threads per block
size_t cu_device::max_threads_per_block() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
}
// maximum amount of shared memory per block
size_t cu_device::max_shared_memory() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
}
// warp size
size_t cu_device::warp_size() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
}
// maximum block dimensions
std::vector<size_t> cu_device::max_block_dim() const {
std::vector<size_t> result(3);
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
return result;
}
// current SM clock
size_t cu_device::current_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
// max SM clock
size_t cu_device::max_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
// current memory clock
size_t cu_device::current_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
// max memory clock
size_t cu_device::max_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
// max memory clock
void cu_device::set_max_clock() {
dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
}
void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
CUcontext context;
dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
try {
dispatch::cuCtxEnablePeerAccess(context, 0);
} catch (exception::cuda::peer_access_already_enabled) {}
}
// print infos
std::string cu_device::infos() const{
std::ostringstream oss;
std::vector<size_t> max_wi_sizes = max_block_dim();
oss << "Platform: CUDA" << std::endl;
oss << "Name: " << name() << std::endl;
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
oss << "Local memory size: " << max_shared_memory() << std::endl;
return oss.str();
}
// target
std::unique_ptr<codegen::target> cu_device::make_target() const {
return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
}
}
}

View File

@@ -21,7 +21,6 @@
*/
#include "triton/driver/dispatch.h"
#include "triton/driver/context.h"
#include "triton/tools/sys/getenv.hpp"
namespace triton
@@ -31,65 +30,65 @@ namespace driver
//Helpers for function definition
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
void* dispatch::fname ## _;
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
void* dispatch::fname ## _;
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
void* dispatch::fname ## _;
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
void* dispatch::fname ## _;
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
void* dispatch::fname ## _;
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
void* dispatch::fname ## _;
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
void* dispatch::fname ## _;
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
void* dispatch::fname ## _;
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
void* dispatch::fname ## _;
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
void* dispatch::fname ## _;
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
void* dispatch::fname ## _;
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
void* dispatch::fname ## _;
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
void* dispatch::fname ## _;
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
void* dispatch::fname ## _;
//Specialized helpers for CUDA
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
/* ------------------- *
* CUDA
* ------------------- */
bool dispatch::cuinit(){
if(cuda_==nullptr){
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
return true;
}
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
// context management
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
// device management
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
// link management
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
// module management
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
// stream management
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
// function management
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
// memory management
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
// event management
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
/* ------------------- *
* NVML
* ------------------- */
bool dispatch::nvmlinit(){
if(nvml_==nullptr)
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
return res;
}
//CUDA
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
/* ------------------- *
* HIP
* ------------------- */
bool dispatch::hipinit(){
if(hip_==nullptr)
hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
if(hip_ == nullptr)
return false;
hipError_t (*fptr)();
hipInit_ = dlsym(hip_, "hipInit");
*reinterpret_cast<void **>(&fptr) = hipInit_;
hipError_t res = (*fptr)();
check(res);
return res;
}
#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
// context management
HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
HIP_DEFINE1(hipError_t, hipInit, unsigned int)
HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
// device management
HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
// module management
HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
// stream management
HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
// function management
HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
// memory management
HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
// event management
HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
/* ------------------- *
* COMMON
* ------------------- */
// Release
void dispatch::release(){
@@ -190,61 +291,9 @@ void dispatch::release(){
void* dispatch::cuda_;
void* dispatch::nvml_;
//CUDA
void* dispatch::cuCtxGetCurrent_;
void* dispatch::cuCtxSetCurrent_;
void* dispatch::cuCtxDestroy_v2_;
void* dispatch::cuEventCreate_;
void* dispatch::cuDeviceGet_;
void* dispatch::cuMemcpyDtoH_v2_;
void* dispatch::cuStreamCreate_;
void* dispatch::cuEventElapsedTime_;
void* dispatch::cuMemFree_v2_;
void* dispatch::cuMemcpyDtoHAsync_v2_;
void* dispatch::cuDriverGetVersion_;
void* dispatch::cuDeviceGetName_;
void* dispatch::cuDeviceGetPCIBusId_;
void* dispatch::cuModuleGetGlobal_v2_;
void* dispatch::cuLinkAddData_v2_;
void* dispatch::cuLinkCreate_v2_;
void* dispatch::cuLinkDestroy_;
void* dispatch::cuModuleLoadData_;
void* dispatch::cuLinkComplete_;
void* dispatch::cuMemcpyHtoDAsync_v2_;
void* dispatch::cuModuleLoad_;
void* dispatch::cuLaunchKernel_;
void* dispatch::cuModuleUnload_;
void* dispatch::cuModuleLoadDataEx_;
void* dispatch::cuDeviceGetAttribute_;
void* dispatch::cuDeviceGetCount_;
void* dispatch::cuMemcpyHtoD_v2_;
void* dispatch::cuInit_;
void* dispatch::cuEventRecord_;
void* dispatch::cuCtxCreate_v2_;
void* dispatch::cuModuleGetFunction_;
void* dispatch::cuStreamSynchronize_;
void* dispatch::cuStreamDestroy_v2_;
void* dispatch::cuStreamGetCtx_;
void* dispatch::cuEventDestroy_v2_;
void* dispatch::cuMemAlloc_v2_;
void* dispatch::cuPointerGetAttribute_;
void* dispatch::cuCtxGetDevice_;
void* dispatch::cuMemsetD8Async_;
void* dispatch::cuCtxPushCurrent_v2_;
void* dispatch::cuCtxPopCurrent_v2_;
void* dispatch::cuFuncGetAttribute_;
void* dispatch::cuFuncSetAttribute_;
void* dispatch::cuFuncSetCacheConfig_;
void* dispatch::cuCtxEnablePeerAccess_;
void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::nvmlDeviceSetApplicationsClocks_;
void* dispatch::hip_;
}
}

View File

@@ -94,6 +94,73 @@ void check(CUresult err)
}
}
void check(hipError_t error) {
using namespace exception::hip;
switch(error)
{
case hipSuccess : break;
case hipErrorInvalidValue : throw invalid_value();
case hipErrorMemoryAllocation : throw out_of_memory();
case hipErrorNotInitialized : throw not_initialized();
case hipErrorDeinitialized : throw deinitialized();
case hipErrorProfilerDisabled : throw profiler_disabled();
case hipErrorProfilerNotInitialized : throw profiler_not_initialized();
case hipErrorProfilerAlreadyStarted : throw profiler_already_started();
case hipErrorProfilerAlreadyStopped : throw profiler_already_stopped();
case hipErrorNoDevice : throw no_device();
case hipErrorInvalidSymbol : throw invalid_symbol();
case hipErrorInvalidDevice : throw invalid_device();
case hipErrorInvalidImage : throw invalid_image();
case hipErrorInvalidContext : throw invalid_context();
case hipErrorContextAlreadyCurrent : throw context_already_current();
case hipErrorMapFailed : throw map_failed();
case hipErrorUnmapFailed : throw unmap_failed();
case hipErrorArrayIsMapped : throw array_is_mapped();
case hipErrorAlreadyMapped : throw already_mapped();
case hipErrorNoBinaryForGpu : throw no_binary_for_gpu();
case hipErrorAlreadyAcquired : throw already_acquired();
case hipErrorNotMapped : throw not_mapped();
case hipErrorNotMappedAsArray : throw not_mapped_as_array();
case hipErrorNotMappedAsPointer : throw not_mapped_as_pointer();
case hipErrorECCNotCorrectable : throw ecc_uncorrectable();
case hipErrorUnsupportedLimit : throw unsupported_limit();
case hipErrorContextAlreadyInUse : throw context_already_in_use();
case hipErrorPeerAccessUnsupported : throw peer_access_unsupported();
case hipErrorInvalidKernelFile : throw invalid_ptx();
case hipErrorInvalidGraphicsContext : throw invalid_graphics_context();
case hipErrorInvalidSource : throw invalid_source();
case hipErrorFileNotFound : throw file_not_found();
case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
case hipErrorSharedObjectInitFailed : throw shared_object_init_failed();
case hipErrorOperatingSystem : throw operating_system();
case hipErrorInvalidResourceHandle : throw invalid_handle();
case hipErrorNotFound : throw not_found();
case hipErrorNotReady : throw not_ready();
case hipErrorIllegalAddress : throw illegal_address();
case hipErrorLaunchOutOfResources : throw launch_out_of_resources();
case hipErrorLaunchTimeOut : throw launch_timeout();
// case hipErrorLaunchIncompatibleTexturing : throw launch_incompatible_texturing();
case hipErrorPeerAccessAlreadyEnabled : throw peer_access_already_enabled();
case hipErrorPeerAccessNotEnabled : throw peer_access_not_enabled();
// case hipErrorPrimaryContextActive : throw primary_context_active();
// case hipErrorContextIsDestroyed : throw context_is_destroyed();
case hipErrorAssert : throw assert_error();
// case hipErrorTooManyPeers : throw too_many_peers();
case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
case hipErrorHostMemoryNotRegistered : throw host_memory_not_registered();
// case hipErrorHardwareStackError : throw hardware_stack_error();
// case hipErrorIllegalInstruction : throw illegal_instruction();
// case hipErrorMisalignedAddress : throw misaligned_address();
// case hipErrorInvalidAddressSpace : throw invalid_address_space();
// case hipErrorInvalidPc : throw invalid_pc();
case hipErrorLaunchFailure : throw launch_failed();
// case hipErrorNotPermitted : throw not_permitted();
case hipErrorNotSupported : throw not_supported();
case hipErrorUnknown : throw unknown();
default : throw unknown();
}
}
}
}

View File

@@ -1,91 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "triton/driver/handle.h"
#include "triton/driver/error.h"
namespace triton
{
namespace driver
{
//Host
inline void _delete(host_platform_t) { }
inline void _delete(host_device_t) { }
inline void _delete(host_context_t) { }
inline void _delete(host_module_t) { }
inline void _delete(host_stream_t) { }
inline void _delete(host_buffer_t x) { if(x.data) delete[] x.data; }
inline void _delete(host_function_t) { }
//CUDA
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
inline void _delete(CUdevice) { }
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
inline void _delete(CUfunction) { }
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
inline void _delete(CUPlatform){}
//Constructor
template<class T>
handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
{ }
template<class T>
handle<T>::handle(): has_ownership_(false){ }
template<class T>
handle<T>::~handle(){
try{
if(has_ownership_ && h_ && h_.unique())
_delete(*h_);
}catch(const exception::cuda::base&){
// order of destruction for global variables
// is not guaranteed
}
}
template class handle<CUdeviceptr>;
template class handle<CUstream>;
template class handle<CUcontext>;
template class handle<CUdevice>;
template class handle<cu_event_t>;
template class handle<CUfunction>;
template class handle<CUmodule>;
template class handle<CUPlatform>;
template class handle<host_platform_t>;
template class handle<host_device_t>;
template class handle<host_context_t>;
template class handle<host_module_t>;
template class handle<host_stream_t>;
template class handle<host_buffer_t>;
template class handle<host_function_t>;
}
}

View File

@@ -1,94 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string.h>
#include "triton/driver/kernel.h"
#include "triton/driver/buffer.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
polymorphic_resource(fn, has_ownership), program_(program){
}
kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
polymorphic_resource(fn, has_ownership), program_(program){
}
kernel* kernel::create(driver::module* program, const char* name) {
switch(program->backend()){
case CUDA: return new cu_kernel(program, name);
case Host: return new host_kernel(program, name);
default: throw std::runtime_error("unknown backend");
}
}
driver::module* kernel::module() {
return program_;
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) {
hst_->fn = program->hst()->functions.at(name);
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) {
dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);
// properties
int shared_total, shared_optin, shared_static;
int n_spills, n_reg;
CUdevice dev;
dispatch::cuCtxGetDevice(&dev);
dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, *cu_);
dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
// std::cout << n_reg << std::endl;
if (shared_optin > 49152){
// std::cout << "dynamic shared memory " << shared_optin << " " << shared_static << std::endl;
dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
}
}
}
}

324
lib/driver/llvm.cc Normal file
View File

@@ -0,0 +1,324 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <fstream>
#include <unistd.h>
#include <memory>
#include <regex>
#include "triton/driver/llvm.h"
#include "triton/driver/dispatch.h"
#include "triton/driver/error.h"
#include "triton/tools/sha1.hpp"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
#include "triton/tools/sys/exec.hpp"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Transforms/Utils/Cloning.h"
// begin AMD stuff
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
// end AMD stuff
namespace triton{
namespace driver{
void init_llvm() {
static bool init = false;
if(!init){
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
LLVMInitializeAMDGPUTargetInfo();
LLVMInitializeAMDGPUTarget();
LLVMInitializeAMDGPUTargetMC();
LLVMInitializeAMDGPUAsmPrinter();
init = true;
}
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
size_t start_replace = str.find(begin);
size_t end_replace = str.find(end, start_replace);
if(start_replace == std::string::npos)
return false;
str.replace(start_replace, end_replace + 1 - start_replace, target);
return true;
}
int vptx(int version){
if(version >= 11030) return 73;
if(version >= 11020) return 72;
if(version >= 11010) return 71;
if(version >= 11000) return 70;
if(version >= 10020) return 65;
if(version >= 10010) return 64;
if(version >= 10000) return 63;
throw std::runtime_error("Triton requires CUDA 10+");
}
std::string llir_to_ptx(llvm::Module* module, int cc, int version){
// LLVM version in use may not officially support target hardware
int max_nvvm_cc = 75;
int max_nvvm_ptx = 64;
// options
auto options = llvm::cl::getRegisteredOptions();
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
assert(short_ptr);
short_ptr->setValue(true);
// compute capability
std::string sm = "sm_" + std::to_string(cc);
// max PTX version
int ptx = vptx(version);
int ptx_major = ptx / 10;
int ptx_minor = ptx % 10;
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "nvptx64-nvidia-cuda";
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
std::string layout = "";
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
init_llvm();
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// emit
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
pass.run(*module);
// post-process
std::string result(buffer.begin(), buffer.end());
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
return result;
}
CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
// JIT compile source-code
try{
// use ptxas if present in PATH. Otherwise, use JIT from the driver
std::string ptxas = "ptxas";
std::string version;
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
// Use PTXAS via system call
if(use_system_ptxas){
// compile ptx with ptxas
char _fsrc[] = "/tmp/triton_k_XXXXXX";
char _flog[] = "/tmp/triton_l_XXXXXX";
mkstemp(_fsrc);
mkstemp(_flog);
std::string fsrc = _fsrc;
std::string flog = _flog;
std::ofstream ofs(fsrc);
ofs << ptx;
ofs.close();
std::string cmd;
int err;
cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
err = system(cmd.c_str());
CUmodule ret;
dispatch::cuModuleLoad(&ret, (fsrc + ".o").c_str());
unlink(_fsrc);
unlink(_flog);
return ret;
}
// Use PTXAS included in driver
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
CU_JIT_LOG_VERBOSE};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
CUmodule ret;
dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
return ret;
}
catch(exception::cuda::invalid_ptx const &){
std::cout << ptx << std::endl;
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
throw;
}
}
/* ------------------------ */
// HIP //
/* ------------------------ */
std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
init_llvm();
// proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
// features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "amdgcn-amd-amdhsa";
std::string layout = "";
std::string features;
std::string proc = "gfx908";
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None,
llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// create dump files
std::string module_name = module->getModuleIdentifier();
std::error_code ec;
// Save GCN ISA binary.
std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
if (ec)
{
std::cout << isabin_path << " was not created. error code: " << ec << std::endl;
}
// emit
machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile);
pass.run(*module);
// Save GCN ISA.
std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
std::string result(buffer.begin(), buffer.end());
std::ofstream amdgcn(amdgcn_path);
amdgcn << result;
amdgcn.close();
// generate HASCO file
std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
std::string error_message;
int lld_result =
llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
{"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
llvm::None, {}, 0, 0, &error_message);
if (lld_result)
{
std::cout << "ld.lld execute fail: " << std::endl;
std::cout << error_message << std::endl;
std::cout << lld_result << std::endl;
}
return hsaco_path;
}
hipModule_t amdgpu_to_hipmodule(const std::string& path) {
// Read HSACO.
std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
std::vector<unsigned char> hsaco(hsaco_file_size);
hsaco_file.seekg(0, std::ios::beg);
hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
hsaco_file.close();
hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
hipJitOptionLogVerbose};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize,
(void*)_err, (void*)(uintptr_t)logbufsize,
(void*)_log, (void*)1};
hipModule_t ret;
dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
return ret;
}
}
}

View File

@@ -1,375 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <fstream>
#include <unistd.h>
#include <memory>
#include <regex>
#include "triton/driver/module.h"
#include "triton/driver/context.h"
#include "triton/driver/error.h"
#include "triton/tools/sha1.hpp"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
#include "triton/tools/sys/exec.hpp"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Transforms/Utils/Cloning.h"
std::string exec(const char* cmd) {
std::array<char, 128> buffer;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
if (!pipe) {
throw std::runtime_error("popen() failed!");
}
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
result += buffer.data();
}
return result;
}
void LLVMInitializeNVPTXTargetInfo();
void LLVMInitializeNVPTXTarget();
void LLVMInitializeNVPTXTargetMC();
void LLVMInitializeNVPTXAsmPrinter();
void LLVMInitializeNVPTXAsmParser();
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
void module::init_llvm() {
static bool init = false;
if(!init){
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
init = true;
}
}
module::module(CUmodule mod, bool has_ownership)
: polymorphic_resource(mod, has_ownership), spilled_(0) {
}
module::module(host_module_t mod, bool has_ownership)
: polymorphic_resource(mod, has_ownership), spilled_(0) {
}
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
switch(device->backend()){
case CUDA: return new cu_module(device, std::move(src));
case Host: return new host_module(std::move(src));
default: throw std::runtime_error("unknown backend");
}
}
void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
const std::string &proc, std::string layout,
llvm::SmallVectorImpl<char> &buffer,
const std::string& features,
file_type_t ft) {
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
throw std::runtime_error("CPU unsupported");
// init_llvm();
// // create kernel wrapper
// llvm::LLVMContext &ctx = src->getContext();
// llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
// llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
// llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
// std::vector<llvm::Type*> tys = {args_ty, int32_ty, int32_ty, int32_ty};
// llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, tys, false);
// llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "_main", &*src);
// llvm::Function* fn = &*src->getFunctionList().begin();
// llvm::FunctionType *fn_ty = fn->getFunctionType();
// std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
// std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
// llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
// llvm::IRBuilder<> ir_builder(ctx);
// ir_builder.SetInsertPoint(entry);
// auto get_size = [](llvm::Type* ty) { return ty->isPointerTy() ? sizeof(char*) : ty->getPrimitiveSizeInBits() / 8; };
// llvm::Value* base = main->arg_begin();
// llvm::Value* args_base = ir_builder.CreateBitCast(base, base->getType()->getPointerElementType());
// size_t offset = 0;
// for(unsigned i = 0; i < ptrs.size(); i++){
// ptrs[i] = ir_builder.CreateGEP(args_base, ir_builder.getInt32(offset));
// size_t nbytes = get_size(fn_ty->getParamType(i));
// offset += nbytes;
// if(i < ptrs.size() - 1){
// size_t np1bytes = get_size(fn_ty->getParamType(i+1));
// offset = (offset + np1bytes - 1) / np1bytes * np1bytes;
// }
// }
// for(unsigned i = 0; i < ptrs.size(); i++)
// ptrs[i] = ir_builder.CreateBitCast(ptrs[i], fn_ty->getParamType(i)->getPointerTo());
// for(unsigned i = 0; i < ptrs.size(); i++)
// fn_args[i] = ir_builder.CreateLoad(ptrs[i]);
// fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
// fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
// fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
// ir_builder.CreateCall(fn, fn_args);
// ir_builder.CreateRetVoid();
//// llvm::legacy::PassManager pm;
//// pm.add(llvm::createPrintModulePass(llvm::outs()));
//// pm.add(llvm::createVerifierPass());
//// pm.run(*src);
//// create execution engine
// for(llvm::Function& fn: src->functions())
// hst_->functions[fn.getName().str()] = &fn;
//// llvm::orc::JITTargetMachineBuilder JTMB = *llvm::orc::JITTargetMachineBuilder::detectHost();
//// auto DL = JTMB.getDefaultDataLayoutForTarget();
//// auto CIRC = std::unique_ptr<llvm::orc::ConcurrentIRCompiler>(new llvm::orc::ConcurrentIRCompiler(JTMB));
//// hst_->ES = new llvm::orc::ExecutionSession();
//// hst_->ObjectLayer = new llvm::orc::RTDyldObjectLinkingLayer(*hst_->ES, []() { return std::unique_ptr<llvm::SectionMemoryManager>(new llvm::SectionMemoryManager()); });
//// hst_->CompileLayer = new llvm::orc::IRCompileLayer(*hst_->ES, *hst_->ObjectLayer, *CIRC);
//// hst_->DL = new llvm::DataLayout(std::move(*DL));
//// hst_->Mangle = new llvm::orc::MangleAndInterner(*hst_->ES, *hst_->DL);
//// hst_->Ctx = new llvm::orc::ThreadSafeContext(std::unique_ptr<llvm::LLVMContext>(new llvm::LLVMContext()));
//// hst_->MainJD = &hst_->ES->createJITDylib("<main>");
//// hst_->MainJD->setGenerator(llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
//// hst_->DL->getGlobalPrefix())));
//// llvm::cantFail(hst_->CompileLayer->add(*hst_->MainJD, llvm::orc::ThreadSafeModule(std::move(src), *hst_->Ctx)));
//// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->ES->lookup({hst_->MainJD}, (*hst_->Mangle)("_main"))->getAddress());
// llvm::EngineBuilder builder(std::move(src));
// builder.setErrorStr(&hst_->error);
// builder.setMCJITMemoryManager(std::make_unique<llvm::SectionMemoryManager>());
// builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
// builder.setEngineKind(llvm::EngineKind::JIT);
// hst_->engine = builder.create();
// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->engine->getFunctionAddress("_main"));
}
std::unique_ptr<buffer> host_module::symbol(const char *name) const {
throw std::runtime_error("not implemented");
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
size_t start_replace = str.find(begin);
size_t end_replace = str.find(end, start_replace);
if(start_replace == std::string::npos)
return false;
str.replace(start_replace, end_replace + 1 - start_replace, target);
return true;
}
//static std::map<int, int> vptx = {
// {10000, 63},
// {10010, 64},
// {10020, 65},
// {11000, 70},
// {11010, 71},
// {11020, 72},
// {11030, 73},
// {11040, 73}
//};
int vptx(int version){
if(version >= 11030) return 73;
if(version >= 11020) return 72;
if(version >= 11010) return 71;
if(version >= 11000) return 70;
if(version >= 10020) return 65;
if(version >= 10010) return 64;
if(version >= 10000) return 63;
throw std::runtime_error("Triton requires CUDA 10+");
}
std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
// LLVM version in use may not officially support target hardware
int max_nvvm_cc = 75;
int max_nvvm_ptx = 64;
// options
auto options = llvm::cl::getRegisteredOptions();
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
assert(short_ptr);
short_ptr->setValue(true);
// compute capability
int cc = ((driver::cu_device*)device)->compute_capability();
std::string sm = "sm_" + std::to_string(cc);
// driver version
int version;
dispatch::cuDriverGetVersion(&version);
int ptx = vptx(version);
int ptx_major = ptx / 10;
int ptx_minor = ptx % 10;
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "nvptx64-nvidia-cuda";
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
std::string layout = "";
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
init_llvm();
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// emit
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
pass.run(*module);
// post-process
std::string result(buffer.begin(), buffer.end());
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
return result;
}
void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
// JIT compile source-code
try{
// use ptxas if present in PATH. Otherwise, use JIT from the driver
std::string ptxas = "ptxas";
std::string version;
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
// Use PTXAS via system call
if(use_system_ptxas){
// compile ptx with ptxas
char _fsrc[] = "/tmp/triton_k_XXXXXX";
char _flog[] = "/tmp/triton_l_XXXXXX";
mkstemp(_fsrc);
mkstemp(_flog);
std::string fsrc = _fsrc;
std::string flog = _flog;
std::ofstream ofs(fsrc);
ofs << ptx;
ofs.close();
std::string cmd;
int err;
std::string cc = std::to_string(device->compute_capability());
cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
err = system(cmd.c_str());
dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
unlink(_fsrc);
unlink(_flog);
return;
}
// Use PTXAS included in driver
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
CU_JIT_LOG_VERBOSE};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
}
catch(exception::cuda::invalid_ptx const &){
//#ifdef TRITON_LOG_PTX_ERROR
std::cout << ptx << std::endl;
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
// exit(1);
//#endif
throw;
}
}
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
llvm::raw_string_ostream oss(llir_);
oss << *ll_module;
oss.flush();
ptx_ = compile_llvm_module(ll_module.get(), device);
init_from_ptx(ptx_, (driver::cu_device*)device);
}
cu_module::cu_module(driver::device* device, std::string const & source) : module(CUmodule(), true), ptx_(source){
init_from_ptx(ptx_, (driver::cu_device*)device);
}
std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
CUdeviceptr handle;
size_t size;
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
return std::move(res);
}
}
}

View File

@@ -1,68 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string>
#include "triton/driver/platform.h"
#include "triton/driver/device.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// CUDA //
/* ------------------------ */
std::string cu_platform::version() const{
int version;
dispatch::cuDriverGetVersion(&version);
return std::to_string(version);
}
void cu_platform::devices(std::vector<device *> &devices) const{
int N;
dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){
CUdevice dvc;
dispatch::cuDeviceGet(&dvc, i);
devices.push_back(new driver::cu_device(dvc));
}
}
/* ------------------------ */
// Host //
/* ------------------------ */
std::string host_platform::version() const {
return "1.0";
}
void host_platform::devices(std::vector<driver::device*> &devices) const {
devices.push_back(new driver::host_device());
}
}
}

View File

@@ -1,142 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <unistd.h>
#include <array>
#include "triton/driver/backend.h"
#include "triton/driver/stream.h"
#include "triton/driver/context.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/buffer.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
stream::stream(CUstream cu, bool has_ownership)
: polymorphic_resource(cu, has_ownership) {
}
stream::stream(host_stream_t cl, bool has_ownership)
: polymorphic_resource(cl, has_ownership) {
}
driver::stream* stream::create(backend_t backend) {
switch(backend){
case CUDA: return new cu_stream();
case Host: return new host_stream();
default: throw std::runtime_error("unknown backend");
}
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_stream::host_stream(): stream(host_stream_t(), true) {
hst_->pool.reset(new ThreadPool(1));
hst_->futures.reset(new std::vector<std::future<void>>());
}
void host_stream::synchronize() {
for(auto& x: *hst_->futures)
x.wait();
hst_->futures->clear();
hst_->args.clear();
}
void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
auto hst = kernel->module()->hst();
hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
char* params = new char[args_size];
std::memcpy((void*)params, (void*)args, args_size);
for(size_t i = 0; i < grid[0]; i++)
for(size_t j = 0; j < grid[1]; j++)
for(size_t k = 0; k < grid[2]; k++)
hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
}
void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
std::memcpy((void*)buffer->hst()->data, ptr, size);
}
void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
std::memcpy(ptr, (const void*)buffer->hst()->data, size);
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
cu_stream::cu_stream(CUstream str, bool take_ownership):
stream(str, take_ownership) {
}
cu_stream::cu_stream(): stream(CUstream(), true) {
dispatch::cuStreamCreate(&*cu_, 0);
}
void cu_stream::synchronize() {
dispatch::cuStreamSynchronize(*cu_);
}
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
void *config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END
};
dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem, *cu_, nullptr, config);
}
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
if(blocking)
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
else
dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
}
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
if(blocking)
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
else
dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
}
}
}