[GENERAL] Removed deprecated driver files and added basic compatibility with rocm (#268)

- Removed the driver module; the accelerator runtime is handled by PyTorch.
- Added basic support for ROCm based on @micmelesse's PR; an empty kernel can now be executed on AMD devices without any compile-time changes.
- PREFER_SHARED is now only used for kernels whose shared-memory size is greater than 49k; otherwise there can be poor L1 performance for broadcast tensors.
2021-09-09 00:04:28 -07:00
parent 8bedcce9be
commit 94c83d30ce
47 changed files with 1376 additions and 30232 deletions
--- a/lib/codegen/pass.cc
+++ b/lib/codegen/pass.cc
@@ -13,45 +13,40 @@
 #include "triton/codegen/transform/peephole.h"
 #include "triton/codegen/transform/pipeline.h"
 #include "triton/codegen/transform/prefetch.h"
-#include "triton/driver/device.h"
-#include "triton/driver/kernel.h"
-#include "triton/driver/module.h"
 #include "triton/ir/function.h"
 #include "triton/ir/module.h"
 #include "triton/ir/print.h"
 #include "llvm/IR/Module.h"
-
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
 namespace triton {
 namespace codegen {

 // TODO:
 // There should be a proper pass manager there!
-void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache,
-                            driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) {
+std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
+                                                     int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
  // generate llvm code
-  llvm::LLVMContext ctx;
  std::string name = ir.get_function_list()[0]->get_name();
  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
  // optimizations
-  std::unique_ptr<codegen::target> target = dev->make_target();
-  bool cts_use_async = target->as_nvidia()->sm() >= 80;
+  bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
  // create passes
  codegen::analysis::align align;
  codegen::analysis::axes axes;
  codegen::transform::cts cts(cts_use_async);
  codegen::transform::pipeline pipeline(cts_use_async, num_stages);
  codegen::transform::disassociate disassociate;
-  codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get());
+  codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
  codegen::analysis::liveness liveness(&layouts);
-  codegen::analysis::swizzle swizzle(&layouts, target.get());
+  codegen::analysis::swizzle swizzle(&layouts, target);
  codegen::analysis::allocation allocation(&liveness);
  codegen::transform::dce dce;
-  codegen::transform::peephole peephole(target.get(), &layouts);
-//  codegen::transform::reassociate reassociate;
+  codegen::transform::peephole peephole(target, &layouts);
  codegen::transform::coalesce coalesce(&align, &layouts);
-  codegen::transform::prefetch prefetch_s(target.get());
-  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get());
-  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache);
+  codegen::transform::prefetch prefetch_s(target);
+  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
+  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
  // run passes
  dce.run(ir);
  peephole.run(ir);
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  layouts.run(ir);
  coalesce.run(ir);
  dce.run(ir);
-//  exit(1);
-
  align.run(ir);
  dce.run(ir);
  if (target->is_gpu())
    cts.run(ir);
  dce.run(ir);
  align.run(ir);
-//  ir::print(ir, std::cout);
  axes.run(ir);
  layouts.run(ir);
  peephole.run(ir);
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  allocation.run(ir);
  prefetch_s.run(ir);
  barriers.run(ir);
-  // ir.print(std::cout);
  isel.visit(ir, *llvm);
-  mod = driver::module::create(dev, std::move(llvm));
-  ker = driver::kernel::create(&*mod, name.c_str());
-  shared_mem = allocation.allocated_size();
+  shared_static = allocation.allocated_size();
+  return llvm;
 }

 } // namespace codegen
--- a/lib/driver/backend.cc
+++ b/lib/driver/backend.cc
@@ -1,231 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <vector>
-#include <stdexcept>
-#include "triton/driver/dispatch.h"
-#include "triton/driver/backend.h"
-#include "triton/driver/buffer.h"
-#include "triton/driver/context.h"
-#include "triton/driver/stream.h"
-#include "triton/driver/kernel.h"
-
-
-namespace triton
-{
-
-namespace driver
-{
-
-/*-----------------------------------*/
-//-----------  Platforms ------------*/
-/*-----------------------------------*/
-
-void backend::platforms::init() {
-  if(!cache_.empty())
-    return;
-  //if CUDA is here
-  if(dispatch::cuinit()){
-    cache_.push_back(new cu_platform());
-  }
-  //if host should be added
-  bool host_visible = true;
-  if(host_visible){
-    cache_.push_back(new host_platform());
-  }
-
-//  //if OpenCL is here
-//  if(dispatch::clinit()){
-//    cl_uint num_platforms;
-//    dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
-//    std::vector<cl_platform_id> ids(num_platforms);
-//    dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
-//    for(cl_platform_id id: ids)
-//      cache_.push_back(new cl_platform(id));
-//  }
-
-  if(cache_.empty())
-    throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
-}
-
-void backend::platforms::get(std::vector<platform *> &results) {
-  std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
-}
-
-std::vector<driver::platform*> backend::platforms::cache_;
-
-
-/*-----------------------------------*/
-//-----------  Devices --------------*/
-/*-----------------------------------*/
-
-void backend::devices::init(std::vector<platform*> const & platforms) {
-  if(!cache_.empty())
-    return;
-  for(driver::platform* pf: platforms)
-    pf->devices(cache_);
-  if(cache_.empty())
-    throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
-}
-
-void backend::devices::get(std::vector<device*> &devs) {
-  std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
-}
-
-std::vector<driver::device*> backend::devices::cache_;
-
-
-
-/*-----------------------------------*/
-//---------- Modules ----------------*/
-/*-----------------------------------*/
-
-void backend::modules::release(){
-  for(auto & x: cache_)
-    delete x.second;
-  cache_.clear();
-}
-
-std::map<std::tuple<driver::stream*, std::string>, driver::module*>  backend::modules::cache_;
-
-/*-----------------------------------*/
-//-----------  Kernels --------------*/
-/*-----------------------------------*/
-
-void backend::kernels::release(){
-  for(auto & x: cache_)
-    delete x.second;
-  cache_.clear();
-}
-
-driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
-  std::tuple<driver::module*, std::string> key(mod, name);
-  if(cache_.find(key)==cache_.end()){
-    return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
-  }
-  return cache_.at(key);
-}
-
-std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
-
-/*-----------------------------------*/
-//------------  Queues --------------*/
-/*-----------------------------------*/
-
-void backend::streams::init(std::list<driver::context*> const & contexts){
-  for(driver::context* ctx : contexts)
-    if(cache_.find(ctx)==cache_.end())
-      cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
-}
-
-void backend::streams::release(){
-  for(auto & x: cache_)
-    for(auto & y: x.second)
-      delete y;
-  cache_.clear();
-}
-
-driver::stream* backend::streams::get_default()
-{ return get(contexts::get_default(), 0); }
-
-driver::stream* backend::streams::get(driver::context* context, unsigned int id){
-  init(std::list<driver::context*>(1,context));
-  for(auto & x : cache_)
-    if(x.first==context)
-      return x.second[id];
-  throw;
-}
-
-void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
-  init(std::list<driver::context*>(1,context));
-  queues = cache_.at(context);
-}
-
-std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
-
-/*-----------------------------------*/
-//------------  Contexts ------------*/
-/*-----------------------------------*/
-
-void backend::contexts::init(std::vector<driver::device*> const & devices){
-  for(driver::device* dvc: devices)
-    cache_.push_back(driver::context::create(dvc));
-}
-
-void backend::contexts::release(){
-  for(auto & x: cache_)
-    delete x;
-  cache_.clear();
-}
-
-driver::context* backend::contexts::get_default(){
-  backend::init();
-  auto it = cache_.begin();
-  std::advance(it, default_device);
-  return *it;
-}
-
-void backend::contexts::get(std::list<driver::context*> & contexts){
-  backend::init();
-  contexts = cache_;
-}
-
-std::list<driver::context*> backend::contexts::cache_;
-
-
-
-/*-----------------------------------*/
-//------------  General -------------*/
-/*-----------------------------------*/
-
-void backend::synchronize(driver::context* context){
-  for(driver::stream * queue: streams::cache_.at(context))
-    queue->synchronize();
-}
-
-
-void backend::release(){
-  backend::kernels::release();
-//  backend::programs::release();
-  backend::streams::release();
-  backend::contexts::release();
-}
-
-
-void backend::init(){
-  if(!contexts::cache_.empty())
-    return;
-  // initialize platforms
-  backend::platforms::init();
-  // initialize devices
-  backend::devices::init(platforms::cache_);
-  // initialize contexts
-  backend::contexts::init(devices::cache_);
-  // initialize streams
-  streams::init(contexts::cache_);
-}
-
-unsigned int backend::default_device = 0;
-
-}
-
-}
--- a/lib/driver/buffer.cc
+++ b/lib/driver/buffer.cc
@@ -1,90 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include "triton/driver/stream.h"
-#include "triton/driver/buffer.h"
-#include "triton/driver/context.h"
-#include "triton/driver/dispatch.h"
-
-
-namespace triton
-{
-
-namespace driver
-{
-
-
-//
-
-buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
-  : polymorphic_resource(cu, take_ownership), size_(size) { }
-
-buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
-  : polymorphic_resource(hst, take_ownership), size_(size) { }
-
-size_t buffer::size() {
-  return size_;
-}
-
-uintptr_t buffer::addr_as_uintptr_t() {
-  switch(backend_){
-    case CUDA: return *cu_;
-    case Host: return (uintptr_t)hst_->data;
-    default: return 0;
-  }
-}
-
-
-buffer* buffer::create(driver::context* ctx, size_t size) {
-  switch(ctx->backend()){
-  case CUDA: return new cu_buffer(size);
-  case Host: return new host_buffer(size);
-  default: throw std::runtime_error("unknown backend");
-  }
-}
-
-//
-
-host_buffer::host_buffer(size_t size)
-  :  buffer(size, host_buffer_t(), true){
-  hst_->data = new char[size];
-}
-
-
-//
-
-cu_buffer::cu_buffer(size_t size)
-  : buffer(size, CUdeviceptr(), true) {
-  dispatch::cuMemAlloc(&*cu_, size);
-}
-
-cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
-  : buffer(size, cu, take_ownership){
-}
-
-void cu_buffer::set_zero(driver::stream* queue, size_t size){
-  dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
-}
-
-}
-
-}
--- a/lib/driver/context.cc
+++ b/lib/driver/context.cc
@@ -1,118 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <cassert>
-#include "triton/driver/context.h"
-#include "triton/driver/module.h"
-#include "triton/tools/sys/getenv.hpp"
-#include "triton/tools/sys/mkdir.hpp"
-
-namespace triton
-{
-
-namespace driver
-{
-
-/* ------------------------ */
-//         BASE             //
-/* ------------------------ */
-
-context::context(driver::device *dev, CUcontext cu, bool take_ownership):
-  polymorphic_resource(cu, take_ownership),
-  dev_(dev), cache_path_(get_cache_path()) {
-}
-
-context::context(driver::device *dev, host_context_t hst, bool take_ownership):
-  polymorphic_resource(hst, take_ownership),
-  dev_(dev), cache_path_(get_cache_path()){
-}
-
-context* context::create(driver::device *dev){
-  switch(dev->backend()){
-  case CUDA: return new cu_context(dev);
-  case Host: return new host_context(dev);
-  default: throw std::runtime_error("unknown backend");
-  }
-}
-
-
-driver::device* context::device() const {
-  return dev_;
-}
-
-std::string context::get_cache_path(){
-  //user-specified cache path
-  std::string result = tools::getenv("TRITON_CACHE_PATH");
-  if(!result.empty()){
-    if(tools::mkpath(result)==0)
-      return result;
-  }
-  //create in home
-  result = tools::getenv("HOME");
-  if(!result.empty())
-  {
-    result = result + "/.triton/cache/";
-    if(tools::mkpath(result)==0)
-      return result;
-  }
-  //couldn't find a directory
-  return "";
-}
-
-std::string const & context::cache_path() const{
-  return cache_path_;
-}
-
-/* ------------------------ */
-//         Host             //
-/* ------------------------ */
-
-host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
-
-}
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-
-// import CUdevice
-CUdevice cu_context::get_device_of(CUcontext context){
-  dispatch::cuCtxPushCurrent_v2(context);
-  CUdevice res;
-  dispatch::cuCtxGetDevice(&res);
-  dispatch::cuCtxPopCurrent_v2(NULL);
-  return res;
-}
-
-// wrapper for cuda context
-cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
-                                                                                context, take_ownership) {
-}
-
-cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
-  dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
-//  dispatch::cuCtxPopCurrent_v2(NULL);
-}
-
-
-}
-}
--- a/lib/driver/device.cc
+++ b/lib/driver/device.cc
@@ -1,192 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <map>
-#include <algorithm>
-#include <sstream>
-#include <cstring>
-#include <memory>
-#include "triton/driver/device.h"
-#include "triton/driver/context.h"
-#include "triton/driver/error.h"
-#include "triton/codegen/target.h"
-
-namespace triton
-{
-
-namespace driver
-{
-
-/* ------------------------ */
-//          Host            //
-/* ------------------------ */
-
-std::unique_ptr<codegen::target> host_device::make_target() const {
-  return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
-}
-
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-
-// information query
-template<CUdevice_attribute attr>
-int cu_device::cuGetInfo() const{
-  int res;
-  dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
-  return res;
-}
-
-// convert to nvml
-nvmlDevice_t cu_device::nvml_device() const{
-  std::map<std::string, nvmlDevice_t> map;
-  std::string key = pci_bus_id();
-  if(map.find(key)==map.end()){
-    nvmlDevice_t device;
-    dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
-    return map.insert(std::make_pair(key, device)).first->second;
-  }
-  return map.at(key);
-}
-
-// number of address bits
-size_t cu_device::address_bits() const{
-  return sizeof(size_t)*8;
-}
-
-// name
-std::string cu_device::name() const {
-    char tmp[128];
-    dispatch::cuDeviceGetName(tmp, 128, *cu_);
-    return std::string(tmp);
-}
-
-// PCI bus ID
-std::string cu_device::pci_bus_id() const{
-  char tmp[128];
-  dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
-  return std::string(tmp);
-}
-
-// force the device to be interpreted as a particular cc
-void cu_device::interpret_as(int cc){
-  interpreted_as_ = std::make_shared<int>(cc);
-}
-
-// compute capability
-int cu_device::compute_capability() const {
-  if(interpreted_as_)
-    return *interpreted_as_;
-  size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
-  size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
-  return major*10 + minor;
-}
-
-// maximum number of threads per block
-size_t cu_device::max_threads_per_block() const {
-  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
-}
-
-// maximum amount of shared memory per block
-size_t cu_device::max_shared_memory() const {
-  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
-}
-
-// warp size
-size_t cu_device::warp_size() const {
-  return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
-}
-
-
-// maximum block dimensions
-std::vector<size_t> cu_device::max_block_dim() const {
-  std::vector<size_t> result(3);
-  result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
-  result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
-  result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
-  return result;
-}
-
-// current SM clock
-size_t cu_device::current_sm_clock() const{
-  unsigned int result;
-  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
-  return result;
-}
-
-// max SM clock
-size_t cu_device::max_sm_clock() const{
-  unsigned int result;
-  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
-  return result;
-}
-
-// current memory clock
-size_t cu_device::current_mem_clock() const{
-  unsigned int result;
-  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
-  return result;
-}
-
-// max memory clock
-size_t cu_device::max_mem_clock() const{
-  unsigned int result;
-  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
-  return result;
-}
-
-// max memory clock
-void cu_device::set_max_clock() {
-  dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
-}
-
-void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
-  CUcontext context;
-  dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
-  try {
-    dispatch::cuCtxEnablePeerAccess(context, 0);
-  } catch (exception::cuda::peer_access_already_enabled) {}
-}
-
-// print infos
-std::string cu_device::infos() const{
-  std::ostringstream oss;
-  std::vector<size_t> max_wi_sizes = max_block_dim();
-  oss << "Platform: CUDA" << std::endl;
-  oss << "Name: " << name() << std::endl;
-  oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
-  oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
-  oss << "Local memory size: " << max_shared_memory() << std::endl;
-  return oss.str();
-}
-
-// target
-std::unique_ptr<codegen::target> cu_device::make_target() const {
-  return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
-}
-
-
-}
-
-}
-
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -21,7 +21,6 @@
 */

 #include "triton/driver/dispatch.h"
-#include "triton/driver/context.h"
 #include "triton/tools/sys/getenv.hpp"

 namespace triton
@@ -31,65 +30,65 @@ namespace driver

 //Helpers for function definition
 #define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
+void* dispatch::fname ## _;

 #define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
+void* dispatch::fname ## _;

 #define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
+void* dispatch::fname ## _;

 #define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
+void* dispatch::fname ## _;

 #define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
+void* dispatch::fname ## _;

 #define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
+void* dispatch::fname ## _;

 #define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
+void* dispatch::fname ## _;

 #define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
+void* dispatch::fname ## _;

 #define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
+void* dispatch::fname ## _;

 #define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
+void* dispatch::fname ## _;

 #define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
+void* dispatch::fname ## _;

 #define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
+void* dispatch::fname ## _;

 #define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
+void* dispatch::fname ## _;

 #define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
+void* dispatch::fname ## _;

-//Specialized helpers for CUDA
-#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
-#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
-#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
-#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
-#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
-#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
-
-#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
-#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
-#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
-#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)

+/* ------------------- *
+ * CUDA
+ * ------------------- */

 bool dispatch::cuinit(){
  if(cuda_==nullptr){
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
  return true;
 }

+// CUDA wrappers: each forwards to the matching DEFINE<N> macro, passing cuinit and cuda_ ahead of the signature.
+#define CUDA_DEFINE1(ret, fname, ...) DEFINE1(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE2(ret, fname, ...) DEFINE2(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE3(ret, fname, ...) DEFINE3(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE4(ret, fname, ...) DEFINE4(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE5(ret, fname, ...) DEFINE5(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE6(ret, fname, ...) DEFINE6(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE7(ret, fname, ...) DEFINE7(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE8(ret, fname, ...) DEFINE8(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE9(ret, fname, ...) DEFINE9(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE10(ret, fname, ...) DEFINE10(cuinit, cuda_, ret, fname, __VA_ARGS__)
+#define CUDA_DEFINE11(ret, fname, ...) DEFINE11(cuinit, cuda_, ret, fname, __VA_ARGS__)
+// context management
+CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
+CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
+CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
+CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
+CUDA_DEFINE1(CUresult, cuInit, unsigned int)
+CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
+// device management
+CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
+CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
+CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
+CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
+
+// link management (no trailing ';' -- the macro expansion is already a complete set of definitions)
+CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**)
+CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*)
+CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState)
+CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*)
+// module management
+CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
+CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
+CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
+CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
+CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
+CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+// stream management
+CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
+CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
+CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
+CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
+CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
+// function management
+CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
+CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
+// memory management
+CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
+CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
+CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
+CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t)
+CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
+CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
+// event management
+CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
+CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
+CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
+CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
+
+
+
+/* ------------------- *
+ * NVML
+ * ------------------- */
 bool dispatch::nvmlinit(){
  if(nvml_==nullptr)
    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
  return res;
 }

-//CUDA
-CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
-CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
-CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
-CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
-CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
-CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
-CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
-CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
-CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
-CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
-CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
-CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
-CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
-
-CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
-CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
-CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
-CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
-CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
-CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
-CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
-CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
-CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
-CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
-CUDA_DEFINE1(CUresult, cuInit, unsigned int)
-CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
-CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
-CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
-CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
-CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
-CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
-CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
-CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
-CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
-CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
-CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
-CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
-CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
-CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
-CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
-CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
-CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
-CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
-CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
+#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
+#define NVML_DEFINE1(ret, fname, ...) DEFINE1(nvmlinit, nvml_, ret, fname, __VA_ARGS__)
+#define NVML_DEFINE2(ret, fname, ...) DEFINE2(nvmlinit, nvml_, ret, fname, __VA_ARGS__)
+#define NVML_DEFINE3(ret, fname, ...) DEFINE3(nvmlinit, nvml_, ret, fname, __VA_ARGS__)

 NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)

+/* ------------------- *
+ * HIP
+ * ------------------- */
+bool dispatch::hipinit(){
+  // Lazily open the HIP runtime; a missing library or symbol is reported as `false`, not thrown.
+  if(hip_==nullptr)
+    hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
+  if(hip_ == nullptr || (hipInit_ = dlsym(hip_, "hipInit")) == nullptr)
+    return false;
+  // hipInit takes an `unsigned int` flags argument, so the pointer type must carry it.
+  hipError_t (*fptr)(unsigned int);
+  *reinterpret_cast<void **>(&fptr) = hipInit_;
+  check((*fptr)(0));  // throws for every status other than hipSuccess
+  return true;        // previously `return res;`, which turned hipSuccess (0) into false
+}
+
+// HIP wrappers: each forwards to the matching DEFINE<N> macro, passing hipinit and hip_ ahead of the signature.
+#define HIP_DEFINE1(ret, fname, ...) DEFINE1(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE2(ret, fname, ...) DEFINE2(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE3(ret, fname, ...) DEFINE3(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE4(ret, fname, ...) DEFINE4(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE5(ret, fname, ...) DEFINE5(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE6(ret, fname, ...) DEFINE6(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE7(ret, fname, ...) DEFINE7(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE8(ret, fname, ...) DEFINE8(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE9(ret, fname, ...) DEFINE9(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE10(ret, fname, ...) DEFINE10(hipinit, hip_, ret, fname, __VA_ARGS__)
+#define HIP_DEFINE11(ret, fname, ...) DEFINE11(hipinit, hip_, ret, fname, __VA_ARGS__)
+// context management
+HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
+HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
+HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
+HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
+HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
+HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
+HIP_DEFINE1(hipError_t, hipInit, unsigned int)
+HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
+// device management
+HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)  // NOTE(review): HIP's hipGetDevice appears to take only (int*); the (hipDevice_t*, int) form is hipDeviceGet -- confirm intended symbol
+HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
+HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
+HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
+HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
+// module management
+HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
+HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
+HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
+HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
+HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
+HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
+// stream management
+HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)  // NOTE(review): HIP's hipStreamCreate appears to take only (hipStream_t*); the flags form is hipStreamCreateWithFlags -- confirm
+HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
+HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
+HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
+// function management
+HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
+HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
+// memory management
+HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
+HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
+HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
+HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
+HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
+HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
+HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)  // NOTE(review): CUpointer_attribute is a CUDA type; HIP's own enum looks to be hipPointer_attribute -- confirm
+HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
+// event management
+HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)  // NOTE(review): HIP's hipEventCreate appears to take only (hipEvent_t*); the flags form is hipEventCreateWithFlags -- confirm
+HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
+HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
+HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
+
+
+/* ------------------- *
+ * COMMON
+ * ------------------- */

 // Release
 void dispatch::release(){
@@ -190,61 +291,9 @@ void dispatch::release(){

 void* dispatch::cuda_;
 void* dispatch::nvml_;
-
-//CUDA
-void* dispatch::cuCtxGetCurrent_;
-void* dispatch::cuCtxSetCurrent_;
-void* dispatch::cuCtxDestroy_v2_;
-void* dispatch::cuEventCreate_;
-void* dispatch::cuDeviceGet_;
-void* dispatch::cuMemcpyDtoH_v2_;
-void* dispatch::cuStreamCreate_;
-void* dispatch::cuEventElapsedTime_;
-void* dispatch::cuMemFree_v2_;
-void* dispatch::cuMemcpyDtoHAsync_v2_;
-void* dispatch::cuDriverGetVersion_;
-void* dispatch::cuDeviceGetName_;
-void* dispatch::cuDeviceGetPCIBusId_;
-void* dispatch::cuModuleGetGlobal_v2_;
-
-void* dispatch::cuLinkAddData_v2_;
-void* dispatch::cuLinkCreate_v2_;
-void* dispatch::cuLinkDestroy_;
-void* dispatch::cuModuleLoadData_;
-void* dispatch::cuLinkComplete_;
-
-void* dispatch::cuMemcpyHtoDAsync_v2_;
-void* dispatch::cuModuleLoad_;
-void* dispatch::cuLaunchKernel_;
-void* dispatch::cuModuleUnload_;
-void* dispatch::cuModuleLoadDataEx_;
-void* dispatch::cuDeviceGetAttribute_;
-void* dispatch::cuDeviceGetCount_;
-void* dispatch::cuMemcpyHtoD_v2_;
-void* dispatch::cuInit_;
-void* dispatch::cuEventRecord_;
-void* dispatch::cuCtxCreate_v2_;
-void* dispatch::cuModuleGetFunction_;
-void* dispatch::cuStreamSynchronize_;
-void* dispatch::cuStreamDestroy_v2_;
-void* dispatch::cuStreamGetCtx_;
-void* dispatch::cuEventDestroy_v2_;
-void* dispatch::cuMemAlloc_v2_;
-void* dispatch::cuPointerGetAttribute_;
-void* dispatch::cuCtxGetDevice_;
-void* dispatch::cuMemsetD8Async_;
-void* dispatch::cuCtxPushCurrent_v2_;
-void* dispatch::cuCtxPopCurrent_v2_;
-void* dispatch::cuFuncGetAttribute_;
-void* dispatch::cuFuncSetAttribute_;
-void* dispatch::cuFuncSetCacheConfig_;
-void* dispatch::cuCtxEnablePeerAccess_;
-
 void* dispatch::nvmlInit_v2_;
-void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
-void* dispatch::nvmlDeviceGetClockInfo_;
-void* dispatch::nvmlDeviceGetMaxClockInfo_;
-void* dispatch::nvmlDeviceSetApplicationsClocks_;
+void* dispatch::hip_;
+

 }
 }
--- a/lib/driver/error.cc
+++ b/lib/driver/error.cc
@@ -94,6 +94,73 @@ void check(CUresult err)
  }
 }

+// Maps a HIP status code onto the matching exception::hip type; hipSuccess returns normally, unlisted codes throw unknown().
+void check(hipError_t error) {
+  using namespace exception::hip;
+  switch(error){
+    case hipSuccess: return;
+    case hipErrorInvalidValue: throw invalid_value();
+    case hipErrorMemoryAllocation: throw out_of_memory();
+    case hipErrorNotInitialized: throw not_initialized();
+    case hipErrorDeinitialized: throw deinitialized();
+    case hipErrorProfilerDisabled: throw profiler_disabled();
+    case hipErrorProfilerNotInitialized: throw profiler_not_initialized();
+    case hipErrorProfilerAlreadyStarted: throw profiler_already_started();
+    case hipErrorProfilerAlreadyStopped: throw profiler_already_stopped();
+    case hipErrorNoDevice: throw no_device();
+    case hipErrorInvalidSymbol: throw invalid_symbol();
+    case hipErrorInvalidDevice: throw invalid_device();
+    case hipErrorInvalidImage: throw invalid_image();
+    case hipErrorInvalidContext: throw invalid_context();
+    case hipErrorContextAlreadyCurrent: throw context_already_current();
+    case hipErrorMapFailed: throw map_failed();
+    case hipErrorUnmapFailed: throw unmap_failed();
+    case hipErrorArrayIsMapped: throw array_is_mapped();
+    case hipErrorAlreadyMapped: throw already_mapped();
+    case hipErrorNoBinaryForGpu: throw no_binary_for_gpu();
+    case hipErrorAlreadyAcquired: throw already_acquired();
+    case hipErrorNotMapped: throw not_mapped();
+    case hipErrorNotMappedAsArray: throw not_mapped_as_array();
+    case hipErrorNotMappedAsPointer: throw not_mapped_as_pointer();
+    case hipErrorECCNotCorrectable: throw ecc_uncorrectable();
+    case hipErrorUnsupportedLimit: throw unsupported_limit();
+    case hipErrorContextAlreadyInUse: throw context_already_in_use();
+    case hipErrorPeerAccessUnsupported: throw peer_access_unsupported();
+    case hipErrorInvalidKernelFile: throw invalid_ptx();
+    case hipErrorInvalidGraphicsContext: throw invalid_graphics_context();
+    case hipErrorInvalidSource: throw invalid_source();
+    case hipErrorFileNotFound: throw file_not_found();
+    case hipErrorSharedObjectSymbolNotFound: throw shared_object_symbol_not_found();
+    case hipErrorSharedObjectInitFailed: throw shared_object_init_failed();
+    case hipErrorOperatingSystem: throw operating_system();
+    case hipErrorInvalidResourceHandle: throw invalid_handle();
+    case hipErrorNotFound: throw not_found();
+    case hipErrorNotReady: throw not_ready();
+    case hipErrorIllegalAddress: throw illegal_address();
+    case hipErrorLaunchOutOfResources: throw launch_out_of_resources();
+    case hipErrorLaunchTimeOut: throw launch_timeout();
+    // case hipErrorLaunchIncompatibleTexturing: throw launch_incompatible_texturing();
+    case hipErrorPeerAccessAlreadyEnabled: throw peer_access_already_enabled();
+    case hipErrorPeerAccessNotEnabled: throw peer_access_not_enabled();
+    // case hipErrorPrimaryContextActive: throw primary_context_active();
+    // case hipErrorContextIsDestroyed: throw context_is_destroyed();
+    case hipErrorAssert: throw assert_error();
+    // case hipErrorTooManyPeers: throw too_many_peers();
+    case hipErrorHostMemoryAlreadyRegistered: throw host_memory_already_registered();
+    case hipErrorHostMemoryNotRegistered: throw host_memory_not_registered();
+    // case hipErrorHardwareStackError: throw hardware_stack_error();
+    // case hipErrorIllegalInstruction: throw illegal_instruction();
+    // case hipErrorMisalignedAddress: throw misaligned_address();
+    // case hipErrorInvalidAddressSpace: throw invalid_address_space();
+    // case hipErrorInvalidPc: throw invalid_pc();
+    case hipErrorLaunchFailure: throw launch_failed();
+    // case hipErrorNotPermitted: throw not_permitted();
+    case hipErrorNotSupported: throw not_supported();
+    case hipErrorUnknown: throw unknown();
+    default: throw unknown();
+  }
+}
+
 }
 }

--- a/lib/driver/handle.cc
+++ b/lib/driver/handle.cc
@@ -1,91 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-* 
-* Permission is hereby granted, free of charge, to any person obtaining 
-* a copy of this software and associated documentation files 
-* (the "Software"), to deal in the Software without restriction, 
-* including without limitation the rights to use, copy, modify, merge, 
-* publish, distribute, sublicense, and/or sell copies of the Software, 
-* and to permit persons to whom the Software is furnished to do so, 
-* subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be 
-* included in all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include "triton/driver/handle.h"
-#include "triton/driver/error.h"
-
-namespace triton
-{
-
-namespace driver
-{
-
-//Host
-inline void _delete(host_platform_t) { }
-inline void _delete(host_device_t)   { }
-inline void _delete(host_context_t)  { }
-inline void _delete(host_module_t)   { }
-inline void _delete(host_stream_t)   { }
-inline void _delete(host_buffer_t x)   { if(x.data) delete[] x.data; }
-inline void _delete(host_function_t) { }
-
-//CUDA
-inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
-inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
-inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
-inline void _delete(CUdevice) { }
-inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
-inline void _delete(CUfunction) { }
-inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
-inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
-inline void _delete(CUPlatform){}
-
-//Constructor
-template<class T>
-handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
-{ }
-
-template<class T>
-handle<T>::handle(): has_ownership_(false){ }
-
-
-template<class T>
-handle<T>::~handle(){
-  try{
-    if(has_ownership_ && h_ && h_.unique())
-      _delete(*h_);
-  }catch(const exception::cuda::base&){
-    // order of destruction for global variables
-    // is not guaranteed
-  }
-}
-
-template class handle<CUdeviceptr>;
-template class handle<CUstream>;
-template class handle<CUcontext>;
-template class handle<CUdevice>;
-template class handle<cu_event_t>;
-template class handle<CUfunction>;
-template class handle<CUmodule>;
-template class handle<CUPlatform>;
-
-template class handle<host_platform_t>;
-template class handle<host_device_t>;
-template class handle<host_context_t>;
-template class handle<host_module_t>;
-template class handle<host_stream_t>;
-template class handle<host_buffer_t>;
-template class handle<host_function_t>;
-
-
-}
-}
--- a/lib/driver/kernel.cc
+++ b/lib/driver/kernel.cc
@@ -1,94 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <string.h>
-#include "triton/driver/kernel.h"
-#include "triton/driver/buffer.h"
-
-namespace triton
-{
-
-namespace driver
-{
-
-
-/* ------------------------ */
-//         Base             //
-/* ------------------------ */
-
-kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
-  polymorphic_resource(fn, has_ownership), program_(program){
-}
-
-
-kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
-  polymorphic_resource(fn, has_ownership), program_(program){
-}
-
-kernel* kernel::create(driver::module* program, const char* name) {
-    switch(program->backend()){
-    case CUDA: return new cu_kernel(program, name);
-    case Host: return new host_kernel(program, name);
-    default: throw std::runtime_error("unknown backend");
-    }
-}
-
-driver::module* kernel::module() {
-  return program_;
-}
-
-/* ------------------------ */
-//         Host             //
-/* ------------------------ */
-
-host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) {
-  hst_->fn = program->hst()->functions.at(name);
-}
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-
-cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) {
-  dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
-  dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);
-  // properties
-  int shared_total, shared_optin, shared_static;
-  int n_spills, n_reg;
-  CUdevice dev;
-  dispatch::cuCtxGetDevice(&dev);
-  dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
-  dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
-  dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
-  dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,  *cu_);
-  dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
-//  std::cout << n_reg << std::endl;
-  if (shared_optin > 49152){
-//      std::cout << "dynamic shared memory " << shared_optin << " " << shared_static << std::endl;
-      dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
-  }
-}
-
-}
-
-}
-
--- a/lib/driver/llvm.cc
+++ b/lib/driver/llvm.cc
@@ -0,0 +1,324 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#include <fstream>
+#include <unistd.h>
+#include <memory>
+#include <regex>
+#include "triton/driver/llvm.h"
+#include "triton/driver/dispatch.h"
+#include "triton/driver/error.h"
+#include "triton/tools/sha1.hpp"
+#include "triton/tools/sys/getenv.hpp"
+#include "triton/tools/sys/mkdir.hpp"
+#include "triton/tools/sys/exec.hpp"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+// begin AMD stuff
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/Program.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+// end AMD stuff
+
+namespace triton{
+namespace driver{
+
+// Registers the NVPTX and AMDGPU LLVM back-ends (target info, target, MC layer
+// and assembly printer). Only the first call does any work; every later call
+// returns immediately.
+void init_llvm() {
+  static bool registered = false;
+  if(registered)
+    return;
+  // NVIDIA back-end
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXTargetMC();
+  LLVMInitializeNVPTXAsmPrinter();
+  // AMD back-end
+  LLVMInitializeAMDGPUTargetInfo();
+  LLVMInitializeAMDGPUTarget();
+  LLVMInitializeAMDGPUTargetMC();
+  LLVMInitializeAMDGPUAsmPrinter();
+  registered = true;
+}
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+// Replaces one region of `str` with `target`. The region starts at the first
+// occurrence of `begin` and runs through the next occurrence of `end` that
+// follows the `begin` marker (both markers included, whatever their length).
+// If `end` does not occur after `begin`, the region extends to the end of `str`.
+// Returns false, leaving `str` untouched, when `begin` is not found.
+static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
+  const size_t start_replace = str.find(begin);
+  if(start_replace == std::string::npos)
+    return false;
+  const size_t end_replace = str.find(end, start_replace + begin.size());
+  // pass npos through explicitly as "up to the end of the string" instead of
+  // relying on the unsigned wrap-around of `npos + 1`
+  const size_t count = (end_replace == std::string::npos)
+                         ? std::string::npos
+                         : end_replace + end.size() - start_replace;
+  str.replace(start_replace, count, target);
+  return true;
+}
+
+// Returns the PTX version (major*10 + minor, e.g. 73 for PTX 7.3) associated
+// with a CUDA version number such as 11020.
+// Throws std::runtime_error for version numbers below 10000.
+int vptx(int version){
+  struct entry { int min_version; int ptx; };
+  // ordered from newest to oldest: the first threshold that is met wins
+  static const entry table[] = {
+    {11030, 73},
+    {11020, 72},
+    {11010, 71},
+    {11000, 70},
+    {10020, 65},
+    {10010, 64},
+    {10000, 63}
+  };
+  for(const entry& e : table){
+    if(version >= e.min_version)
+      return e.ptx;
+  }
+  throw std::runtime_error("Triton requires CUDA 10+");
+}
+
+// Lowers an LLVM module to PTX assembly text.
+//  - module : module to compile. It is modified in place: the target triple and
+//             data layout are set and every function is marked AlwaysInline.
+//  - cc     : compute capability (e.g. 75), written into the `.target` directive
+//  - version: CUDA version number, mapped to a PTX version by vptx()
+// Code generation itself is capped at sm_75 / PTX 6.4 because the LLVM version in
+// use may not officially support newer hardware; the `.version` and `.target`
+// directives of the generated text are then rewritten to the requested values.
+// Throws std::runtime_error if the CUDA version is unsupported, if the NVPTX
+// target cannot be found or instantiated, or if it cannot emit assembly.
+std::string llir_to_ptx(llvm::Module* module, int cc, int version){
+  // LLVM version in use may not officially support target hardware
+  int max_nvvm_cc = 75;
+  int max_nvvm_ptx = 64;
+  // options
+  auto options = llvm::cl::getRegisteredOptions();
+  auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
+  assert(short_ptr);
+  short_ptr->setValue(true);
+  // compute capability
+  std::string sm = "sm_" + std::to_string(cc);
+  // max PTX version
+  int ptx = vptx(version);
+  int ptx_major = ptx / 10;
+  int ptx_minor = ptx % 10;
+  // create
+  llvm::SmallVector<char, 0> buffer;
+  std::string triple = "nvptx64-nvidia-cuda";
+  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
+  std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
+  init_llvm();
+  // verify and store llvm
+  llvm::legacy::PassManager pm;
+  pm.add(llvm::createVerifierPass());
+  pm.run(*module);
+  // create machine
+  module->setTargetTriple(triple);
+  std::string error;
+  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  // lookupTarget() reports failure through a null result and `error`
+  if(!target)
+    throw std::runtime_error("could not find an LLVM target for " + triple + ": " + error);
+  llvm::TargetOptions opt;
+  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.UnsafeFPMath = false;
+  opt.NoInfsFPMath = false;
+  opt.NoNaNsFPMath = true;
+  // the target machine is heap-allocated by LLVM: own it so that it is released
+  // on every exit path instead of being leaked on each compilation
+  std::unique_ptr<llvm::TargetMachine> machine(
+      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
+                                  llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive));
+  if(!machine)
+    throw std::runtime_error("could not create an LLVM target machine for " + proc);
+  // set data layout
+  module->setDataLayout(machine->createDataLayout());
+  // emit machine code
+  for (llvm::Function &f : module->functions())
+    f.addFnAttr(llvm::Attribute::AlwaysInline);
+  llvm::legacy::PassManager pass;
+  llvm::raw_svector_ostream stream(buffer);
+  // emit; addPassesToEmitFile() returns true when the file type is unsupported
+  if(machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile))
+    throw std::runtime_error("the LLVM target for " + triple + " cannot emit assembly");
+  pass.run(*module);
+
+  // post-process
+  std::string result(buffer.begin(), buffer.end());
+  find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
+  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
+  // strip the inline-asm marker comments
+  while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
+  while(find_and_replace(result, "\t// end inline asm", "\n", ""));
+  return result;
+}
+
+
+// Builds a CUDA module from PTX source text for compute capability `cc`.
+// If a `ptxas` executable can be run from PATH, the PTX is written to a temporary
+// file, assembled with ptxas and the resulting object is loaded; otherwise the
+// PTX is handed to the driver's own JIT compiler.
+// Throws std::runtime_error when the temporary files cannot be created or when
+// ptxas exits with a non-zero status (the message carries the ptxas log); errors
+// raised by the driver calls propagate unchanged. On an invalid-PTX error the
+// PTX is printed before the exception is re-thrown.
+CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
+  // JIT compile source-code
+  try{
+    // use ptxas if present in PATH. Otherwise, use JIT from the driver
+    std::string ptxas = "ptxas";
+    std::string version;
+    int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
+
+    // Use PTXAS via system call
+    if(use_system_ptxas){
+      // mkstemp() creates each file and returns an open descriptor; the files are
+      // re-opened by name below, so the descriptors are closed right away
+      char _fsrc[] = "/tmp/triton_k_XXXXXX";
+      char _flog[] = "/tmp/triton_l_XXXXXX";
+      int fd_src = mkstemp(_fsrc);
+      if(fd_src == -1)
+        throw std::runtime_error("could not create a temporary file for the PTX source");
+      close(fd_src);
+      int fd_log = mkstemp(_flog);
+      if(fd_log == -1){
+        unlink(_fsrc);
+        throw std::runtime_error("could not create a temporary file for the ptxas log");
+      }
+      close(fd_log);
+      std::string fsrc = _fsrc;
+      std::string flog = _flog;
+      std::string fobj = fsrc + ".o";
+      // removes the source, the log and the object file produced by ptxas
+      auto cleanup = [&](){
+        unlink(fsrc.c_str());
+        unlink(flog.c_str());
+        unlink(fobj.c_str());
+      };
+      CUmodule ret;
+      try{
+        std::ofstream ofs(fsrc);
+        ofs << ptx;
+        ofs.close();
+        // compile ptx with ptxas
+        std::string cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fobj + " 2> " + flog;
+        int err = system(cmd.c_str());
+        if(err != 0){
+          // surface the assembler diagnostics instead of failing later on a
+          // missing object file
+          std::ifstream ifs(flog);
+          std::string log, line;
+          while(std::getline(ifs, line))
+            log += line + "\n";
+          throw std::runtime_error("ptxas failed with status " + std::to_string(err) + ":\n" + log);
+        }
+        dispatch::cuModuleLoad(&ret, fobj.c_str());
+      }
+      catch(...){
+        // do not leave temporary files behind when assembling or loading fails
+        cleanup();
+        throw;
+      }
+      cleanup();
+      return ret;
+    }
+
+    // Use PTXAS included in driver
+    CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
+                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
+                          CU_JIT_LOG_VERBOSE};
+    // sizes are constants so that the log buffers are ordinary arrays rather
+    // than variable-length arrays (a compiler extension in C++)
+    const unsigned int errbufsize = 8192;
+    const unsigned int logbufsize = 8192;
+    char _err[errbufsize];
+    char _log[logbufsize];
+    // one value per entry of `opt`, in the same order
+    void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
+    CUmodule ret;
+    dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
+    return ret;
+  }
+  catch(exception::cuda::invalid_ptx const &){
+    std::cout << ptx << std::endl;
+    std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
+    throw;
+  }
+}
+
+/* ------------------------ */
+//         HIP              //
+/* ------------------------ */
+
+// Compiles an LLVM module to an AMDGPU object file and links it into an HSACO
+// code object with /opt/rocm/llvm/bin/ld.lld.
+//  - module: module to compile. It is modified in place: the target triple and
+//            data layout are set and every function is marked AlwaysInline.
+//  - _proc : currently ignored; the processor is hard-coded to gfx908.
+//            NOTE(review): presumably meant to select the GCN architecture --
+//            confirm what callers pass before wiring it in.
+// Files written: /tmp/<module identifier>.o, .gcn and .hsaco.
+// Returns the path of the HSACO file. A linker failure is only reported on
+// stdout; the path is returned regardless.
+// Throws std::runtime_error if the AMDGPU target cannot be found or
+// instantiated, if the object file cannot be created, or if the target cannot
+// emit object files.
+std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
+  init_llvm();
+
+//  proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
+//  features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
+
+  // create
+  llvm::SmallVector<char, 0> buffer;
+  std::string triple = "amdgcn-amd-amdhsa";
+  std::string features;
+  std::string proc = "gfx908";
+  // verify and store llvm
+  llvm::legacy::PassManager pm;
+  pm.add(llvm::createVerifierPass());
+  pm.run(*module);
+  // create machine
+  module->setTargetTriple(triple);
+  std::string error;
+  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  // lookupTarget() reports failure through a null result and `error`
+  if(!target)
+    throw std::runtime_error("could not find an LLVM target for " + triple + ": " + error);
+  llvm::TargetOptions opt;
+  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.UnsafeFPMath = false;
+  opt.NoInfsFPMath = false;
+  opt.NoNaNsFPMath = true;
+  // own the heap-allocated target machine so that it is released on every exit path
+  std::unique_ptr<llvm::TargetMachine> machine(
+      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
+                                  llvm::Reloc::PIC_, llvm::None,
+                                  llvm::CodeGenOpt::Aggressive));
+  if(!machine)
+    throw std::runtime_error("could not create an LLVM target machine for " + proc);
+  // set data layout
+  module->setDataLayout(machine->createDataLayout());
+  // emit machine code
+  for (llvm::Function &f : module->functions())
+    f.addFnAttr(llvm::Attribute::AlwaysInline);
+  llvm::legacy::PassManager pass;
+
+  // create dump files
+  std::string module_name = module->getModuleIdentifier();
+  std::error_code ec;
+
+  // Save GCN ISA binary. The object file is binary data, so it is not opened in
+  // text mode.
+  std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
+  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
+      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_None));
+  if (ec)
+    throw std::runtime_error(isabin_path + " was not created: " + ec.message());
+
+  // emit; addPassesToEmitFile() returns true when the file type is unsupported
+  if(machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile))
+    throw std::runtime_error("the LLVM target for " + triple + " cannot emit object files");
+  pass.run(*module);
+  // the stream is buffered: push everything to disk before the linker reads the file
+  isabin_fs->flush();
+  // Save GCN ISA.
+  // NOTE(review): nothing in this function writes into `buffer` (the passes emit
+  // to the object stream), so this file is created empty.
+  std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
+  std::string result(buffer.begin(), buffer.end());
+  std::ofstream amdgcn(amdgcn_path);
+  amdgcn << result;
+  amdgcn.close();
+
+  // generate HSACO file
+  std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
+  std::string error_message;
+  int lld_result =
+      llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
+                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
+                                llvm::None, {}, 0, 0, &error_message);
+  if (lld_result)
+  {
+    std::cout << "ld.lld execute fail: " << std::endl;
+    std::cout << error_message << std::endl;
+    std::cout << lld_result << std::endl;
+  }
+
+  return hsaco_path;
+}
+
+
+// Loads the HSACO code object stored at `path` into a HIP module.
+// The whole file is read into memory and handed to hipModuleLoadDataEx together
+// with error/info log buffers and the verbose-log option.
+// Throws std::runtime_error if the file cannot be opened, is empty or cannot be
+// read in full; errors raised by the HIP call propagate unchanged.
+hipModule_t amdgpu_to_hipmodule(const std::string& path) {
+  // Read HSACO. The file is opened positioned at its end so that tellg() yields
+  // its size.
+  std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
+  if(!hsaco_file.is_open())
+    throw std::runtime_error("could not open HSACO file: " + path);
+  // tellg() returns -1 on failure, which must not be used as a buffer size
+  const std::streamoff hsaco_file_size = hsaco_file.tellg();
+  if(hsaco_file_size <= 0)
+    throw std::runtime_error("HSACO file is empty or unreadable: " + path);
+
+  std::vector<unsigned char> hsaco(static_cast<size_t>(hsaco_file_size));
+  hsaco_file.seekg(0, std::ios::beg);
+  hsaco_file.read(reinterpret_cast<char*>(hsaco.data()), hsaco_file_size);
+  if(!hsaco_file)
+    throw std::runtime_error("could not read HSACO file: " + path);
+  hsaco_file.close();
+  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
+                            hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
+                            hipJitOptionLogVerbose};
+  // sizes are constants so that the log buffers are ordinary arrays rather than
+  // variable-length arrays (a compiler extension in C++)
+  const unsigned int errbufsize = 8192;
+  const unsigned int logbufsize = 8192;
+  char _err[errbufsize];
+  char _log[logbufsize];
+  // one value per entry of `opt`, in the same order
+  void* optval[] = {(void*)(uintptr_t)errbufsize,
+                    (void*)_err, (void*)(uintptr_t)logbufsize,
+                    (void*)_log, (void*)1};
+  hipModule_t ret;
+  dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
+  return ret;
+}
+
+
+
+}
+}
+
--- a/lib/driver/module.cc
+++ b/lib/driver/module.cc
@@ -1,375 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files
-* (the "Software"), to deal in the Software without restriction,
-* including without limitation the rights to use, copy, modify, merge,
-* publish, distribute, sublicense, and/or sell copies of the Software,
-* and to permit persons to whom the Software is furnished to do so,
-* subject to the following conditions:
-*
-* The above copyright notice and this permission notice shall be
-* included in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-#include <fstream>
-#include <unistd.h>
-#include <memory>
-#include <regex>
-#include "triton/driver/module.h"
-#include "triton/driver/context.h"
-#include "triton/driver/error.h"
-#include "triton/tools/sha1.hpp"
-#include "triton/tools/sys/getenv.hpp"
-#include "triton/tools/sys/mkdir.hpp"
-#include "triton/tools/sys/exec.hpp"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/IR/IRPrintingPasses.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-
-std::string exec(const char* cmd) {
-    std::array<char, 128> buffer;
-    std::string result;
-    std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
-    if (!pipe) {
-        throw std::runtime_error("popen() failed!");
-    }
-    while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
-        result += buffer.data();
-    }
-    return result;
-}
-
-  void LLVMInitializeNVPTXTargetInfo();
-  void LLVMInitializeNVPTXTarget();
-  void LLVMInitializeNVPTXTargetMC();
-  void LLVMInitializeNVPTXAsmPrinter();
-  void LLVMInitializeNVPTXAsmParser();
-
-
-namespace triton
-{
-namespace driver
-{
-
-/* ------------------------ */
-//         Base             //
-/* ------------------------ */
-
-
-void module::init_llvm() {
-  static bool init = false;
-  if(!init){
-    LLVMInitializeNVPTXTargetInfo();
-    LLVMInitializeNVPTXTarget();
-    LLVMInitializeNVPTXTargetMC();
-    LLVMInitializeNVPTXAsmPrinter();
-    init = true;
-  }
-}
-
-module::module(CUmodule mod, bool has_ownership)
-  : polymorphic_resource(mod, has_ownership), spilled_(0) {
-}
-
-module::module(host_module_t mod, bool has_ownership)
-  : polymorphic_resource(mod, has_ownership), spilled_(0) {
-}
-
-
-module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
-  switch(device->backend()){
-    case CUDA: return new cu_module(device, std::move(src));
-    case Host: return new host_module(std::move(src));
-    default: throw std::runtime_error("unknown backend");
-  }
-}
-
-void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
-                                 const std::string &proc, std::string layout,
-                                 llvm::SmallVectorImpl<char> &buffer,
-                                 const std::string& features,
-                                 file_type_t ft) {
-
-}
-
-
-/* ------------------------ */
-//        Host              //
-/* ------------------------ */
-
-host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
-  throw std::runtime_error("CPU unsupported");
-//  init_llvm();
-//  // create kernel wrapper
-//  llvm::LLVMContext &ctx = src->getContext();
-//  llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
-//  llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
-//  llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
-//  std::vector<llvm::Type*> tys = {args_ty, int32_ty, int32_ty, int32_ty};
-//  llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, tys, false);
-//  llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "_main", &*src);
-//  llvm::Function* fn = &*src->getFunctionList().begin();
-//  llvm::FunctionType *fn_ty = fn->getFunctionType();
-//  std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
-//  std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
-//  llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
-//  llvm::IRBuilder<> ir_builder(ctx);
-//  ir_builder.SetInsertPoint(entry);
-//  auto get_size = [](llvm::Type* ty) { return ty->isPointerTy() ? sizeof(char*) : ty->getPrimitiveSizeInBits() / 8; };
-//  llvm::Value* base = main->arg_begin();
-//  llvm::Value* args_base = ir_builder.CreateBitCast(base, base->getType()->getPointerElementType());
-
-//  size_t offset = 0;
-//  for(unsigned i = 0; i < ptrs.size(); i++){
-//    ptrs[i] = ir_builder.CreateGEP(args_base, ir_builder.getInt32(offset));
-//    size_t nbytes = get_size(fn_ty->getParamType(i));
-//    offset += nbytes;
-//    if(i < ptrs.size() - 1){
-//      size_t np1bytes = get_size(fn_ty->getParamType(i+1));
-//      offset = (offset + np1bytes - 1) / np1bytes * np1bytes;
-//    }
-//  }
-//  for(unsigned i = 0; i < ptrs.size(); i++)
-//    ptrs[i] = ir_builder.CreateBitCast(ptrs[i], fn_ty->getParamType(i)->getPointerTo());
-//  for(unsigned i = 0; i < ptrs.size(); i++)
-//    fn_args[i] = ir_builder.CreateLoad(ptrs[i]);
-
-//  fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
-//  fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
-//  fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
-//  ir_builder.CreateCall(fn, fn_args);
-//  ir_builder.CreateRetVoid();
-
-////  llvm::legacy::PassManager pm;
-////  pm.add(llvm::createPrintModulePass(llvm::outs()));
-////  pm.add(llvm::createVerifierPass());
-////  pm.run(*src);
-
-////   create execution engine
-//  for(llvm::Function& fn: src->functions())
-//    hst_->functions[fn.getName().str()] = &fn;
-
-////  llvm::orc::JITTargetMachineBuilder JTMB = *llvm::orc::JITTargetMachineBuilder::detectHost();
-////  auto DL = JTMB.getDefaultDataLayoutForTarget();
-////  auto CIRC = std::unique_ptr<llvm::orc::ConcurrentIRCompiler>(new llvm::orc::ConcurrentIRCompiler(JTMB));
-////  hst_->ES = new llvm::orc::ExecutionSession();
-////  hst_->ObjectLayer = new llvm::orc::RTDyldObjectLinkingLayer(*hst_->ES, []() { return std::unique_ptr<llvm::SectionMemoryManager>(new llvm::SectionMemoryManager()); });
-////  hst_->CompileLayer = new llvm::orc::IRCompileLayer(*hst_->ES, *hst_->ObjectLayer, *CIRC);
-////  hst_->DL = new llvm::DataLayout(std::move(*DL));
-////  hst_->Mangle = new llvm::orc::MangleAndInterner(*hst_->ES, *hst_->DL);
-////  hst_->Ctx = new llvm::orc::ThreadSafeContext(std::unique_ptr<llvm::LLVMContext>(new llvm::LLVMContext()));
-////  hst_->MainJD =  &hst_->ES->createJITDylib("<main>");
-////  hst_->MainJD->setGenerator(llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
-////                                            hst_->DL->getGlobalPrefix())));
-////  llvm::cantFail(hst_->CompileLayer->add(*hst_->MainJD, llvm::orc::ThreadSafeModule(std::move(src), *hst_->Ctx)));
-////  hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->ES->lookup({hst_->MainJD}, (*hst_->Mangle)("_main"))->getAddress());
-
-
-
-//  llvm::EngineBuilder builder(std::move(src));
-//  builder.setErrorStr(&hst_->error);
-//  builder.setMCJITMemoryManager(std::make_unique<llvm::SectionMemoryManager>());
-//  builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
-//  builder.setEngineKind(llvm::EngineKind::JIT);
-//  hst_->engine = builder.create();
-//  hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->engine->getFunctionAddress("_main"));
-}
-
-std::unique_ptr<buffer> host_module::symbol(const char *name) const {
-  throw std::runtime_error("not implemented");
-}
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
-  size_t start_replace = str.find(begin);
-  size_t end_replace = str.find(end, start_replace);
-  if(start_replace == std::string::npos)
-    return false;
-  str.replace(start_replace, end_replace + 1 - start_replace, target);
-  return true;
-}
-
-//static std::map<int, int> vptx = {
-//  {10000, 63},
-//  {10010, 64},
-//  {10020, 65},
-//  {11000, 70},
-//  {11010, 71},
-//  {11020, 72},
-//  {11030, 73},
-//  {11040, 73}
-//};
-
-int vptx(int version){
-  if(version >= 11030) return 73;
-  if(version >= 11020) return 72;
-  if(version >= 11010) return 71;
-  if(version >= 11000) return 70;
-  if(version >= 10020) return 65;
-  if(version >= 10010) return 64;
-  if(version >= 10000) return 63;
-  throw std::runtime_error("Triton requires CUDA 10+");
-}
-
-std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
-  // LLVM version in use may not officially support target hardware
-  int max_nvvm_cc = 75;
-  int max_nvvm_ptx = 64;
-  // options
-  auto options = llvm::cl::getRegisteredOptions();
-  auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
-  assert(short_ptr);
-  short_ptr->setValue(true);
-  // compute capability
-  int cc = ((driver::cu_device*)device)->compute_capability();
-  std::string sm = "sm_" + std::to_string(cc);
-  // driver version
-  int version;
-  dispatch::cuDriverGetVersion(&version);
-  int ptx = vptx(version);
-  int ptx_major = ptx / 10;
-  int ptx_minor = ptx % 10;
-  // create
-  llvm::SmallVector<char, 0> buffer;
-  std::string triple = "nvptx64-nvidia-cuda";
-  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
-  std::string layout = "";
-  std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
-  init_llvm();
-  // verify and store llvm
-  llvm::legacy::PassManager pm;
-  pm.add(llvm::createVerifierPass());
-  pm.run(*module);
-  // create machine
-  module->setTargetTriple(triple);
-  std::string error;
-  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
-  llvm::TargetOptions opt;
-  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
-  opt.NoInfsFPMath = false;
-  opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
-                                                             llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
-  // set data layout
-  if(layout.empty())
-    module->setDataLayout(machine->createDataLayout());
-  else
-    module->setDataLayout(layout);
-  // emit machine code
-  for (llvm::Function &f : module->functions())
-    f.addFnAttr(llvm::Attribute::AlwaysInline);
-  llvm::legacy::PassManager pass;
-  llvm::raw_svector_ostream stream(buffer);
-  // emit
-  machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
-  pass.run(*module);
-
-  // post-process
-  std::string result(buffer.begin(), buffer.end());
-  find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
-  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
-  while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
-  while(find_and_replace(result, "\t// end inline asm", "\n", ""));
-  return result;
-}
-
-void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
-  // JIT compile source-code
-  try{
-    // use ptxas if present in PATH. Otherwise, use JIT from the driver
-    std::string ptxas = "ptxas";
-    std::string version;
-    int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
-
-    // Use PTXAS via system call
-    if(use_system_ptxas){
-      // compile ptx with ptxas
-      char _fsrc[] = "/tmp/triton_k_XXXXXX";
-      char _flog[] = "/tmp/triton_l_XXXXXX";
-      mkstemp(_fsrc);
-      mkstemp(_flog);
-      std::string fsrc = _fsrc;
-      std::string flog = _flog;
-      std::ofstream ofs(fsrc);
-      ofs << ptx;
-      ofs.close();
-      std::string cmd;
-      int err;
-      std::string cc = std::to_string(device->compute_capability());
-      cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
-      err = system(cmd.c_str());
-      dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
-      unlink(_fsrc);
-      unlink(_flog);
-      return;
-    }
-
-    // Use PTXAS included in driver
-    CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
-                          CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
-                          CU_JIT_LOG_VERBOSE};
-    unsigned int errbufsize = 8192;
-    unsigned int logbufsize = 8192;
-    char _err[errbufsize];
-    char _log[logbufsize];
-    void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
-    dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
-  }
-  catch(exception::cuda::invalid_ptx const &){
-//#ifdef TRITON_LOG_PTX_ERROR
-     std::cout << ptx << std::endl;
-    std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
-//    exit(1);
-//#endif
-    throw;
-  }
-}
-
-cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
-  llvm::raw_string_ostream oss(llir_);
-  oss << *ll_module;
-  oss.flush();
-  ptx_ = compile_llvm_module(ll_module.get(), device);
-  init_from_ptx(ptx_, (driver::cu_device*)device);
-}
-
-cu_module::cu_module(driver::device* device, std::string const & source) : module(CUmodule(), true), ptx_(source){
-  init_from_ptx(ptx_, (driver::cu_device*)device);
-}
-
-std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
-  CUdeviceptr handle;
-  size_t size;
-  dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
-  std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
-  return std::move(res);
-}
-
-
-}
-}
-
--- a/lib/driver/platform.cc
+++ b/lib/driver/platform.cc
@@ -1,68 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-* 
-* Permission is hereby granted, free of charge, to any person obtaining 
-* a copy of this software and associated documentation files 
-* (the "Software"), to deal in the Software without restriction, 
-* including without limitation the rights to use, copy, modify, merge, 
-* publish, distribute, sublicense, and/or sell copies of the Software, 
-* and to permit persons to whom the Software is furnished to do so, 
-* subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be 
-* included in all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <string>
-#include "triton/driver/platform.h"
-#include "triton/driver/device.h"
-
-
-namespace triton
-{
-namespace driver
-{
-
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-
-std::string cu_platform::version() const{
-  int version;
-  dispatch::cuDriverGetVersion(&version);
-  return std::to_string(version);
-}
-
-void cu_platform::devices(std::vector<device *> &devices) const{
-  int N;
-  dispatch::cuDeviceGetCount(&N);
-  for(int i = 0 ; i < N ; ++i){
-    CUdevice dvc;
-    dispatch::cuDeviceGet(&dvc, i);
-    devices.push_back(new driver::cu_device(dvc));
-  }
-}
-
-/* ------------------------ */
-//        Host              //
-/* ------------------------ */
-
-std::string host_platform::version() const {
-  return "1.0";
-}
-
-void host_platform::devices(std::vector<driver::device*> &devices) const {
-  devices.push_back(new driver::host_device());
-}
-
-
-}
-}
--- a/lib/driver/stream.cc
+++ b/lib/driver/stream.cc
@@ -1,142 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
-* 
-* Permission is hereby granted, free of charge, to any person obtaining 
-* a copy of this software and associated documentation files 
-* (the "Software"), to deal in the Software without restriction, 
-* including without limitation the rights to use, copy, modify, merge, 
-* publish, distribute, sublicense, and/or sell copies of the Software, 
-* and to permit persons to whom the Software is furnished to do so, 
-* subject to the following conditions:
-* 
-* The above copyright notice and this permission notice shall be 
-* included in all copies or substantial portions of the Software.
-* 
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-*/
-
-#include <cassert>
-#include <unistd.h>
-#include <array>
-#include "triton/driver/backend.h"
-#include "triton/driver/stream.h"
-#include "triton/driver/context.h"
-#include "triton/driver/device.h"
-#include "triton/driver/kernel.h"
-#include "triton/driver/buffer.h"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/GenericValue.h"
-
-namespace triton
-{
-
-namespace driver
-{
-
-/* ------------------------ */
-//         Base             //
-/* ------------------------ */
-
-stream::stream(CUstream cu, bool has_ownership)
-  : polymorphic_resource(cu, has_ownership) {
-}
-
-
-stream::stream(host_stream_t cl, bool has_ownership)
-  : polymorphic_resource(cl, has_ownership) {
-}
-
-driver::stream* stream::create(backend_t backend) {
-  switch(backend){
-    case CUDA: return new cu_stream();
-    case Host: return new host_stream();
-    default: throw std::runtime_error("unknown backend");
-  }
-}
-
-
-/* ------------------------ */
-//          Host            //
-/* ------------------------ */
-
-host_stream::host_stream(): stream(host_stream_t(), true) {
-  hst_->pool.reset(new ThreadPool(1));
-  hst_->futures.reset(new std::vector<std::future<void>>());
-}
-
-void host_stream::synchronize() {
-  for(auto& x: *hst_->futures)
-    x.wait();
-  hst_->futures->clear();
-  hst_->args.clear();
-}
-
-void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
-  auto hst = kernel->module()->hst();
-  hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
-  char* params = new char[args_size];
-  std::memcpy((void*)params, (void*)args, args_size);
-  for(size_t i = 0; i < grid[0]; i++)
-    for(size_t j = 0; j < grid[1]; j++)
-      for(size_t k = 0; k < grid[2]; k++)
-        hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
-}
-
-void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
-  std::memcpy((void*)buffer->hst()->data, ptr, size);
-}
-
-void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
-  std::memcpy(ptr, (const void*)buffer->hst()->data, size);
-}
-
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-
-
-cu_stream::cu_stream(CUstream str, bool take_ownership):
-  stream(str, take_ownership) {
-}
-
-cu_stream::cu_stream(): stream(CUstream(), true) {
-  dispatch::cuStreamCreate(&*cu_, 0);
-}
-
-void cu_stream::synchronize() {
-  dispatch::cuStreamSynchronize(*cu_);
-}
-
-void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
-  void *config[] = {
-      CU_LAUNCH_PARAM_BUFFER_POINTER, args,
-      CU_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
-      CU_LAUNCH_PARAM_END
-  };
-  dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem, *cu_, nullptr, config);
-}
-
-void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
-  if(blocking)
-    dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
-  else
-    dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
-}
-
-void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
-  if(blocking)
-    dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
-  else
-    dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
-}
-
-
-}
-
-}