History prior to this date belonged to the now-deprecated ISAAC project and was deleted to save space.

2021-07-27 12:38:38 -07:00
commit 6d7cf35123
202 changed files with 94034 additions and 0 deletions
--- a/lib/driver/backend.cc
+++ b/lib/driver/backend.cc
@@ -0,0 +1,229 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <vector>
+#include <stdexcept>
+#include "triton/driver/dispatch.h"
+#include "triton/driver/backend.h"
+#include "triton/driver/buffer.h"
+#include "triton/driver/context.h"
+#include "triton/driver/stream.h"
+#include "triton/driver/kernel.h"
+
+
+namespace triton
+{
+
+namespace driver
+{
+
+/*-----------------------------------*/
+//-----------  Platforms ------------*/
+/*-----------------------------------*/
+
+// Fills the platform cache on first use; later calls return immediately.
+// Throws std::runtime_error when no platform could be registered.
+void backend::platforms::init() {
+  if(cache_.empty()){
+    // register the CUDA platform when dispatch::cuinit() reports success
+    if(dispatch::cuinit())
+      cache_.push_back(new cu_platform());
+    // OpenCL and host registration are currently disabled:
+//  //if OpenCL is here
+//  if(dispatch::clinit()){
+//    cl_uint num_platforms;
+//    dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
+//    std::vector<cl_platform_id> ids(num_platforms);
+//    dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
+//    for(cl_platform_id id: ids)
+//      cache_.push_back(new cl_platform(id));
+//  }
+//  //if host is here
+//  bool host_visible = true;
+//  if(host_visible){
+//    cache_.push_back(new host_platform());
+//  }
+    // nothing registered: report it to the caller
+    if(cache_.empty())
+      throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
+  }
+}
+
+// Appends every cached platform pointer to `results` (existing elements are kept).
+void backend::platforms::get(std::vector<platform *> &results) {
+  results.insert(results.end(), cache_.begin(), cache_.end());
+}
+
+// Storage for the static platform cache filled by platforms::init().
+std::vector<driver::platform*> backend::platforms::cache_;
+
+
+/*-----------------------------------*/
+//-----------  Devices --------------*/
+/*-----------------------------------*/
+
+// Fills the device cache on first use by asking each platform to append its
+// devices. Throws std::runtime_error when no device was found.
+void backend::devices::init(std::vector<platform*> const & platforms) {
+  if(cache_.empty()){
+    for(auto it = platforms.begin(); it != platforms.end(); ++it)
+      (*it)->devices(cache_);
+    if(cache_.empty())
+      throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
+  }
+}
+
+// Appends every cached device pointer to `devs` (existing elements are kept).
+void backend::devices::get(std::vector<device*> &devs) {
+  devs.insert(devs.end(), cache_.begin(), cache_.end());
+}
+
+// Storage for the static device cache filled by devices::init().
+std::vector<driver::device*> backend::devices::cache_;
+
+
+
+/*-----------------------------------*/
+//---------- Modules ----------------*/
+/*-----------------------------------*/
+
+// Deletes every cached module, then empties the cache.
+void backend::modules::release(){
+  for(auto it = cache_.begin(); it != cache_.end(); ++it)
+    delete it->second;
+  cache_.clear();
+}
+
+// Module cache keyed by (stream, string); the mapped pointers are deleted by modules::release().
+std::map<std::tuple<driver::stream*, std::string>, driver::module*>  backend::modules::cache_;
+
+/*-----------------------------------*/
+//-----------  Kernels --------------*/
+/*-----------------------------------*/
+
+// Deletes every cached kernel, then empties the cache.
+void backend::kernels::release(){
+  for(auto entry = cache_.begin(); entry != cache_.end(); ++entry)
+    delete entry->second;
+  cache_.clear();
+}
+
+// Returns the kernel cached for (mod, name); on a miss the kernel is created
+// through driver::kernel::create, stored in the cache and returned.
+driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
+  std::tuple<driver::module*, std::string> key(mod, name);
+  auto hit = cache_.find(key);
+  if(hit != cache_.end())
+    return hit->second;
+  // miss: build the kernel first, then remember it
+  driver::kernel* created = driver::kernel::create(mod, name.c_str());
+  cache_.insert({key, created});
+  return created;
+}
+
+// Kernel cache keyed by (module, kernel name); the mapped pointers are deleted by kernels::release().
+std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
+
+/*-----------------------------------*/
+//------------  Streams -------------*/
+/*-----------------------------------*/
+
+// Ensures each given context has a cache entry; a new entry starts with a
+// single stream created through driver::stream::create.
+void backend::streams::init(std::list<driver::context*> const & contexts){
+  for(auto it = contexts.begin(); it != contexts.end(); ++it){
+    driver::context* ctx = *it;
+    if(cache_.count(ctx) != 0)
+      continue;
+    std::vector<driver::stream*> initial(1, driver::stream::create(ctx));
+    cache_.insert(std::make_pair(ctx, initial));
+  }
+}
+
+// Deletes every stream of every cached context, then empties the cache.
+void backend::streams::release(){
+  for(auto entry = cache_.begin(); entry != cache_.end(); ++entry){
+    std::vector<driver::stream*> & owned = entry->second;
+    for(size_t i = 0; i < owned.size(); ++i)
+      delete owned[i];
+  }
+  cache_.clear();
+}
+
+// Returns stream 0 of the default context.
+driver::stream* backend::streams::get_default(){
+  driver::context* ctx = contexts::get_default();
+  return get(ctx, 0);
+}
+
+// Returns stream number `id` of `context`, creating the context's cache entry
+// on demand.
+// Throws std::runtime_error if no entry exists for the context after
+// initialization, and std::out_of_range if `id` is not a valid stream index.
+// (Previously a bare `throw;` with no active exception was used, which calls
+// std::terminate, and the index was not checked.)
+driver::stream* backend::streams::get(driver::context* context, unsigned int id){
+  init(std::list<driver::context*>(1,context));
+  auto it = cache_.find(context);
+  if(it == cache_.end())
+    throw std::runtime_error("Triton: no stream available for the requested context");
+  if(id >= it->second.size())
+    throw std::out_of_range("Triton: stream index out of range");
+  return it->second[id];
+}
+
+// Copies the stream list of `context` into `queues`, creating the context's
+// cache entry on demand.
+void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
+  std::list<driver::context*> single(1, context);
+  init(single);
+  queues = cache_.at(context);
+}
+
+// Per-context stream lists; the stream pointers are deleted by streams::release().
+std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
+
+/*-----------------------------------*/
+//------------  Contexts ------------*/
+/*-----------------------------------*/
+
+// Creates one context per device and appends it to the cache, in device order.
+void backend::contexts::init(std::vector<driver::device*> const & devices){
+  for(auto it = devices.begin(); it != devices.end(); ++it){
+    driver::context* created = driver::context::create(*it);
+    cache_.push_back(created);
+  }
+}
+
+// Deletes every cached context, then empties the cache.
+void backend::contexts::release(){
+  for(auto it = cache_.begin(); it != cache_.end(); ++it)
+    delete *it;
+  cache_.clear();
+}
+
+// Returns the context at position `default_device` in the cache, initializing
+// the backend first if needed.
+// Throws std::out_of_range when `default_device` does not index a cached
+// context; advancing an iterator past end() would be undefined behaviour.
+driver::context* backend::contexts::get_default(){
+  backend::init();
+  if(default_device >= cache_.size())
+    throw std::out_of_range("Triton: default device index is out of range");
+  auto it = cache_.begin();
+  std::advance(it, default_device);
+  return *it;
+}
+
+// Initializes the backend if needed, then overwrites `contexts` with a copy
+// of the cached context list.
+void backend::contexts::get(std::list<driver::context*> & contexts){
+  backend::init();
+  std::list<driver::context*> const & all = cache_;
+  contexts = all;
+}
+
+// Context cache filled by contexts::init(); the pointers are deleted by contexts::release().
+std::list<driver::context*> backend::contexts::cache_;
+
+
+
+/*-----------------------------------*/
+//------------  General -------------*/
+/*-----------------------------------*/
+
+// Calls synchronize() on each stream cached for `context`.
+// std::map::at throws std::out_of_range if the context has no cache entry.
+void backend::synchronize(driver::context* context){
+  std::vector<driver::stream*> const & queues = streams::cache_.at(context);
+  for(size_t i = 0; i < queues.size(); ++i)
+    queues[i]->synchronize();
+}
+
+
+// Tears down the cached driver objects: kernels first, then streams, then
+// contexts.
+// NOTE(review): backend::modules::release() is not called here, so the
+// modules cache is left untouched by this function; confirm whether that is
+// intentional (the commented-out programs::release() line suggests a
+// module-level release used to live here).
+void backend::release(){
+  backend::kernels::release();
+//  backend::programs::release();
+  backend::streams::release();
+  backend::contexts::release();
+}
+
+
+// Initializes platforms, devices, contexts and streams, in that order.
+// Does nothing when the context cache is already populated.
+void backend::init(){
+  if(contexts::cache_.empty()){
+    // each stage consumes the cache filled by the previous one
+    backend::platforms::init();
+    backend::devices::init(platforms::cache_);
+    backend::contexts::init(devices::cache_);
+    streams::init(contexts::cache_);
+  }
+}
+
+// Position in the context cache used by contexts::get_default(); starts at 0.
+unsigned int backend::default_device = 0;
+
+}
+
+}
--- a/lib/driver/buffer.cc
+++ b/lib/driver/buffer.cc
@@ -0,0 +1,103 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "triton/driver/stream.h"
+#include "triton/driver/buffer.h"
+#include "triton/driver/context.h"
+#include "triton/driver/dispatch.h"
+
+
+namespace triton
+{
+
+namespace driver
+{
+
+
+//
+
+// Constructors wrapping an existing backend handle (CUDA device pointer,
+// OpenCL memory object or host buffer). Each forwards the handle and the
+// `take_ownership` flag to polymorphic_resource and records the owning
+// context and the size.
+// NOTE(review): what `take_ownership` implies for handle release is decided
+// by polymorphic_resource, which is not visible here.
+buffer::buffer(driver::context* ctx, size_t size, CUdeviceptr cu, bool take_ownership)
+  : polymorphic_resource(cu, take_ownership), context_(ctx), size_(size) { }
+
+buffer::buffer(driver::context* ctx, size_t size, cl_mem cl, bool take_ownership)
+  : polymorphic_resource(cl, take_ownership), context_(ctx), size_(size) { }
+
+buffer::buffer(driver::context* ctx, size_t size, host_buffer_t hst, bool take_ownership)
+  : polymorphic_resource(hst, take_ownership), context_(ctx), size_(size) { }
+
+
+// Returns the context this buffer was constructed with.
+driver::context* buffer::context() {
+  return context_;
+}
+
+// Returns the size recorded at construction.
+size_t buffer::size() {
+  return size_;
+}
+
+
+// Factory: allocates a buffer of `size` for the backend reported by `ctx`.
+// Throws std::runtime_error for a backend value that is not handled.
+buffer* buffer::create(driver::context* ctx, size_t size) {
+  const auto kind = ctx->backend();
+  if(kind == CUDA)
+    return new cu_buffer(ctx, size);
+  if(kind == OpenCL)
+    return new ocl_buffer(ctx, size);
+  if(kind == Host)
+    return new host_buffer(ctx, size);
+  throw std::runtime_error("unknown backend");
+}
+
+//
+
+// Creates a host buffer backed by a freshly allocated array of `size` chars.
+// NOTE(review): the matching delete[] is not visible in this file; confirm
+// that the host_buffer_t handle (or its owner) releases `data`.
+host_buffer::host_buffer(driver::context *context, size_t size)
+  :  buffer(context, size, host_buffer_t(), true){
+  hst_->data = new char[size];
+}
+
+//
+
+// Creates a read-write OpenCL buffer of `size` in the given context and
+// passes the returned status code to check().
+ocl_buffer::ocl_buffer(driver::context* context, size_t size)
+  : buffer(context, size, cl_mem(), true){
+  cl_int status;
+  *cl_ = dispatch::clCreateBuffer(*context->cl(), CL_MEM_READ_WRITE, size, NULL, &status);
+  check(status);
+}
+
+
+//
+
+// Allocates `size` bytes with cuMemAlloc and stores the pointer in the
+// handle held by the base class.
+cu_buffer::cu_buffer(driver::context* context, size_t size)
+  : buffer(context, size, CUdeviceptr(), true) {
+  // push this buffer's context for the duration of the allocation
+  cu_context::context_switcher ctx_switch(*context_);
+  dispatch::cuMemAlloc(&*cu_, size);
+}
+
+// Wraps an existing CUDA device pointer; all arguments are forwarded to the
+// base buffer constructor and nothing is allocated here.
+cu_buffer::cu_buffer(driver::context* context, size_t size, CUdeviceptr cu, bool take_ownership)
+  : buffer(context, size, cu, take_ownership){
+}
+
+// Enqueues an asynchronous byte-wise memset to 0 of the first `size` bytes
+// of this buffer on the CUDA stream of `queue`.
+void cu_buffer::set_zero(driver::stream* queue, size_t size)
+{
+  // push this buffer's context while the memset is issued
+  cu_context::context_switcher scoped_ctx(*context_);
+  dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
+}
+
+}
+
+}
--- a/lib/driver/context.cc
+++ b/lib/driver/context.cc
@@ -0,0 +1,147 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <cassert>
+#include "triton/driver/context.h"
+#include "triton/driver/module.h"
+#include "triton/tools/sys/getenv.hpp"
+#include "triton/tools/sys/mkdir.hpp"
+
+namespace triton
+{
+
+namespace driver
+{
+
+/* ------------------------ */
+//         BASE             //
+/* ------------------------ */
+
+// Constructors wrapping a backend context handle (CUDA, OpenCL or host).
+// Each forwards the handle and `take_ownership` to polymorphic_resource,
+// records the device, and computes the cache path once via get_cache_path().
+context::context(driver::device *dev, CUcontext cu, bool take_ownership):
+  polymorphic_resource(cu, take_ownership),
+  dev_(dev), cache_path_(get_cache_path()) {
+}
+
+context::context(driver::device *dev, cl_context cl, bool take_ownership):
+  polymorphic_resource(cl, take_ownership),
+  dev_(dev), cache_path_(get_cache_path()){
+}
+
+context::context(driver::device *dev, host_context_t hst, bool take_ownership):
+  polymorphic_resource(hst, take_ownership),
+  dev_(dev), cache_path_(get_cache_path()){
+}
+
+// Factory: builds the context type matching the backend reported by `dev`.
+// Throws std::runtime_error for a backend value that is not handled.
+context* context::create(driver::device *dev){
+  const auto kind = dev->backend();
+  if(kind == CUDA)
+    return new cu_context(dev);
+  if(kind == OpenCL)
+    return new ocl_context(dev);
+  if(kind == Host)
+    return new host_context(dev);
+  throw std::runtime_error("unknown backend");
+}
+
+
+// Returns the device this context was constructed with.
+driver::device* context::device() const {
+  return dev_;
+}
+
+// Picks the directory used for caching.
+// Order of preference: $TRITON_CACHE_PATH, then $HOME/.triton/cache/.
+// A candidate is accepted only when tools::mkpath returns 0 for it; an empty
+// string is returned when neither candidate is usable.
+std::string context::get_cache_path(){
+  // 1) directory explicitly requested by the user
+  std::string user_dir = tools::getenv("TRITON_CACHE_PATH");
+  if(!user_dir.empty() && tools::mkpath(user_dir)==0)
+    return user_dir;
+  // 2) fallback inside the home directory
+  std::string home = tools::getenv("HOME");
+  if(!home.empty()){
+    std::string fallback = home + "/.triton/cache/";
+    if(tools::mkpath(fallback)==0)
+      return fallback;
+  }
+  // 3) no usable directory
+  return "";
+}
+
+// Returns the cache path computed when the context was constructed (may be empty).
+std::string const & context::cache_path() const{
+  return cache_path_;
+}
+
+/* ------------------------ */
+//         Host             //
+/* ------------------------ */
+
+// Builds a host context around a default-constructed host_context_t handle,
+// passing take_ownership = true to the base constructor.
+host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
+
+}
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+
+// Scoped context switcher: the constructor pushes the CUDA context of `ctx`,
+// the destructor pops it again. `ctx` is treated as a cu_context.
+cu_context::context_switcher::context_switcher(const context &ctx)
+  : ctx_(static_cast<const cu_context&>(ctx)) {
+  dispatch::cuCtxPushCurrent_v2(*ctx_.cu());
+}
+
+// Pops the current CUDA context and, in builds with assertions enabled,
+// verifies that it is the one pushed by the constructor.
+cu_context::context_switcher::~context_switcher() {
+  CUcontext popped;
+  dispatch::cuCtxPopCurrent_v2(&popped);
+  assert(popped==*ctx_.cu() && "Switching back to invalid context!");
+}
+
+// Returns the CUdevice associated with `context`: the context is pushed,
+// queried with cuCtxGetDevice, and popped again.
+CUdevice cu_context::get_device_of(CUcontext context){
+  CUdevice dev;
+  dispatch::cuCtxPushCurrent_v2(context);
+  dispatch::cuCtxGetDevice(&dev);
+  dispatch::cuCtxPopCurrent_v2(nullptr);
+  return dev;
+}
+
+// Wraps an existing CUcontext. A cu_device describing the context's device
+// is allocated with `new` (constructed with take_ownership = false) and
+// handed to the base constructor.
+// NOTE(review): nothing in this file deletes that cu_device; confirm who
+// owns it.
+cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
+                                                                                context, take_ownership) {
+}
+
+// Creates a new CUDA context (CU_CTX_SCHED_AUTO) on `device`, then pops the
+// current context. `dev_` is treated as a cu_device.
+cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
+  driver::cu_device* cu_dev = static_cast<driver::cu_device*>(dev_);
+  dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *cu_dev->cu());
+  dispatch::cuCtxPopCurrent_v2(nullptr);
+}
+
+
+/* ------------------------ */
+//         OpenCL           //
+/* ------------------------ */
+
+// Creates an OpenCL context for the single device `dev` and passes the
+// returned status code to check().
+ocl_context::ocl_context(driver::device* dev): context(dev, cl_context(), true) {
+  cl_int status;
+  *cl_ = dispatch::clCreateContext(nullptr, 1, &*dev->cl(), nullptr, nullptr, &status);
+  check(status);
+}
+
+
+
+}
+}
--- a/lib/driver/device.cc
+++ b/lib/driver/device.cc
@@ -0,0 +1,247 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <map>
+#include <algorithm>
+#include <sstream>
+#include <cstring>
+#include <memory>
+#include "triton/driver/device.h"
+#include "triton/driver/context.h"
+#include "triton/codegen/target.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+/* ------------------------ */
+//          Host            //
+/* ------------------------ */
+
+// Returns a newly allocated codegen::cpu_target owned by the caller.
+std::unique_ptr<codegen::target> host_device::make_target() const {
+  std::unique_ptr<codegen::cpu_target> tgt(new codegen::cpu_target());
+  return tgt;
+}
+
+
+/* ------------------------ */
+//         OpenCL           //
+/* ------------------------ */
+
+// Maximum amount of shared memory per block.
+// Not implemented for OpenCL: always throws std::runtime_error. The
+// commented-out line shows the query that used to provide the value.
+size_t ocl_device::max_shared_memory() const {
+  throw std::runtime_error("not implemented");
+//  return ocl::info<CL_DEVICE_LOCAL_MEM_SIZE>(*cl_);
+}
+
+// Maximum number of threads per block.
+// Not implemented for OpenCL: always throws std::runtime_error. The
+// commented-out line shows the query that used to provide the value.
+size_t ocl_device::max_threads_per_block() const {
+  throw std::runtime_error("not implemented");
+//  return ocl::info<CL_DEVICE_MAX_WORK_ITEM_SIZES>(*cl_).at(0);
+}
+
+// Returns a newly allocated codegen::amd_cl_target owned by the caller.
+std::unique_ptr<codegen::target> ocl_device::make_target() const {
+  std::unique_ptr<codegen::amd_cl_target> tgt(new codegen::amd_cl_target());
+  return tgt;
+}
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+
+// Maps a compute capability (sm.first = major, sm.second = minor) to the
+// matching Architecture enumerator, or Architecture::UNKNOWN when the pair
+// is not listed.
+// Every inner switch ends with a default so that an unlisted minor version
+// can never fall through into the cases of a lower major version (before,
+// e.g. 7.2 and 6.2 were reported as SM_5_2 and 7.1 as SM_6_1).
+cu_device::Architecture cu_device::nv_arch(std::pair<unsigned int, unsigned int> sm) const {
+  switch(sm.first) {
+  case 7:
+    switch(sm.second){
+    case 0: return Architecture::SM_7_0;
+    default: return Architecture::UNKNOWN;
+    }
+
+  case 6:
+    switch(sm.second){
+    case 0: return Architecture::SM_6_0;
+    case 1: return Architecture::SM_6_1;
+    default: return Architecture::UNKNOWN;
+    }
+
+  case 5:
+    switch(sm.second){
+    case 0: return Architecture::SM_5_0;
+    case 2: return Architecture::SM_5_2;
+    default: return Architecture::UNKNOWN;
+    }
+
+  case 3:
+    switch(sm.second){
+    case 0: return Architecture::SM_3_0;
+    case 5: return Architecture::SM_3_5;
+    case 7: return Architecture::SM_3_7;
+    default: return Architecture::UNKNOWN;
+    }
+
+  case 2:
+    switch(sm.second){
+    case 0: return Architecture::SM_2_0;
+    case 1: return Architecture::SM_2_1;
+    default: return Architecture::UNKNOWN;
+    }
+
+  default: return Architecture::UNKNOWN;
+  }
+}
+
+// Queries the integer device attribute `attr` for this device through
+// cuDeviceGetAttribute and returns the value written by the call.
+template<CUdevice_attribute attr>
+int cu_device::cuGetInfo() const{
+  int value;
+  dispatch::cuDeviceGetAttribute(&value, attr, *cu_);
+  return value;
+}
+
+// Returns the NVML handle of this device, looked up by its PCI bus id.
+// The handle is queried on every call. The previous version kept a
+// function-local (non-static) map that was re-created empty on each call,
+// so it never cached anything; it has been removed without changing results.
+nvmlDevice_t cu_device::nvml_device() const{
+  const std::string bus_id = pci_bus_id();
+  nvmlDevice_t device;
+  dispatch::nvmlDeviceGetHandleByPciBusId_v2(bus_id.c_str(), &device);
+  return device;
+}
+
+// Architecture enumerator for this device's (possibly overridden) compute capability.
+cu_device::Architecture cu_device::architecture() const{
+  const std::pair<size_t, size_t> cc = compute_capability();
+  return nv_arch(cc);
+}
+
+// Number of address bits, computed as sizeof(size_t)*8 on the host (not
+// queried from the device).
+size_t cu_device::address_bits() const{
+  return sizeof(size_t)*8;
+}
+
+// Device name as reported by cuDeviceGetName (queried into a 128-char buffer).
+std::string cu_device::name() const {
+  const int capacity = 128;
+  char buffer[capacity];
+  dispatch::cuDeviceGetName(buffer, capacity, *cu_);
+  return std::string(buffer);
+}
+
+// PCI bus id string as reported by cuDeviceGetPCIBusId (queried into a 128-char buffer).
+std::string cu_device::pci_bus_id() const{
+  const int capacity = 128;
+  char buffer[capacity];
+  dispatch::cuDeviceGetPCIBusId(buffer, capacity, *cu_);
+  return std::string(buffer);
+}
+
+// Forces the device to be interpreted as a particular compute capability:
+// once set, compute_capability() returns `cc` instead of querying the device.
+void cu_device::interpret_as(std::pair<size_t, size_t> cc){
+  interpreted_as_ = std::make_shared<std::pair<size_t, size_t>>(cc);
+}
+
+// Compute capability as (major, minor). The override installed through
+// interpret_as() wins; otherwise both values are queried from the device.
+std::pair<size_t, size_t> cu_device::compute_capability() const {
+  if(!interpreted_as_){
+    size_t cc_major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
+    size_t cc_minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
+    return std::pair<size_t, size_t>(cc_major, cc_minor);
+  }
+  return *interpreted_as_;
+}
+
+// Maximum number of threads per block
+// (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK; the int result is converted to size_t).
+size_t cu_device::max_threads_per_block() const {
+  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
+}
+
+// Maximum amount of shared memory per block
+// (CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK; the int result is converted to size_t).
+size_t cu_device::max_shared_memory() const {
+  return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK>();
+}
+
+// Warp size (CU_DEVICE_ATTRIBUTE_WARP_SIZE; the int result is converted to size_t).
+size_t cu_device::warp_size() const {
+  return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
+}
+
+
+// Maximum block dimensions, returned as {x, y, z}.
+std::vector<size_t> cu_device::max_block_dim() const {
+  std::vector<size_t> dims;
+  dims.reserve(3);
+  dims.push_back(cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>());
+  dims.push_back(cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>());
+  dims.push_back(cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>());
+  return dims;
+}
+
+// Current SM clock as reported by nvmlDeviceGetClockInfo(NVML_CLOCK_SM).
+size_t cu_device::current_sm_clock() const{
+  unsigned int freq;
+  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &freq);
+  return freq;
+}
+
+// Maximum SM clock as reported by nvmlDeviceGetMaxClockInfo(NVML_CLOCK_SM).
+size_t cu_device::max_sm_clock() const{
+  unsigned int freq;
+  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &freq);
+  return freq;
+}
+
+// Current memory clock as reported by nvmlDeviceGetClockInfo(NVML_CLOCK_MEM).
+size_t cu_device::current_mem_clock() const{
+  unsigned int freq;
+  dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &freq);
+  return freq;
+}
+
+// Maximum memory clock as reported by nvmlDeviceGetMaxClockInfo(NVML_CLOCK_MEM).
+size_t cu_device::max_mem_clock() const{
+  unsigned int freq;
+  dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &freq);
+  return freq;
+}
+
+// Sets the application clocks of this device to the maximum memory and SM
+// clocks reported by NVML.
+void cu_device::set_max_clock() {
+  const size_t mem_clock = max_mem_clock();
+  const size_t sm_clock = max_sm_clock();
+  dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), mem_clock, sm_clock);
+}
+
+// Builds a multi-line, human-readable summary of the device: platform, name,
+// block-size limits and shared-memory size.
+std::string cu_device::infos() const{
+  const std::vector<size_t> dims = max_block_dim();
+  std::ostringstream report;
+  report << "Platform: CUDA" << '\n';
+  report << "Name: " << name() << '\n';
+  report << "Maximum total work-group size: " << max_threads_per_block() << '\n';
+  report << "Maximum individual work-group sizes: "
+         << dims[0] << ", " << dims[1] << ", " << dims[2] << '\n';
+  report << "Local memory size: " << max_shared_memory() << '\n';
+  return report.str();
+}
+
+// Returns a newly allocated codegen::nvidia_cu_target owned by the caller.
+std::unique_ptr<codegen::target> cu_device::make_target() const {
+  std::unique_ptr<codegen::nvidia_cu_target> tgt(new codegen::nvidia_cu_target());
+  return tgt;
+}
+
+
+}
+
+}
+
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -0,0 +1,342 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "triton/driver/dispatch.h"
+#include "triton/driver/context.h"
+#include "triton/tools/sys/getenv.hpp"
+
+namespace triton
+{
+namespace driver
+{
+
+/* Helpers for function definition.
+ * DEFINEn(init, hlib, ret, fname, t1..tn) expands to the definition of
+ * `ret dispatch::fname(t1 a, t2 b, ...)`, whose body forwards every argument to
+ * f_impl<dispatch::init>(hlib, fname, fname_, "fname", a, b, ...):
+ *   init  - dispatch member used as f_impl's template argument
+ *   hlib  - library-handle member handed to f_impl
+ *   fname - wrapped function; `fname ## _` names the matching void* member
+ *           and `#fname` passes the function name as a string
+ * Only the arities 0 to 11, 13 and 19 are provided.
+ */
+#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
+
+#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
+
+#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
+
+#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
+
+#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
+
+#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
+
+#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
+
+#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
+
+#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
+
+#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
+
+#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
+
+#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
+
+#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
+
+#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
+
+/* Specialized helpers for OpenCL.
+ * OCL_DEFINEn(ret, fname, t1..tn) is DEFINEn with init fixed to clinit and
+ * hlib fixed to opencl_; arities 1 to 9 are available.
+ */
+#define OCL_DEFINE1(ret, fname, t1) DEFINE1(clinit, opencl_, ret, fname, t1)
+#define OCL_DEFINE2(ret, fname, t1, t2) DEFINE2(clinit, opencl_, ret, fname, t1, t2)
+#define OCL_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(clinit, opencl_, ret, fname, t1, t2, t3)
+#define OCL_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(clinit, opencl_, ret, fname, t1, t2, t3, t4)
+#define OCL_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5)
+#define OCL_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define OCL_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define OCL_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define OCL_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(clinit, opencl_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+
+/* Specialized helpers for CUDA.
+ * CUDA_DEFINEn(ret, fname, t1..tn) is DEFINEn with init fixed to cuinit and
+ * hlib fixed to cuda_; arities 1 to 11 are available.
+ */
+#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
+#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
+#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
+#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
+#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+
+#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname) // Specialized helpers for NVML: DEFINEn with init = nvmlinit and hlib = nvml_ (arities 0 to 3)
+#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
+#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
+#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
+
+// Opens libOpenCL.so the first time it is needed and reports whether a
+// library handle is available.
+bool dispatch::clinit()
+{
+    if(opencl_ != nullptr)
+        return true;
+    opencl_ = dlopen("libOpenCL.so", RTLD_LAZY);
+    return opencl_ != nullptr;
+}
+
+// Opens the CUDA driver library (libcuda.so, or the path given by the
+// TRITON_LIBCUDA environment variable) and calls cuInit(0).
+// Returns false when the library or its cuInit symbol cannot be obtained;
+// a failing cuInit is reported through check(), which throws.
+bool dispatch::cuinit(){
+  if(cuda_==nullptr){
+    std::string libcuda = tools::getenv("TRITON_LIBCUDA");
+    if(libcuda.empty())
+      cuda_ = dlopen("libcuda.so", RTLD_LAZY);
+    else
+      cuda_ = dlopen(libcuda.c_str(), RTLD_LAZY);
+  }
+  if(cuda_ == nullptr)
+    return false;
+  cuInit_ = dlsym(cuda_, "cuInit");
+  // A missing symbol must not be called through a null function pointer.
+  if(cuInit_ == nullptr)
+    return false;
+  CUresult (*fptr)(unsigned int);
+  *reinterpret_cast<void **>(&fptr) = cuInit_;
+  CUresult res = (*fptr)(0);
+  check(res);
+  return true;
+}
+
+// Opens libnvidia-ml.so the first time it is needed and calls nvmlInit_v2.
+// Returns true when NVML reports success, false when the library or the
+// nvmlInit_v2 symbol is unavailable; other statuses go through check().
+bool dispatch::nvmlinit(){
+  if(nvml_==nullptr)
+    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
+  // Do not look symbols up through a null handle.
+  if(nvml_ == nullptr)
+    return false;
+  nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
+  // A missing symbol must not be called through a null function pointer.
+  if(nvmlInit_v2_ == nullptr)
+    return false;
+  nvmlReturn_t (*fptr)();
+  *reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
+  nvmlReturn_t res = (*fptr)();
+  check(res);
+  // Compare explicitly: converting the status to bool would turn
+  // NVML_SUCCESS (0) into false.
+  return res == NVML_SUCCESS;
+}
+
+// Opens libLLVMSPIRVLib.so the first time it is needed and reports whether a
+// library handle is available.
+bool dispatch::spvllvminit(){
+  if(spvllvm_ != nullptr)
+    return true;
+  spvllvm_ = dlopen("libLLVMSPIRVLib.so", RTLD_LAZY);
+  return spvllvm_ != nullptr;
+}
+
+/* CUDA
+ * Each CUDA_DEFINEn line below defines dispatch::<fname> with the listed
+ * return and parameter types; the generated body forwards its arguments to
+ * f_impl<dispatch::cuinit> together with the cuda_ handle.
+ */
+CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
+CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
+CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
+CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
+CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
+CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
+CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
+CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
+CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
+CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
+
+CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
+CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
+CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
+CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
+CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
+CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
+CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
+CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
+CUDA_DEFINE1(CUresult, cuInit, unsigned int)
+CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
+CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
+CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
+CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
+CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
+CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
+CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
+CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
+CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
+CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
+CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
+CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
+CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
+CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
+CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
+
+NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*) // NVML: each line defines dispatch::<fname>, forwarding to f_impl<dispatch::nvmlinit> with the nvml_ handle
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
+NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
+
+// OpenCL
+// clBuildProgram is written out by hand; all six arguments are handed to
+// f_impl unchanged.
+cl_int dispatch::clBuildProgram(cl_program a, cl_uint b, const cl_device_id * c, const char * d, void (*e)(cl_program, void *), void * f)
+{
+  return f_impl<dispatch::clinit>(opencl_, clBuildProgram, clBuildProgram_, "clBuildProgram",
+                                  a, b, c, d, e, f);
+}
+
+// clCreateContext is written out by hand; all six arguments are handed to
+// f_impl unchanged.
+cl_context dispatch::clCreateContext(const cl_context_properties * a, cl_uint b, const cl_device_id * c, void (*d)(const char *, const void *, size_t, void *), void * e, cl_int * f)
+{
+  return f_impl<dispatch::clinit>(opencl_, dispatch::clCreateContext, dispatch::clCreateContext_, "clCreateContext",
+                                  a, b, c, d, e, f);
+}
+
+OCL_DEFINE9(cl_int, clEnqueueNDRangeKernel, cl_command_queue, cl_kernel, cl_uint, const size_t*, const size_t*, const size_t*,  cl_uint, const cl_event*, cl_event*) // each OCL_DEFINEn line defines dispatch::<fname>, forwarding to f_impl<dispatch::clinit> with the opencl_ handle
+OCL_DEFINE4(cl_int, clSetKernelArg, cl_kernel, cl_uint, size_t, const void *)
+OCL_DEFINE1(cl_int, clReleaseMemObject, cl_mem)
+OCL_DEFINE1(cl_int, clFinish, cl_command_queue)
+OCL_DEFINE5(cl_int, clGetMemObjectInfo, cl_mem, cl_mem_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetCommandQueueInfo, cl_command_queue, cl_command_queue_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseContext, cl_context)
+OCL_DEFINE1(cl_int, clReleaseEvent, cl_event)
+OCL_DEFINE9(cl_int, clEnqueueWriteBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, const void *, cl_uint, const cl_event *, cl_event *)
+OCL_DEFINE9(cl_int, clEnqueueReadBuffer, cl_command_queue, cl_mem, cl_bool, size_t, size_t, void *, cl_uint, const cl_event *, cl_event *)
+OCL_DEFINE6(cl_int, clGetProgramBuildInfo, cl_program, cl_device_id, cl_program_build_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseDevice, cl_device_id)
+OCL_DEFINE5(cl_int, clGetDeviceIDs, cl_platform_id, cl_device_type, cl_uint, cl_device_id *, cl_uint *)
+OCL_DEFINE5(cl_int, clGetContextInfo, cl_context, cl_context_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetDeviceInfo, cl_device_id, cl_device_info, size_t, void *, size_t *)
+OCL_DEFINE1(cl_int, clReleaseCommandQueue, cl_command_queue)
+OCL_DEFINE3(cl_int, clGetPlatformIDs, cl_uint, cl_platform_id *, cl_uint *)
+OCL_DEFINE5(cl_int, clGetPlatformInfo, cl_platform_id, cl_platform_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetEventProfilingInfo, cl_event, cl_profiling_info, size_t, void *, size_t *)
+OCL_DEFINE7(cl_program, clCreateProgramWithBinary, cl_context, cl_uint, const cl_device_id *, const size_t *, const unsigned char **, cl_int *, cl_int *)
+OCL_DEFINE4(cl_command_queue, clCreateCommandQueue, cl_context, cl_device_id, cl_command_queue_properties, cl_int *)
+OCL_DEFINE1(cl_int, clRetainEvent, cl_event)
+OCL_DEFINE1(cl_int, clReleaseProgram, cl_program)
+OCL_DEFINE1(cl_int, clFlush, cl_command_queue)
+OCL_DEFINE5(cl_int, clGetProgramInfo, cl_program, cl_program_info, size_t, void *, size_t *)
+OCL_DEFINE5(cl_int, clGetKernelInfo, cl_kernel, cl_kernel_info, size_t, void *, size_t *)
+OCL_DEFINE6(cl_int, clGetKernelWorkGroupInfo, cl_kernel, cl_device_id, cl_kernel_work_group_info, size_t, void *, size_t *)
+OCL_DEFINE3(cl_kernel, clCreateKernel, cl_program, const char *, cl_int *)
+OCL_DEFINE4(cl_int, clCreateKernelsInProgram, cl_program, cl_uint, cl_kernel*, cl_uint*)
+OCL_DEFINE5(cl_mem, clCreateBuffer, cl_context, cl_mem_flags, size_t, void *, cl_int *)
+OCL_DEFINE5(cl_program, clCreateProgramWithSource, cl_context, cl_uint, const char **, const size_t *, cl_int *)
+OCL_DEFINE1(cl_int, clReleaseKernel, cl_kernel)
+
+// LLVM to SPIR-V
+// Forwards to f_impl with the spvllvm_ handle; the registry travels as a
+// std::ref wrapper.
+int dispatch::initializeLLVMToSPIRVPass(llvm::PassRegistry &registry)
+{
+  return f_impl<dispatch::spvllvminit>(spvllvm_,
+                                       initializeLLVMToSPIRVPass,
+                                       initializeLLVMToSPIRVPass_,
+                                       "initializeLLVMToSPIRVPass",
+                                       std::ref(registry));
+}
+
+// Forwards to f_impl with the spvllvm_ handle; the output stream and the
+// error string travel as std::ref wrappers, the module as a plain pointer.
+bool dispatch::writeSpirv(llvm::Module *M, std::ostream &OS, std::string &ErrMsg)
+{
+  return f_impl<dispatch::spvllvminit>(spvllvm_,
+                                       writeSpirv,
+                                       writeSpirv_,
+                                       "writeSpirv",
+                                       M,
+                                       std::ref(OS),
+                                       std::ref(ErrMsg));
+}
+
+// Release
+// Closes the CUDA library handle when one is open and resets it to null.
+// The OpenCL, NVML and SPIR-V handles are not touched here.
+void dispatch::release(){
+  if(cuda_ == nullptr)
+    return;
+  dlclose(cuda_);
+  cuda_ = nullptr;
+}
+
+// Library handles, one per backend library; null until the matching *init() opens it
+void* dispatch::opencl_ = nullptr;
+void* dispatch::cuda_ = nullptr;
+void* dispatch::nvml_ = nullptr;
+void* dispatch::spvllvm_ = nullptr;
+
+// OpenCL: one void* slot per wrapped entry point, all starting out null
+void* dispatch::clBuildProgram_ = nullptr;
+void* dispatch::clEnqueueNDRangeKernel_ = nullptr;
+void* dispatch::clSetKernelArg_ = nullptr;
+void* dispatch::clReleaseMemObject_ = nullptr;
+void* dispatch::clFinish_ = nullptr;
+void* dispatch::clGetMemObjectInfo_ = nullptr;
+void* dispatch::clGetCommandQueueInfo_ = nullptr;
+void* dispatch::clReleaseContext_ = nullptr;
+void* dispatch::clReleaseEvent_ = nullptr;
+void* dispatch::clEnqueueWriteBuffer_ = nullptr;
+void* dispatch::clEnqueueReadBuffer_ = nullptr;
+void* dispatch::clGetProgramBuildInfo_ = nullptr;
+void* dispatch::clReleaseDevice_ = nullptr;
+void* dispatch::clCreateContext_ = nullptr;
+void* dispatch::clGetDeviceIDs_ = nullptr;
+void* dispatch::clGetContextInfo_ = nullptr;
+void* dispatch::clGetDeviceInfo_ = nullptr;
+void* dispatch::clReleaseCommandQueue_ = nullptr;
+void* dispatch::clGetPlatformIDs_ = nullptr;
+void* dispatch::clGetPlatformInfo_ = nullptr;
+void* dispatch::clGetEventProfilingInfo_ = nullptr;
+void* dispatch::clCreateProgramWithBinary_ = nullptr;
+void* dispatch::clCreateCommandQueue_ = nullptr;
+void* dispatch::clRetainEvent_ = nullptr;
+void* dispatch::clReleaseProgram_ = nullptr;
+void* dispatch::clFlush_ = nullptr;
+void* dispatch::clGetProgramInfo_ = nullptr;
+void* dispatch::clGetKernelInfo_ = nullptr;
+void* dispatch::clGetKernelWorkGroupInfo_ = nullptr;
+void* dispatch::clCreateKernel_ = nullptr;
+void* dispatch::clCreateKernelsInProgram_ = nullptr;
+void* dispatch::clCreateBuffer_ = nullptr;
+void* dispatch::clCreateProgramWithSource_ = nullptr;
+void* dispatch::clReleaseKernel_ = nullptr;
+
+// CUDA: one void* slot per wrapped entry point, all starting out null
+void* dispatch::cuCtxGetCurrent_ = nullptr;
+void* dispatch::cuCtxSetCurrent_ = nullptr;
+void* dispatch::cuCtxDestroy_v2_ = nullptr;
+void* dispatch::cuEventCreate_ = nullptr;
+void* dispatch::cuDeviceGet_ = nullptr;
+void* dispatch::cuMemcpyDtoH_v2_ = nullptr;
+void* dispatch::cuStreamCreate_ = nullptr;
+void* dispatch::cuEventElapsedTime_ = nullptr;
+void* dispatch::cuMemFree_v2_ = nullptr;
+void* dispatch::cuMemcpyDtoHAsync_v2_ = nullptr;
+void* dispatch::cuDriverGetVersion_ = nullptr;
+void* dispatch::cuDeviceGetName_ = nullptr;
+void* dispatch::cuDeviceGetPCIBusId_ = nullptr;
+void* dispatch::cuModuleGetGlobal_v2_ = nullptr;
+
+void* dispatch::cuMemcpyHtoDAsync_v2_ = nullptr;
+void* dispatch::cuModuleLoad_ = nullptr;
+void* dispatch::cuLaunchKernel_ = nullptr;
+void* dispatch::cuModuleUnload_ = nullptr;
+void* dispatch::cuModuleLoadDataEx_ = nullptr;
+void* dispatch::cuDeviceGetAttribute_ = nullptr;
+void* dispatch::cuDeviceGetCount_ = nullptr;
+void* dispatch::cuMemcpyHtoD_v2_ = nullptr;
+void* dispatch::cuInit_ = nullptr;
+void* dispatch::cuEventRecord_ = nullptr;
+void* dispatch::cuCtxCreate_v2_ = nullptr;
+void* dispatch::cuModuleGetFunction_ = nullptr;
+void* dispatch::cuStreamSynchronize_ = nullptr;
+void* dispatch::cuStreamDestroy_v2_ = nullptr;
+void* dispatch::cuEventDestroy_v2_ = nullptr;
+void* dispatch::cuMemAlloc_v2_ = nullptr;
+void* dispatch::cuPointerGetAttribute_ = nullptr;
+void* dispatch::cuCtxGetDevice_ = nullptr;
+void* dispatch::cuMemsetD8Async_ = nullptr;
+void* dispatch::cuCtxPushCurrent_v2_ = nullptr;
+void* dispatch::cuCtxPopCurrent_v2_ = nullptr;
+void* dispatch::cuFuncGetAttribute_ = nullptr;
+void* dispatch::cuFuncSetAttribute_ = nullptr;
+void* dispatch::cuFuncSetCacheConfig_ = nullptr;
+
+// NVML: one void* slot per wrapped entry point, all starting out null
+void* dispatch::nvmlInit_v2_ = nullptr;
+void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_ = nullptr;
+void* dispatch::nvmlDeviceGetClockInfo_ = nullptr;
+void* dispatch::nvmlDeviceGetMaxClockInfo_ = nullptr;
+void* dispatch::nvmlDeviceSetApplicationsClocks_ = nullptr;
+
+// SPIR-V: one void* slot per wrapped entry point, all starting out null
+void* dispatch::initializeLLVMToSPIRVPass_ = nullptr;
+void* dispatch::writeSpirv_ = nullptr;
+
+}
+}
--- a/lib/driver/error.cc
+++ b/lib/driver/error.cc
@@ -0,0 +1,160 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <stdexcept>
+#include <string>
+#include "triton/driver/error.h"
+
+namespace triton
+{
+namespace driver
+{
+
+// Translates a CUDA driver status into the matching exception::cuda exception.
+// CUDA_SUCCESS returns normally; any status without a dedicated case throws
+// unknown().
+void check(CUresult err)
+{
+  using namespace exception::cuda;
+  if(err == CUDA_SUCCESS)
+    return;
+  switch(err)
+  {
+    case CUDA_ERROR_INVALID_VALUE: throw invalid_value();
+    case CUDA_ERROR_OUT_OF_MEMORY: throw out_of_memory();
+    case CUDA_ERROR_NOT_INITIALIZED: throw not_initialized();
+    case CUDA_ERROR_DEINITIALIZED: throw deinitialized();
+    case CUDA_ERROR_PROFILER_DISABLED: throw profiler_disabled();
+    case CUDA_ERROR_PROFILER_NOT_INITIALIZED: throw profiler_not_initialized();
+    case CUDA_ERROR_PROFILER_ALREADY_STARTED: throw profiler_already_started();
+    case CUDA_ERROR_PROFILER_ALREADY_STOPPED: throw profiler_already_stopped();
+    case CUDA_ERROR_NO_DEVICE: throw no_device();
+    case CUDA_ERROR_INVALID_DEVICE: throw invalid_device();
+    case CUDA_ERROR_INVALID_IMAGE: throw invalid_image();
+    case CUDA_ERROR_INVALID_CONTEXT: throw invalid_context();
+    case CUDA_ERROR_CONTEXT_ALREADY_CURRENT: throw context_already_current();
+    case CUDA_ERROR_MAP_FAILED: throw map_failed();
+    case CUDA_ERROR_UNMAP_FAILED: throw unmap_failed();
+    case CUDA_ERROR_ARRAY_IS_MAPPED: throw array_is_mapped();
+    case CUDA_ERROR_ALREADY_MAPPED: throw already_mapped();
+    case CUDA_ERROR_NO_BINARY_FOR_GPU: throw no_binary_for_gpu();
+    case CUDA_ERROR_ALREADY_ACQUIRED: throw already_acquired();
+    case CUDA_ERROR_NOT_MAPPED: throw not_mapped();
+    case CUDA_ERROR_NOT_MAPPED_AS_ARRAY: throw not_mapped_as_array();
+    case CUDA_ERROR_NOT_MAPPED_AS_POINTER: throw not_mapped_as_pointer();
+    case CUDA_ERROR_ECC_UNCORRECTABLE: throw ecc_uncorrectable();
+    case CUDA_ERROR_UNSUPPORTED_LIMIT: throw unsupported_limit();
+    case CUDA_ERROR_CONTEXT_ALREADY_IN_USE: throw context_already_in_use();
+    case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED: throw peer_access_unsupported();
+    case CUDA_ERROR_INVALID_PTX: throw invalid_ptx();
+    case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT: throw invalid_graphics_context();
+    case CUDA_ERROR_INVALID_SOURCE: throw invalid_source();
+    case CUDA_ERROR_FILE_NOT_FOUND: throw file_not_found();
+    case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND: throw shared_object_symbol_not_found();
+    case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED: throw shared_object_init_failed();
+    case CUDA_ERROR_OPERATING_SYSTEM: throw operating_system();
+    case CUDA_ERROR_INVALID_HANDLE: throw invalid_handle();
+    case CUDA_ERROR_NOT_FOUND: throw not_found();
+    case CUDA_ERROR_NOT_READY: throw not_ready();
+    case CUDA_ERROR_ILLEGAL_ADDRESS: throw illegal_address();
+    case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: throw launch_out_of_resources();
+    case CUDA_ERROR_LAUNCH_TIMEOUT: throw launch_timeout();
+    case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING: throw launch_incompatible_texturing();
+    case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED: throw peer_access_already_enabled();
+    case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED: throw peer_access_not_enabled();
+    case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE: throw primary_context_active();
+    case CUDA_ERROR_CONTEXT_IS_DESTROYED: throw context_is_destroyed();
+    case CUDA_ERROR_ASSERT: throw assert_error();
+    case CUDA_ERROR_TOO_MANY_PEERS: throw too_many_peers();
+    case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED: throw host_memory_already_registered();
+    case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED: throw host_memory_not_registered();
+    case CUDA_ERROR_HARDWARE_STACK_ERROR: throw hardware_stack_error();
+    case CUDA_ERROR_ILLEGAL_INSTRUCTION: throw illegal_instruction();
+    case CUDA_ERROR_MISALIGNED_ADDRESS: throw misaligned_address();
+    case CUDA_ERROR_INVALID_ADDRESS_SPACE: throw invalid_address_space();
+    case CUDA_ERROR_INVALID_PC: throw invalid_pc();
+    case CUDA_ERROR_LAUNCH_FAILED: throw launch_failed();
+    case CUDA_ERROR_NOT_PERMITTED: throw not_permitted();
+    case CUDA_ERROR_NOT_SUPPORTED: throw not_supported();
+    case CUDA_ERROR_UNKNOWN:
+    default: throw unknown();
+  }
+}
+
+// Translates an OpenCL status code into the matching exception::ocl exception.
+// CL_SUCCESS returns normally. A code without a dedicated case throws
+// std::runtime_error carrying the numeric value.
+void check(cl_int err)
+{
+    using namespace exception::ocl;
+    switch(err)
+    {
+        case CL_SUCCESS:                        break;
+        case CL_DEVICE_NOT_FOUND:               throw device_not_found();
+        case CL_DEVICE_NOT_AVAILABLE:           throw device_not_available();
+        case CL_COMPILER_NOT_AVAILABLE:         throw compiler_not_available();
+        case CL_MEM_OBJECT_ALLOCATION_FAILURE:  throw mem_object_allocation_failure();
+        case CL_OUT_OF_RESOURCES:               throw out_of_resources();
+        case CL_OUT_OF_HOST_MEMORY:             throw out_of_host_memory();
+        case CL_PROFILING_INFO_NOT_AVAILABLE:   throw profiling_info_not_available();
+        case CL_MEM_COPY_OVERLAP:               throw mem_copy_overlap();
+        case CL_IMAGE_FORMAT_MISMATCH:          throw image_format_mismatch();
+        case CL_IMAGE_FORMAT_NOT_SUPPORTED:     throw image_format_not_supported();
+        case CL_BUILD_PROGRAM_FAILURE:          throw build_program_failure();
+        case CL_MAP_FAILURE:                    throw map_failure();
+
+        case CL_INVALID_VALUE:                  throw invalid_value();
+        case CL_INVALID_DEVICE_TYPE:            throw invalid_device_type();
+        case CL_INVALID_PLATFORM:               throw invalid_platform();
+        case CL_INVALID_DEVICE:                 throw invalid_device();
+        case CL_INVALID_CONTEXT:                throw invalid_context();
+        case CL_INVALID_QUEUE_PROPERTIES:       throw invalid_queue_properties();
+        case CL_INVALID_COMMAND_QUEUE:          throw invalid_command_queue();
+        case CL_INVALID_HOST_PTR:               throw invalid_host_ptr();
+        case CL_INVALID_MEM_OBJECT:             throw invalid_mem_object();
+        case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR: throw invalid_image_format_descriptor();
+        case CL_INVALID_IMAGE_SIZE:             throw invalid_image_size();
+        case CL_INVALID_SAMPLER:                throw invalid_sampler();
+        case CL_INVALID_BINARY:                 throw invalid_binary();
+        case CL_INVALID_BUILD_OPTIONS:          throw invalid_build_options();
+        case CL_INVALID_PROGRAM:                throw invalid_program();
+        case CL_INVALID_PROGRAM_EXECUTABLE:     throw invalid_program_executable();
+        case CL_INVALID_KERNEL_NAME:            throw invalid_kernel_name();
+        case CL_INVALID_KERNEL_DEFINITION:      throw invalid_kernel_definition();
+        case CL_INVALID_KERNEL:                 throw invalid_kernel();
+        case CL_INVALID_ARG_INDEX:              throw invalid_arg_index();
+        case CL_INVALID_ARG_VALUE:              throw invalid_arg_value();
+        case CL_INVALID_ARG_SIZE:               throw invalid_arg_size();
+        case CL_INVALID_KERNEL_ARGS:            throw invalid_kernel_args();
+        case CL_INVALID_WORK_DIMENSION:         throw invalid_work_dimension();
+        case CL_INVALID_WORK_GROUP_SIZE:        throw invalid_work_group_size();
+        case CL_INVALID_WORK_ITEM_SIZE:         throw invalid_work_item_size();
+        case CL_INVALID_GLOBAL_OFFSET:          throw invalid_global_offset();
+        case CL_INVALID_EVENT_WAIT_LIST:        throw invalid_event_wait_list();
+        case CL_INVALID_EVENT:                  throw invalid_event();
+        case CL_INVALID_OPERATION:              throw invalid_operation();
+        case CL_INVALID_GL_OBJECT:              throw invalid_gl_object();
+        case CL_INVALID_BUFFER_SIZE:            throw invalid_buffer_size();
+        case CL_INVALID_MIP_LEVEL:              throw invalid_mip_level();
+        case CL_INVALID_GLOBAL_WORK_SIZE:       throw invalid_global_work_size();
+    #ifdef CL_INVALID_PROPERTY
+        case CL_INVALID_PROPERTY:               throw invalid_property();
+    #endif
+        // A bare `throw;` with no exception being handled calls std::terminate,
+        // so raise a real exception the caller can catch instead.
+        default: throw std::runtime_error("unrecognized OpenCL error code: " + std::to_string(err));
+    }
+}
+
+
+}
+}
+
--- a/lib/driver/event.cc
+++ b/lib/driver/event.cc
@@ -0,0 +1,40 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "triton/driver/event.h"
+
+namespace triton
+{
+namespace driver
+{
+
+// Elapsed time between the two CUDA events held in cu_ (first, then second),
+// as written by cuEventElapsedTime.
+float event::elapsed_time() const{
+  // Start from zero so that a call which leaves the output untouched cannot
+  // hand an indeterminate value back to the caller.
+  float time = 0.0f;
+  dispatch::cuEventElapsedTime(&time, cu_->first, cu_->second);
+  return time;
+}
+
+// Read-only access to the wrapped handle<cu_event_t>.
+handle<cu_event_t> const & event::cu() const
+{
+  return cu_;
+}
+
+}
+}
--- a/lib/driver/handle.cc
+++ b/lib/driver/handle.cc
@@ -0,0 +1,108 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include "triton/driver/handle.h"
+#include "triton/driver/error.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+// Host backend: releasing a handle is a no-op for every type except the
+// buffer, whose `data` array is freed with delete[].
+inline void _delete(host_platform_t) {}
+inline void _delete(host_device_t) {}
+inline void _delete(host_context_t) {}
+inline void _delete(host_module_t) {}
+inline void _delete(host_stream_t) {}
+inline void _delete(host_buffer_t x) {
+  // delete[] on a null pointer does nothing, so no explicit null test is needed
+  delete[] x.data;
+}
+inline void _delete(host_function_t) {}
+
+// OpenCL backend: each object is handed to the matching clRelease* entry point;
+// platform ids have nothing to release.
+inline void _delete(cl_platform_id) {}
+inline void _delete(cl_device_id x) {
+  dispatch::clReleaseDevice(x);
+}
+inline void _delete(cl_context x) {
+  dispatch::clReleaseContext(x);
+}
+inline void _delete(cl_program x) {
+  dispatch::clReleaseProgram(x);
+}
+inline void _delete(cl_kernel x) {
+  dispatch::clReleaseKernel(x);
+}
+inline void _delete(cl_command_queue x) {
+  dispatch::clReleaseCommandQueue(x);
+}
+inline void _delete(cl_mem x) {
+  dispatch::clReleaseMemObject(x);
+}
+
+// CUDA backend: contexts, device memory, streams, events and modules are
+// destroyed through the driver API; devices, functions and the platform tag
+// need no cleanup here.
+inline void _delete(CUcontext x) {
+  dispatch::cuCtxDestroy(x);
+}
+inline void _delete(CUdeviceptr x) {
+  dispatch::cuMemFree(x);
+}
+inline void _delete(CUstream x) {
+  dispatch::cuStreamDestroy(x);
+}
+inline void _delete(CUdevice) {}
+inline void _delete(CUevent x) {
+  dispatch::cuEventDestroy(x);
+}
+inline void _delete(CUfunction) {}
+inline void _delete(CUmodule x) {
+  dispatch::cuModuleUnload(x);
+}
+// An event pair is released by destroying both of its events, first then second.
+inline void _delete(cu_event_t x) {
+  _delete(x.first);
+  _delete(x.second);
+}
+inline void _delete(CUPlatform) {}
+
+//Constructor
+// Stores a heap-allocated copy of the native handle `cu`. `take_ownership`
+// decides whether the destructor is allowed to release the resource.
+template<class T>
+handle<T>::handle(T cu, bool take_ownership)
+  : h_(new T(cu)),
+    has_ownership_(take_ownership) {
+}
+
+// Empty handle: h_ is left default-constructed and nothing is owned.
+template<class T>
+handle<T>::handle()
+  : has_ownership_(false) {
+}
+
+
+// Releases the native resource only when this handle owns it, holds a value,
+// and is the last one referring to it.
+template<class T>
+handle<T>::~handle(){
+  const bool last_owner = has_ownership_ && h_ && h_.unique();
+  if(!last_owner)
+    return;
+  try{
+    _delete(*h_);
+  }
+  catch(const exception::cuda::base&){
+    // Deliberately ignored: the destruction order of global variables is not
+    // guaranteed, so the release may fail during teardown.
+  }
+}
+
+// Explicit instantiations: the member functions of handle<T> are defined in
+// this translation unit, so every handle type used is instantiated here.
+
+// CUDA handle types
+template class handle<CUdeviceptr>;
+template class handle<CUstream>;
+template class handle<CUcontext>;
+template class handle<CUdevice>;
+template class handle<cu_event_t>;
+template class handle<CUfunction>;
+template class handle<CUmodule>;
+template class handle<CUPlatform>;
+
+// OpenCL handle types
+template class handle<cl_platform_id>;
+template class handle<cl_device_id>;
+template class handle<cl_context>;
+template class handle<cl_program>;
+template class handle<cl_command_queue>;
+template class handle<cl_mem>;
+template class handle<cl_kernel>;
+
+// Host handle types
+template class handle<host_platform_t>;
+template class handle<host_device_t>;
+template class handle<host_context_t>;
+template class handle<host_module_t>;
+template class handle<host_stream_t>;
+template class handle<host_buffer_t>;
+template class handle<host_function_t>;
+
+
+}
+}
--- a/lib/driver/kernel.cc
+++ b/lib/driver/kernel.cc
@@ -0,0 +1,152 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <string.h>
+#include "triton/driver/kernel.h"
+#include "triton/driver/buffer.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+
+/* ------------------------ */
+//         Base             //
+/* ------------------------ */
+
+// CUDA flavour: wraps `fn` and records the module the kernel belongs to.
+kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership)
+  : polymorphic_resource(fn, has_ownership),
+    program_(program) {
+}
+
+// OpenCL flavour: wraps `fn` and records the module the kernel belongs to.
+kernel::kernel(driver::module *program, cl_kernel fn, bool has_ownership)
+  : polymorphic_resource(fn, has_ownership),
+    program_(program) {
+}
+
+// Host flavour: wraps `fn` and records the module the kernel belongs to.
+kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership)
+  : polymorphic_resource(fn, has_ownership),
+    program_(program) {
+}
+
+/* Factory: allocates the kernel subclass matching the backend of `program`.
+ * The result is created with `new` and returned as a raw pointer.
+ * Throws std::runtime_error("unknown backend") for any other backend value. */
+kernel* kernel::create(driver::module* program, const char* name) {
+  const auto target = program->backend();
+  if(target == CUDA)
+    return new cu_kernel(program, name);
+  if(target == OpenCL)
+    return new ocl_kernel(program, name);
+  if(target == Host)
+    return new host_kernel(program, name);
+  throw std::runtime_error("unknown backend");
+}
+
+/* The module this kernel was created from (non-owning pointer). */
+driver::module* kernel::module()
+{ return program_; }
+
+/* ------------------------ */
+//         Host             //
+/* ------------------------ */
+
+/* Looks `name` up in the host module's function table and stores the result
+ * in this kernel's handle.
+ * NOTE(review): `functions` is presumably a std::map, in which case at()
+ * throws std::out_of_range for an unknown name - confirm against the header. */
+host_kernel::host_kernel(driver::module* program, const char *name)
+  : kernel(program, host_function_t(), true) {
+  hst_->fn = program->hst()->functions.at(name);
+}
+
+/* Stores a private copy of the `size` bytes at `ptr` as argument `index`.
+ * Both argument tables grow on demand so that `index` is valid. */
+void host_kernel::setArg(unsigned int index, std::size_t size, void* ptr){
+  const unsigned int required = index + 1;
+  if(params_store_.size() < required){
+    params_store_.resize(required);
+    params_.resize(required);
+  }
+  // params_store_ owns the copy (released with free); params_ holds the raw
+  // address handed out by params()
+  void* copy = malloc(size);
+  params_store_[index].reset(copy, free);
+  memcpy(copy, ptr, size);
+  params_[index] = copy;
+}
+
+/* Binds a buffer as argument `index`: the buffer's host data pointer is
+ * passed by value, or a zero std::ptrdiff_t when `buffer` is null. */
+void host_kernel::setArg(unsigned int index, driver::buffer* buffer){
+  if(!buffer){
+    kernel::setArg(index, (std::ptrdiff_t)0);
+    return;
+  }
+  kernel::setArg(index, (void*)buffer->hst()->data);
+}
+
+/* Addresses of the argument copies recorded by setArg, indexed by argument. */
+const std::vector<void *> &host_kernel::params()
+{ return params_; }
+
+/* ------------------------ */
+//         OpenCL           //
+/* ------------------------ */
+
+/* Creates the OpenCL kernel called `name` from `program`.
+ * The kernel name used to be hard-coded to "matmul", silently ignoring the
+ * `name` argument; it is now honoured, like in the CUDA and host kernels.
+ * check() is applied to the error code returned by clCreateKernel. */
+ocl_kernel::ocl_kernel(driver::module* program, const char* name): kernel(program, cl_kernel(), true) {
+  cl_int err;
+  *cl_ = dispatch::clCreateKernel(*program->cl(), name, &err);
+  check(err);
+}
+
+/* Forwards argument `index` (`size` bytes at `ptr`) to clSetKernelArg and
+ * runs check() on the returned status. */
+void ocl_kernel::setArg(unsigned int index, std::size_t size, void* ptr)
+{
+  check(dispatch::clSetKernelArg(*cl_, index, size, ptr));
+}
+
+/* Binds a buffer as argument `index`: its cl_mem is passed to
+ * clSetKernelArg, or a zero std::ptrdiff_t is set when `buffer` is null. */
+void ocl_kernel::setArg(unsigned int index, driver::buffer* buffer) {
+  if(!buffer){
+    kernel::setArg(index, (std::ptrdiff_t)0);
+    return;
+  }
+  check(dispatch::clSetKernelArg(*cl_, index, sizeof(cl_mem), (void*)&*buffer->cl()));
+}
+
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+
+/* Resolves the function `name` inside the CUDA module `program` and reserves
+ * room for 64 arguments in both argument tables. */
+cu_kernel::cu_kernel(driver::module *program, const char * name)
+  : kernel(program, CUfunction(), true) {
+  cu_params_store_.reserve(64);
+  cu_params_.reserve(64);
+  dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
+}
+
+/* Stores a private copy of the `size` bytes at `ptr` as argument `index`.
+ * Both argument tables grow on demand so that `index` is valid. */
+void cu_kernel::setArg(unsigned int index, std::size_t size, void* ptr){
+  const unsigned int required = index + 1;
+  if(cu_params_store_.size() < required){
+    cu_params_store_.resize(required);
+    cu_params_.resize(required);
+  }
+  // cu_params_store_ owns the copy (released with free); cu_params_ holds the
+  // raw address exposed through cu_params()
+  void* copy = malloc(size);
+  cu_params_store_[index].reset(copy, free);
+  memcpy(copy, ptr, size);
+  cu_params_[index] = copy;
+}
+
+/* Binds a buffer as argument `index`: its CUDA device pointer is passed by
+ * value, or a zero std::ptrdiff_t when `data` is null. */
+void cu_kernel::setArg(unsigned int index, driver::buffer* data){
+  if(!data){
+    kernel::setArg(index, (std::ptrdiff_t)0);
+    return;
+  }
+  kernel::setArg(index, *data->cu());
+}
+
+/* Pointer to the contiguous array of argument addresses recorded by setArg. */
+void* const* cu_kernel::cu_params() const {
+  return cu_params_.data();
+}
+
+
+}
+
+}
+
--- a/lib/driver/module.cc
+++ b/lib/driver/module.cc
@@ -0,0 +1,288 @@
+/* Copyright 2015-2017 Philippe Tillet
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files
+* (the "Software"), to deal in the Software without restriction,
+* including without limitation the rights to use, copy, modify, merge,
+* publish, distribute, sublicense, and/or sell copies of the Software,
+* and to permit persons to whom the Software is furnished to do so,
+* subject to the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+#include <fstream>
+#include <memory>
+#include "triton/driver/module.h"
+#include "triton/driver/context.h"
+#include "triton/driver/error.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+namespace triton
+{
+namespace driver
+{
+
+/* ------------------------ */
+//         Base             //
+/* ------------------------ */
+
+/* Registers every LLVM target (infos, targets, MC layers, asm parsers and
+ * printers) the first time it is called; later calls return immediately.
+ * The guard is a plain static bool. */
+void module::init_llvm() {
+  static bool init = false;
+  if(init)
+    return;
+  llvm::InitializeAllTargetInfos();
+  llvm::InitializeAllTargets();
+  llvm::InitializeAllTargetMCs();
+  llvm::InitializeAllAsmParsers();
+  llvm::InitializeAllAsmPrinters();
+  init = true;
+}
+
+// CUDA flavour: wraps `mod` and records the context it lives in.
+module::module(driver::context* ctx, CUmodule mod, bool has_ownership)
+  : polymorphic_resource(mod, has_ownership),
+    ctx_(ctx)
+{ }
+
+// OpenCL flavour: wraps `mod` and records the context it lives in.
+module::module(driver::context* ctx, cl_program mod, bool has_ownership)
+  : polymorphic_resource(mod, has_ownership),
+    ctx_(ctx)
+{ }
+
+// Host flavour: wraps `mod` and records the context it lives in.
+module::module(driver::context* ctx, host_module_t mod, bool has_ownership)
+  : polymorphic_resource(mod, has_ownership),
+    ctx_(ctx)
+{ }
+
+/* The context this module was created in (non-owning pointer). */
+driver::context* module::context() const
+{ return ctx_; }
+
+/* Factory: builds the module subclass matching the backend of `ctx`, giving
+ * it ownership of the LLVM module `src`. The result is created with `new`.
+ * Throws std::runtime_error("unknown backend") for any other backend value. */
+module* module::create(driver::context* ctx, std::unique_ptr<llvm::Module> src) {
+  const auto target = ctx->backend();
+  if(target == CUDA)
+    return new cu_module(ctx, std::move(src));
+  if(target == OpenCL)
+    return new ocl_module(ctx, std::move(src));
+  if(target == Host)
+    return new host_module(ctx, std::move(src));
+  throw std::runtime_error("unknown backend");
+}
+
+/* Runs LLVM code generation on `module` and appends the output to `buffer`.
+ *
+ *   module   - IR to compile; consumed by this call
+ *   triple   - target triple, also written into the module
+ *   proc     - target processor name passed to the target machine
+ *   layout   - data layout string; when empty the target machine's own
+ *              layout is used
+ *   buffer   - receives the emitted bytes
+ *   features - target feature string
+ *   ft       - Object selects an object file, anything else assembly text
+ *
+ * Throws std::runtime_error when the triple has no registered target, when
+ * no target machine can be created, or when the target cannot emit the
+ * requested file type. Previously these cases dereferenced a null pointer
+ * or were ignored, and the target machine was never freed.
+ */
+void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
+                                 const std::string &proc, std::string layout,
+                                 llvm::SmallVectorImpl<char> &buffer,
+                                 const std::string& features,
+                                 file_type_t ft) {
+  init_llvm();
+  // debug
+//  llvm::legacy::PassManager pm;
+//  pm.add(llvm::createPrintModulePass(llvm::outs()));
+//  pm.add(llvm::createVerifierPass());
+//  pm.run(*module);
+  // create machine
+  module->setTargetTriple(triple);
+  std::string error;
+  auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  // lookupTarget returns null and fills `error` when the triple is unknown
+  if(!target)
+    throw std::runtime_error("cannot find LLVM target for triple '" + triple + "': " + error);
+  llvm::TargetOptions opt;
+  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.UnsafeFPMath = false;
+  opt.NoInfsFPMath = false;
+  opt.NoNaNsFPMath = true;
+  // Declared before the pass manager so that it is destroyed after it.
+  std::unique_ptr<llvm::TargetMachine> machine(
+      target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
+                                  llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive));
+  if(!machine)
+    throw std::runtime_error("cannot create LLVM target machine for triple '" + triple + "'");
+  // set data layout
+  if(layout.empty())
+    module->setDataLayout(machine->createDataLayout());
+  else
+    module->setDataLayout(layout);
+  // every function of the module is marked always-inline before code generation
+  for (llvm::Function &f : module->functions())
+    f.addFnAttr(llvm::Attribute::AlwaysInline);
+  llvm::legacy::PassManager pass;
+  llvm::raw_svector_ostream stream(buffer);
+  // convert triton file type to llvm file type
+  const auto ll_file_type = (ft == Object) ? llvm::TargetMachine::CGFT_ObjectFile
+                                           : llvm::TargetMachine::CGFT_AssemblyFile;
+  // emit; addPassesToEmitFile returns true when the file type is unsupported
+  if(machine->addPassesToEmitFile(pass, stream, nullptr, ll_file_type))
+    throw std::runtime_error("LLVM target cannot emit the requested file type");
+  pass.run(*module);
+}
+
+
+/* ------------------------ */
+//        Host              //
+/* ------------------------ */
+
+/* Builds a host (JIT) module from the LLVM module `src`.
+ *
+ * A wrapper function "main(i8** args, i32, i32, i32)" is appended to `src`.
+ * It loads each leading argument of the function named "matmul" through the
+ * `args` array, forwards its own three i32 parameters as the last three
+ * arguments, and calls "matmul". An execution engine is then created for the
+ * module and stored in the handle, together with a name -> llvm::Function*
+ * table of every function in the module.
+ *
+ * Throws std::runtime_error when `src` has no function named "matmul", when
+ * that function takes fewer than three parameters, or when the execution
+ * engine cannot be created. Previously these cases led to a null dereference
+ * or an unsigned underflow.
+ * NOTE(review): the wrapped kernel name is still hard-coded to "matmul".
+ */
+host_module::host_module(driver::context * context, std::unique_ptr<llvm::Module> src): module(context, host_module_t(), true) {
+  init_llvm();
+  // host info
+//  std::string triple = llvm::sys::getDefaultTargetTriple();
+//  std::string cpu = llvm::sys::getHostCPUName();
+//  llvm::SmallVector<char, 0> buffer;
+//  module::compile_llvm_module(src, triple, cpu, "", buffer, "", Assembly);
+
+  // create kernel wrapper
+  llvm::LLVMContext &ctx = src->getContext();
+  llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
+  llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
+  llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
+  llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, {args_ty, int32_ty, int32_ty, int32_ty}, false);
+  llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "main", &*src);
+  llvm::Function* fn = src->getFunction("matmul");
+  if(!fn)
+    throw std::runtime_error("host module does not define a function named 'matmul'");
+  llvm::FunctionType *fn_ty = fn->getFunctionType();
+  // the last three parameters receive the wrapper's i32 arguments
+  if(fn_ty->getNumParams() < 3)
+    throw std::runtime_error("host kernel must take at least three parameters");
+  std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
+  std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
+  llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
+  llvm::IRBuilder<> ir_builder(ctx);
+  ir_builder.SetInsertPoint(entry);
+  // address of the i-th slot of the args array
+  for(unsigned i = 0; i < ptrs.size(); i++)
+    ptrs[i] = ir_builder.CreateGEP(main->arg_begin(), ir_builder.getInt32(i));
+  // each slot points to the argument's value: load the slot, then the value
+  for(unsigned i = 0; i < ptrs.size(); i++){
+    llvm::Value* addr = ir_builder.CreateBitCast(ir_builder.CreateLoad(ptrs[i]), fn_ty->getParamType(i)->getPointerTo());
+    fn_args[i] = ir_builder.CreateLoad(addr);
+  }
+  fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
+  fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
+  fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
+  ir_builder.CreateCall(fn, fn_args);
+  ir_builder.CreateRetVoid();
+
+
+  // create execution engine
+  for(llvm::Function& f: src->functions())
+    hst_->functions[f.getName()] = &f;
+  llvm::EngineBuilder builder(std::move(src));
+  builder.setErrorStr(&hst_->error);
+  builder.setMCJITMemoryManager(llvm::make_unique<llvm::SectionMemoryManager>());
+  builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
+  builder.setEngineKind(llvm::EngineKind::JIT);
+  builder.setUseOrcMCJITReplacement(true);
+  hst_->engine = builder.create();
+  // create() returns null on failure and reports the reason through setErrorStr
+  if(!hst_->engine)
+    throw std::runtime_error("cannot create LLVM execution engine: " + hst_->error);
+}
+
+/* Symbol lookup is not available on the host backend: always throws
+ * std::runtime_error("not implemented"). */
+std::unique_ptr<buffer> host_module::symbol(const char *name) const
+{
+  throw std::runtime_error("not implemented");
+}
+
+
+/* ------------------------ */
+//         OpenCL           //
+/* ------------------------ */
+
+/* OpenCL modules are currently disabled: the constructor always throws
+ * std::runtime_error("not supported"). The commented-out body below is the
+ * former implementation (compile for an AMD GPU target, link with ld.lld,
+ * then load the binary with clCreateProgramWithBinary / clBuildProgram). */
+ocl_module::ocl_module(driver::context * context, std::unique_ptr<llvm::Module> src): module(context, cl_program(), true) {
+  throw std::runtime_error("not supported");
+//  init_llvm();
+//  llvm::SmallVector<char, 0> buffer;
+//  module::compile_llvm_module(src, "amdgcn-amd-amdhsa-amdgizcl", "gfx902", "", buffer, "code-object-v3", Object);
+//  std::ofstream output("/tmp/tmp.o", std::ios::binary);
+//  std::copy(buffer.begin(), buffer.end(), std::ostreambuf_iterator<char>(output));
+//  system("ld.lld-8 /tmp/tmp.o -shared -o /tmp/tmp.o");
+//  std::ifstream input("/tmp/tmp.o", std::ios::in | std::ios::binary );
+//  std::vector<unsigned char> in_buffer(std::istreambuf_iterator<char>(input), {});
+//  size_t sizes[] = {in_buffer.size()};
+//  const unsigned char* data[] = {(unsigned char*)in_buffer.data()};
+//  cl_int status;
+//  cl_int err;
+//  *cl_ = dispatch::clCreateProgramWithBinary(*context->cl(), 1, &*context->device()->cl(), sizes, data, &status, &err);
+//  check(status);
+//  check(err);
+//  try{
+//  dispatch::clBuildProgram(*cl_, 1, &*context->device()->cl(), NULL, NULL, NULL);
+//  }
+//  catch(...){
+//  char log[2048];
+//  dispatch::clGetProgramBuildInfo(*cl_, *context->device()->cl(), CL_PROGRAM_BUILD_LOG, 1024, log, NULL);
+//  throw;
+//  }
+}
+
+/* Symbol lookup is not available on the OpenCL backend: always throws
+ * std::runtime_error("not implemented"). */
+std::unique_ptr<buffer> ocl_module::symbol(const char *name) const
+{
+  throw std::runtime_error("not implemented");
+}
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+/* Replaces, in `str`, the span running from the first occurrence of `begin`
+ * through the first character of the next occurrence of `end` with `target`.
+ *
+ * Returns false (leaving `str` untouched) when `begin` does not occur.
+ * When `end` does not occur after `begin`, everything from `begin` to the
+ * end of the string is replaced. The length used to be derived from
+ * `npos + 1`, which wrapped to a zero-length replacement for a match at
+ * index 0: the string was left unchanged while true was returned, making
+ * `while(find_and_replace(...))` loops spin forever.
+ */
+static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
+  const size_t start_replace = str.find(begin);
+  if(start_replace == std::string::npos)
+    return false;
+  const size_t end_replace = str.find(end, start_replace);
+  // npos as a count means "up to the end of the string" for std::string::replace
+  const size_t count = (end_replace == std::string::npos)
+                         ? std::string::npos
+                         : end_replace + 1 - start_replace;
+  str.replace(start_replace, count, target);
+  return true;
+}
+
+/* Compiles `module` to PTX text for `device` and returns it.
+ * After code generation the ".version" line is rewritten to ".version 6.4"
+ * and the "// begin inline asm" / "// end inline asm" marker lines are
+ * stripped from the output. */
+std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module, driver::device* device) {
+  // switch the "nvptx-short-ptr" command-line option on
+  auto registered = llvm::cl::getRegisteredOptions();
+  auto* short_ptr_flag = static_cast<llvm::cl::opt<bool>*>(registered["nvptx-short-ptr"]);
+  assert(short_ptr_flag);
+  short_ptr_flag->setValue(true);
+  // processor name: "sm_" followed by the two compute-capability numbers
+  const auto capability = ((driver::cu_device*)device)->compute_capability();
+  const std::string processor = "sm_" + std::to_string(capability.first) + std::to_string(capability.second);
+  // generate assembly text
+  llvm::SmallVector<char, 0> ptx_chars;
+  module::compile_llvm_module(std::move(module), "nvptx64-nvidia-cuda", processor, "", ptx_chars, "ptx63", Assembly);
+  std::string ptx(ptx_chars.begin(), ptx_chars.end());
+  // post-process the generated text
+  find_and_replace(ptx, ".version", "\n", ".version 6.4\n");
+  while(find_and_replace(ptx, "\t// begin inline asm", "\n", "")) { }
+  while(find_and_replace(ptx, "\t// end inline asm", "\n", "")) { }
+  return ptx;
+}
+
+/* Compiles `ll_module` to PTX for the device of `context`, then delegates to
+ * the constructor that loads a module from source text. */
+cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module)
+  : cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) {
+}
+
+/* Loads `source` into the CUDA context `context` with cuModuleLoadDataEx.
+ * An 8096-byte error-log buffer is handed to the JIT; if loading throws a
+ * CUDA exception, the log is printed to std::cerr and the exception is
+ * rethrown. */
+cu_module::cu_module(driver::context * context, std::string const & source)
+  : module(context, CUmodule(), true), source_(source) {
+  cu_context::context_switcher ctx(*context);
+  // error-log storage handed to the JIT compiler
+  unsigned int log_capacity = 8096;
+  std::string log(log_capacity, 0);
+  // option keys and their values, in matching order
+  CUjit_option jit_keys[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
+  void* jit_values[] = {(void*)(uintptr_t)log_capacity, (void*)log.data()};
+  try{
+    dispatch::cuModuleLoadDataEx(&*cu_, source_.data(), 2, jit_keys, jit_values);
+  }
+  catch(exception::cuda::base const &){
+    std::cerr << "Compilation Failed! Log: " << std::endl;
+    std::cerr << log << std::endl;
+    throw;
+  }
+}
+
+/* Looks up the global `name` in this module with cuModuleGetGlobal_v2 and
+ * wraps its address and size in a cu_buffer created with ownership disabled. */
+std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
+  CUdeviceptr address;
+  size_t bytes;
+  dispatch::cuModuleGetGlobal_v2(&address, &bytes, *cu_, name);
+  return std::unique_ptr<buffer>(new cu_buffer(ctx_, bytes, address, false));
+}
+
+
+}
+}
+
--- a/lib/driver/platform.cc
+++ b/lib/driver/platform.cc
@@ -0,0 +1,89 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <string>
+#include "triton/driver/platform.h"
+#include "triton/driver/device.h"
+
+
+namespace triton
+{
+namespace driver
+{
+
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+
+/* The integer reported by cuDriverGetVersion, formatted as a decimal string. */
+std::string cu_platform::version() const{
+  int driver_version;
+  dispatch::cuDriverGetVersion(&driver_version);
+  return std::to_string(driver_version);
+}
+
+/* Appends one newly allocated cu_device per CUDA device ordinal to `devices`.
+ * The objects are created with `new` and pushed as raw pointers. */
+void cu_platform::devices(std::vector<device *> &devices) const{
+  int count;
+  dispatch::cuDeviceGetCount(&count);
+  for(int ordinal = 0; ordinal < count; ++ordinal){
+    CUdevice native;
+    dispatch::cuDeviceGet(&native, ordinal);
+    devices.push_back(new driver::cu_device(native));
+  }
+}
+
+/* ------------------------ */
+//        OpenCL            //
+/* ------------------------ */
+
+/* Returns the CL_PLATFORM_VERSION string of this platform.
+ * The value is queried in two steps: first its size, then its contents.
+ * OpenCL reports the size including the terminating NUL, so trailing NUL
+ * characters are stripped from the result. Previously the NUL stayed
+ * embedded in the std::string, and a reported size of 0 dereferenced
+ * result.begin() on an empty string.
+ * check() is applied to the status of both queries. */
+std::string cl_platform::version() const {
+  size_t size = 0;
+  check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, 0, nullptr, &size));
+  std::string result(size, 0);
+  // &result[0] is valid even for an empty string
+  check(dispatch::clGetPlatformInfo(*cl_, CL_PLATFORM_VERSION, size, (void*)&result[0], nullptr));
+  while(!result.empty() && result.back() == '\0')
+    result.pop_back();
+  return result;
+}
+
+/* Appends one newly allocated ocl_device per GPU device of this platform to
+ * `devices`. The ids are queried in two steps (count, then values) and
+ * check() is applied to both statuses. */
+void cl_platform::devices(std::vector<device*> &devices) const{
+  cl_uint count;
+  check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, 0, nullptr, &count));
+  std::vector<cl_device_id> found(count);
+  check(dispatch::clGetDeviceIDs(*cl_, CL_DEVICE_TYPE_GPU, count, found.data(), nullptr));
+  for(std::size_t i = 0; i < found.size(); ++i)
+    devices.push_back(new driver::ocl_device(found[i]));
+}
+
+/* ------------------------ */
+//        Host              //
+/* ------------------------ */
+
+/* The host platform reports a fixed version string. */
+std::string host_platform::version() const
+{ return "1.0"; }
+
+/* Appends a single newly allocated host_device to `devices`. */
+void host_platform::devices(std::vector<driver::device*> &devices) const
+{
+  devices.push_back(new driver::host_device());
+}
+
+
+}
+}
--- a/lib/driver/stream.cc
+++ b/lib/driver/stream.cc
@@ -0,0 +1,181 @@
+/* Copyright 2015-2017 Philippe Tillet
+* 
+* Permission is hereby granted, free of charge, to any person obtaining 
+* a copy of this software and associated documentation files 
+* (the "Software"), to deal in the Software without restriction, 
+* including without limitation the rights to use, copy, modify, merge, 
+* publish, distribute, sublicense, and/or sell copies of the Software, 
+* and to permit persons to whom the Software is furnished to do so, 
+* subject to the following conditions:
+* 
+* The above copyright notice and this permission notice shall be 
+* included in all copies or substantial portions of the Software.
+* 
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#include <cassert>
+#include <array>
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/driver/context.h"
+#include "triton/driver/device.h"
+#include "triton/driver/event.h"
+#include "triton/driver/kernel.h"
+#include "triton/driver/buffer.h"
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/GenericValue.h"
+
+namespace triton
+{
+
+namespace driver
+{
+
+/* ------------------------ */
+//         Base             //
+/* ------------------------ */
+
+// CUDA flavour: wraps `cu` and records the context the stream belongs to.
+stream::stream(driver::context *ctx, CUstream cu, bool has_ownership)
+  : polymorphic_resource(cu, has_ownership),
+    ctx_(ctx)
+{ }
+
+// OpenCL flavour: wraps `cl` and records the context the stream belongs to.
+stream::stream(driver::context *ctx, cl_command_queue cl, bool has_ownership)
+  : polymorphic_resource(cl, has_ownership),
+    ctx_(ctx)
+{ }
+
+// Host flavour: wraps `cl` and records the context the stream belongs to.
+stream::stream(driver::context *ctx, host_stream_t cl, bool has_ownership)
+  : polymorphic_resource(cl, has_ownership),
+    ctx_(ctx)
+{ }
+
+/* Factory: allocates the stream subclass matching the backend of `ctx`.
+ * The result is created with `new` and returned as a raw pointer.
+ * Throws std::runtime_error("unknown backend") for any other backend value. */
+driver::stream* stream::create(driver::context* ctx) {
+  const auto target = ctx->backend();
+  if(target == CUDA)
+    return new cu_stream(ctx);
+  if(target == OpenCL)
+    return new cl_stream(ctx);
+  if(target == Host)
+    return new host_stream(ctx);
+  throw std::runtime_error("unknown backend");
+}
+
+/* The context this stream was created in (non-owning pointer). */
+driver::context* stream::context() const
+{ return ctx_; }
+
+/* ------------------------ */
+//          Host            //
+/* ------------------------ */
+
+/* Host streams carry no native state beyond a default host_stream_t. */
+host_stream::host_stream(driver::context *ctx)
+  : stream(ctx, host_stream_t(), true) {
+}
+
+/* Nothing to wait for: host_stream::enqueue runs the kernel before returning. */
+void host_stream::synchronize() {
+}
+
+/* Runs `kernel` on the calling thread: the JIT-compiled "main" wrapper of the
+ * kernel's module is called once for every (i, j, k) index of `grid`, with
+ * the kernel's argument table as first parameter.
+ * `block`, the wait list and `event` are not used by this backend. */
+void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<event> const *, event* event) {
+  typedef void (*entry_point_t)(char**, int32_t, int32_t, int32_t);
+  driver::host_kernel* hst_kernel = (host_kernel*)kernel;
+  llvm::ExecutionEngine* engine = kernel->module()->hst()->engine;
+  entry_point_t entry = (entry_point_t)engine->getFunctionAddress("main");
+  for(size_t i = 0; i < grid[0]; i++){
+    for(size_t j = 0; j < grid[1]; j++){
+      for(size_t k = 0; k < grid[2]; k++){
+        char** args = (char**)hst_kernel->params().data();
+        entry(args, int32_t(i), int32_t(j), int32_t(k));
+      }
+    }
+  }
+}
+
+/* Copies `size` bytes from `ptr` into `buffer`, starting `offset` bytes into
+ * the buffer's host storage. The copy is a synchronous memcpy, so `blocking`
+ * has no effect.
+ * `offset` used to be ignored (data always landed at the start of the
+ * buffer); it is now honoured, matching the CUDA and OpenCL streams. */
+void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
+  char* dst = (char*)buffer->hst()->data + offset;
+  std::memcpy((void*)dst, ptr, size);
+}
+
+/* Copies `size` bytes out of `buffer` into `ptr`, starting `offset` bytes
+ * into the buffer's host storage. The copy is a synchronous memcpy, so
+ * `blocking` has no effect.
+ * `offset` used to be ignored (data was always read from the start of the
+ * buffer); it is now honoured, matching the CUDA and OpenCL streams. */
+void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
+  const char* src = (const char*)buffer->hst()->data + offset;
+  std::memcpy(ptr, (const void*)src, size);
+}
+
+
+/* ------------------------ */
+//         OpenCL           //
+/* ------------------------ */
+
+/* Creates a command queue (properties = 0) on the device of `ctx`; check()
+ * is applied to the error code returned by clCreateCommandQueue. */
+cl_stream::cl_stream(driver::context *ctx)
+  : stream(ctx, cl_command_queue(), true) {
+  cl_int status;
+  *cl_ = dispatch::clCreateCommandQueue(*ctx->cl(), *ctx->device()->cl(), 0, &status);
+  check(status);
+}
+
+/* Calls clFinish on the command queue and runs check() on its status. */
+void cl_stream::synchronize()
+{
+  check(dispatch::clFinish(*cl_));
+}
+
+/* Enqueues `kernel` as a 3-dimensional ND-range. The global work size of
+ * each dimension is grid[d] * block[d]; `block` is the local work size.
+ * The wait list and `event` are not used by this backend. */
+void cl_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<event> const *, event* event) {
+  std::array<size_t, 3> global;
+  for(size_t d = 0; d < global.size(); ++d)
+    global[d] = grid[d]*block[d];
+  const size_t* global_sizes = global.data();
+  const size_t* local_sizes = block.data();
+  check(dispatch::clEnqueueNDRangeKernel(*cl_, *kernel->cl(), grid.size(), NULL, global_sizes, local_sizes, 0, NULL, NULL));
+}
+
+/* Enqueues a copy of `size` bytes from `ptr` into `buffer` at byte `offset`;
+ * `blocking` is forwarded as the blocking flag of clEnqueueWriteBuffer. */
+void cl_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
+  const cl_bool wait = blocking ? CL_TRUE : CL_FALSE;
+  check(dispatch::clEnqueueWriteBuffer(*cl_, *buffer->cl(), wait, offset, size, ptr, 0, NULL, NULL));
+}
+
+/* Enqueues a copy of `size` bytes from `buffer` at byte `offset` into `ptr`;
+ * `blocking` is forwarded as the blocking flag of clEnqueueReadBuffer. */
+void cl_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
+  const cl_bool wait = blocking ? CL_TRUE : CL_FALSE;
+  check(dispatch::clEnqueueReadBuffer(*cl_, *buffer->cl(), wait, offset, size, ptr, 0, NULL, NULL));
+}
+
+/* ------------------------ */
+//         CUDA             //
+/* ------------------------ */
+
+/* The CUDA context reported by cuCtxGetCurrent. */
+inline CUcontext get_context() {
+  CUcontext current;
+  dispatch::cuCtxGetCurrent(&current);
+  return current;
+}
+
+/* Wraps an existing CUDA stream. The stream is associated with the context
+ * returned by cuCtxGetCurrent, imported through backend::contexts. */
+cu_stream::cu_stream(CUstream str, bool take_ownership)
+  : stream(backend::contexts::import(get_context()), str, take_ownership)
+{ }
+
+/* Creates a new CUDA stream (flags = 0) while `context` is switched in. */
+cu_stream::cu_stream(driver::context *context)
+  : stream((driver::cu_context*)context, CUstream(), true) {
+  cu_context::context_switcher scoped_ctx(*ctx_);
+  dispatch::cuStreamCreate(&*cu_, 0);
+}
+
+/* Calls cuStreamSynchronize on this stream while its context is switched in. */
+void cu_stream::synchronize() {
+  cu_context::context_switcher scoped_ctx(*ctx_);
+  dispatch::cuStreamSynchronize(*cu_);
+}
+
+/* Launches `kernel` on this stream with the given grid and block dimensions,
+ * zero bytes of dynamic shared memory, and the arguments recorded on the
+ * kernel. When `event` is non-null its first event is recorded on the stream
+ * before the launch and its second one after it. The wait list is not used. */
+void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, std::vector<event> const *, event* event) {
+  driver::cu_kernel* launched = (driver::cu_kernel*)kernel;
+  cu_context::context_switcher scoped_ctx(*ctx_);
+  const bool timed = event;
+  if(timed)
+    dispatch::cuEventRecord(event->cu()->first, *cu_);
+  dispatch::cuLaunchKernel(*kernel->cu(),
+                           grid[0], grid[1], grid[2],
+                           block[0], block[1], block[2],
+                           0, *cu_, (void**)launched->cu_params(), NULL);
+  if(timed)
+    dispatch::cuEventRecord(event->cu()->second, *cu_);
+}
+
+/* Copies `size` bytes from host memory `ptr` to `buffer` at byte `offset`.
+ * A blocking write uses cuMemcpyHtoD; otherwise cuMemcpyHtoDAsync is issued
+ * on this stream. */
+void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
+  cu_context::context_switcher scoped_ctx(*ctx_);
+  if(!blocking){
+    dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
+    return;
+  }
+  dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
+}
+
+/* Copies `size` bytes from `buffer` at byte `offset` to host memory `ptr`.
+ * A blocking read uses cuMemcpyDtoH; otherwise cuMemcpyDtoHAsync is issued
+ * on this stream. */
+void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
+  cu_context::context_switcher scoped_ctx(*ctx_);
+  if(!blocking){
+    dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
+    return;
+  }
+  dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
+}
+
+
+}
+
+}