[GENERAL] Removed deprecated driver files and added basic compatibility with ROCm (#268)
- Removed the driver module -- the accelerator runtime is now handled by PyTorch.
- Added basic ROCm support based on @micmelesse's PR -- an empty kernel can now be executed on AMD devices without any compile-time changes.
- PREFER_SHARED is now used only for kernels whose shared-memory usage exceeds 49K; enabling it unconditionally can cause poor L1 performance for broadcast tensors.
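A minimal sketch of the shared-memory heuristic described above, assuming the standard CUDA driver API calls; the helper name and the exact way the 49K threshold is applied are illustrative, not taken from this diff:

```cpp
#include <cuda.h>
#include <cstddef>

// Hypothetical helper: only opt a kernel into a PREFER_SHARED cache split when its
// shared-memory footprint exceeds the default 48 KB (49152-byte) carve-out.
static void set_cache_config(CUfunction fn, std::size_t shared_mem_bytes) {
  constexpr std::size_t kThreshold = 49152;  // the "49k" mentioned in the commit message
  if (shared_mem_bytes > kThreshold) {
    // Large shared-memory kernels need the opt-in limit raised and L1 traded for shared memory.
    cuFuncSetAttribute(fn, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
                       static_cast<int>(shared_mem_bytes));
    cuFuncSetCacheConfig(fn, CU_FUNC_CACHE_PREFER_SHARED);
  }
  // Otherwise keep the default cache split: forcing PREFER_SHARED unconditionally
  // can hurt L1 performance for broadcast tensors.
}
```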
@@ -31,7 +31,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17")
# LLVM
##########
if("${LLVM_LIBRARY_DIR}" STREQUAL "")
find_package(LLVM 11 REQUIRED COMPONENTS "nvptx")
find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu")
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14")
@@ -39,14 +39,52 @@ if("${LLVM_LIBRARY_DIR}" STREQUAL "")
|
||||
# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros
|
||||
else()
|
||||
set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}")
|
||||
set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a
|
||||
libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a
|
||||
libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a
|
||||
libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a
|
||||
libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMMCParser.a
|
||||
libLLVMBitReader.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMNVPTXDesc.a
|
||||
libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMBinaryFormat.a libLLVMNVPTXInfo.a
|
||||
libLLVMSupport.a libLLVMDemangle.a)
|
||||
set(LLVM_LIBRARIES
|
||||
libLLVMNVPTXCodeGen.a
|
||||
libLLVMNVPTXDesc.a
|
||||
libLLVMNVPTXInfo.a
|
||||
libLLVMAMDGPUDisassembler.a
|
||||
libLLVMMCDisassembler.a
|
||||
libLLVMAMDGPUCodeGen.a
|
||||
libLLVMMIRParser.a
|
||||
libLLVMGlobalISel.a
|
||||
libLLVMSelectionDAG.a
|
||||
libLLVMipo.a
|
||||
libLLVMInstrumentation.a
|
||||
libLLVMVectorize.a
|
||||
libLLVMLinker.a
|
||||
libLLVMIRReader.a
|
||||
libLLVMAsmParser.a
|
||||
libLLVMFrontendOpenMP.a
|
||||
libLLVMAsmPrinter.a
|
||||
libLLVMDebugInfoDWARF.a
|
||||
libLLVMCodeGen.a
|
||||
libLLVMTarget.a
|
||||
libLLVMScalarOpts.a
|
||||
libLLVMInstCombine.a
|
||||
libLLVMAggressiveInstCombine.a
|
||||
libLLVMTransformUtils.a
|
||||
libLLVMBitWriter.a
|
||||
libLLVMAnalysis.a
|
||||
libLLVMProfileData.a
|
||||
libLLVMObject.a
|
||||
libLLVMTextAPI.a
|
||||
libLLVMBitReader.a
|
||||
libLLVMAMDGPUAsmParser.a
|
||||
libLLVMMCParser.a
|
||||
libLLVMAMDGPUDesc.a
|
||||
libLLVMAMDGPUUtils.a
|
||||
libLLVMMC.a
|
||||
libLLVMDebugInfoCodeView.a
|
||||
libLLVMDebugInfoMSF.a
|
||||
libLLVMCore.a
|
||||
libLLVMRemarks.a
|
||||
libLLVMBitstreamReader.a
|
||||
libLLVMBinaryFormat.a
|
||||
libLLVMAMDGPUInfo.a
|
||||
libLLVMSupport.a
|
||||
libLLVMDemangle.a
|
||||
)
|
||||
endif()
|
||||
include_directories("${LLVM_INCLUDE_DIRS}")
|
||||
|
||||
|
@@ -4,8 +4,17 @@
|
||||
|
||||
#include <memory>
|
||||
|
||||
namespace llvm{
|
||||
class Module;
|
||||
class LLVMContext;
|
||||
}
|
||||
|
||||
namespace triton{
|
||||
|
||||
namespace codegen {
|
||||
class target;
|
||||
}
|
||||
|
||||
namespace ir{
|
||||
class module;
|
||||
}
|
||||
@@ -21,8 +30,10 @@ namespace codegen{
|
||||
|
||||
// TODO:
|
||||
// There should be a proper pass manager there!
|
||||
void add_passes_to_emit_bin(ir::module &ir, driver::device* dev, int num_warps, int num_stages, bool force_nc_cache,
|
||||
driver::module*& mod, driver::kernel*& ker, size_t& shared_mem);
|
||||
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx,
|
||||
codegen::target* target,
|
||||
int sm, int num_warps,
|
||||
int num_stages, bool force_nc_cache, int &shared_static);
|
||||
|
||||
|
||||
}
|
||||
|
@@ -1,137 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_BACKEND_H_
|
||||
#define _TRITON_DRIVER_BACKEND_H_
|
||||
|
||||
|
||||
#include <map>
|
||||
#include <list>
|
||||
#include <vector>
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class buffer;
|
||||
class stream;
|
||||
class device;
|
||||
class context;
|
||||
class platform;
|
||||
class module;
|
||||
class kernel;
|
||||
|
||||
struct backend
|
||||
{
|
||||
|
||||
// platforms
|
||||
class platforms
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init();
|
||||
|
||||
public:
|
||||
static void get(std::vector<driver::platform*> &results);
|
||||
|
||||
private:
|
||||
static std::vector<driver::platform*> cache_;
|
||||
};
|
||||
|
||||
// devices
|
||||
class devices
|
||||
{
|
||||
friend class backend;
|
||||
|
||||
private:
|
||||
static void init(const std::vector<platform *> &platforms);
|
||||
|
||||
public:
|
||||
static void get(std::vector<driver::device*>& devs);
|
||||
|
||||
private:
|
||||
static std::vector<driver::device*> cache_;
|
||||
};
|
||||
|
||||
// modules
|
||||
class modules
|
||||
{
|
||||
friend class backend;
|
||||
|
||||
public:
|
||||
static void release();
|
||||
|
||||
private:
|
||||
static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
|
||||
};
|
||||
|
||||
// kernels
|
||||
class kernels
|
||||
{
|
||||
friend class backend;
|
||||
public:
|
||||
static void release();
|
||||
static driver::kernel* get(driver::module* mod, const std::string & name);
|
||||
private:
|
||||
static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
|
||||
};
|
||||
|
||||
// contexts
|
||||
class contexts
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init(const std::vector<device *> &);
|
||||
static void release();
|
||||
public:
|
||||
static driver::context* get_default();
|
||||
|
||||
static driver::context* import(CUcontext ctx)
|
||||
{
|
||||
for(driver::context* x: cache_){
|
||||
driver::cu_context* cu_x = (driver::cu_context*)x;
|
||||
if(*cu_x->cu()==ctx)
|
||||
return x;
|
||||
}
|
||||
cache_.emplace_back(new driver::cu_context(ctx, false));
|
||||
return cache_.back();
|
||||
}
|
||||
|
||||
static void get(std::list<driver::context*> &);
|
||||
|
||||
private:
|
||||
static std::list<driver::context*> cache_;
|
||||
};
|
||||
|
||||
// streams
|
||||
class streams
|
||||
{
|
||||
friend class backend;
|
||||
private:
|
||||
static void init(std::list<context*> const &);
|
||||
static void release();
|
||||
public:
|
||||
static void get(driver::context*, std::vector<driver::stream *> &streams);
|
||||
static driver::stream* get(driver::context*, unsigned int id = 0);
|
||||
static driver::stream* get_default();
|
||||
private:
|
||||
static std::map<driver::context*, std::vector<driver::stream*> > cache_;
|
||||
};
|
||||
|
||||
static void init();
|
||||
static void release();
|
||||
static void synchronize(triton::driver::context *);
|
||||
|
||||
static unsigned int default_device;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,48 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_BUFFER_H_
|
||||
#define _TRITON_DRIVER_BUFFER_H_
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/context.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class stream;
|
||||
|
||||
// Base
|
||||
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
|
||||
public:
|
||||
buffer(size_t size, CUdeviceptr cl, bool take_ownership);
|
||||
buffer(size_t size, host_buffer_t hst, bool take_ownership);
|
||||
uintptr_t addr_as_uintptr_t();
|
||||
static buffer* create(driver::context* ctx, size_t size);
|
||||
size_t size();
|
||||
|
||||
protected:
|
||||
size_t size_;
|
||||
};
|
||||
|
||||
// CPU
|
||||
class host_buffer: public buffer
|
||||
{
|
||||
public:
|
||||
host_buffer(size_t size);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_buffer: public buffer
|
||||
{
|
||||
public:
|
||||
cu_buffer(size_t size);
|
||||
cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
|
||||
void set_zero(triton::driver::stream *queue, size_t size);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,50 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_CONTEXT_H_
|
||||
#define _TRITON_DRIVER_CONTEXT_H_
|
||||
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class context: public polymorphic_resource<CUcontext, host_context_t>{
|
||||
protected:
|
||||
static std::string get_cache_path();
|
||||
|
||||
public:
|
||||
context(driver::device *dev, CUcontext cu, bool take_ownership);
|
||||
context(driver::device *dev, host_context_t hst, bool take_ownership);
|
||||
driver::device* device() const;
|
||||
std::string const & cache_path() const;
|
||||
// factory methods
|
||||
static context* create(driver::device *dev);
|
||||
|
||||
protected:
|
||||
driver::device* dev_;
|
||||
std::string cache_path_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_context: public context {
|
||||
public:
|
||||
host_context(driver::device* dev);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_context: public context {
|
||||
private:
|
||||
static CUdevice get_device_of(CUcontext);
|
||||
public:
|
||||
//Constructors
|
||||
cu_context(CUcontext cu, bool take_ownership = true);
|
||||
cu_context(driver::device* dev);
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,82 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_DEVICE_H_
|
||||
#define _TRITON_DRIVER_DEVICE_H_
|
||||
|
||||
#include "triton/driver/platform.h"
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace codegen
|
||||
{
|
||||
class target;
|
||||
}
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class context;
|
||||
|
||||
// Base device
|
||||
class device: public polymorphic_resource<CUdevice, host_device_t>{
|
||||
public:
|
||||
using polymorphic_resource::polymorphic_resource;
|
||||
virtual size_t max_threads_per_block() const = 0;
|
||||
virtual size_t max_shared_memory() const = 0;
|
||||
virtual std::unique_ptr<codegen::target> make_target() const = 0;
|
||||
};
|
||||
|
||||
// Host device
|
||||
class host_device: public device {
|
||||
public:
|
||||
host_device(): device(host_device_t(), true){ }
|
||||
size_t max_threads_per_block() const { return 1; }
|
||||
size_t max_shared_memory() const { return 0; }
|
||||
std::unique_ptr<codegen::target> make_target() const;
|
||||
};
|
||||
|
||||
// CUDA device
|
||||
class cu_device: public device {
|
||||
private:
|
||||
//Metaprogramming helper to get cuda info from attribute
|
||||
template<CUdevice_attribute attr>
|
||||
int cuGetInfo() const;
|
||||
|
||||
inline nvmlDevice_t nvml_device() const;
|
||||
|
||||
public:
|
||||
cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
|
||||
// Informations
|
||||
std::string infos() const;
|
||||
size_t address_bits() const;
|
||||
std::vector<size_t> max_block_dim() const;
|
||||
size_t warp_size() const;
|
||||
// Compute Capability
|
||||
void interpret_as(int cc);
|
||||
int compute_capability() const;
|
||||
// Identifier
|
||||
std::string name() const;
|
||||
std::string pci_bus_id() const;
|
||||
// Clocks
|
||||
size_t current_sm_clock() const;
|
||||
size_t current_mem_clock() const;
|
||||
size_t max_threads_per_block() const;
|
||||
size_t max_shared_memory() const;
|
||||
size_t max_sm_clock() const;
|
||||
size_t max_mem_clock() const;
|
||||
void set_max_clock();
|
||||
void enable_peer_access(CUdeviceptr peer_mem_ptr) const;
|
||||
// Target
|
||||
std::unique_ptr<codegen::target> make_target() const;
|
||||
|
||||
private:
|
||||
std::shared_ptr<int> interpreted_as_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -10,6 +10,10 @@
|
||||
#include "triton/external/CUDA/cuda.h"
|
||||
#include "triton/external/CUDA/nvml.h"
|
||||
|
||||
//// HIP backend
|
||||
//#define __HIP_PLATFORM_AMD__
|
||||
#include "triton/external/hip.h"
|
||||
|
||||
//Exceptions
|
||||
#include <iostream>
|
||||
#include <stdexcept>
|
||||
@@ -28,6 +32,7 @@ class cu_context;
|
||||
|
||||
template<class T> void check(T){}
|
||||
void check(CUresult err);
|
||||
void check(hipError_t err);
|
||||
|
||||
class dispatch
|
||||
{
|
||||
@@ -58,17 +63,18 @@ protected:
|
||||
}
|
||||
|
||||
public:
|
||||
static void release();
|
||||
// Nvidia
|
||||
static bool nvmlinit();
|
||||
static bool cuinit();
|
||||
static void release();
|
||||
// AMD
|
||||
static bool hipinit();
|
||||
|
||||
/* ------------------- *
|
||||
* CUDA
|
||||
* ------------------- */
|
||||
// context management
|
||||
static CUresult cuInit(unsigned int Flags);
|
||||
static CUresult cuCtxGetCurrent(CUcontext *pctx);
|
||||
static CUresult cuCtxSetCurrent(CUcontext ctx);
|
||||
static CUresult cuCtxDestroy_v2(CUcontext ctx);
|
||||
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
|
||||
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
|
||||
@@ -128,6 +134,55 @@ public:
|
||||
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
|
||||
static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
// context management
|
||||
static hipError_t hipInit(unsigned int Flags);
|
||||
static hipError_t hipCtxDestroy(hipCtx_t ctx);
|
||||
static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
|
||||
static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
|
||||
static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
|
||||
static hipError_t hipCtxGetDevice(hipDevice_t* result);
|
||||
static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
|
||||
static hipError_t hipDriverGetVersion(int *driverVersion);
|
||||
// device management
|
||||
static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
|
||||
static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
|
||||
static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
|
||||
static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
|
||||
static hipError_t hipGetDeviceCount(int *count);
|
||||
// module management
|
||||
static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
|
||||
static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
|
||||
static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
|
||||
static hipError_t hipModuleUnload(hipModule_t hmod);
|
||||
static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
|
||||
static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
|
||||
// stream management
|
||||
static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
|
||||
static hipError_t hipStreamSynchronize(hipStream_t hStream);
|
||||
static hipError_t hipStreamDestroy(hipStream_t hStream);
|
||||
static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
|
||||
// function management
|
||||
static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
|
||||
static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
|
||||
static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
|
||||
// memory management
|
||||
static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
|
||||
static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
|
||||
static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
|
||||
static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
|
||||
static hipError_t hipFree(hipDeviceptr_t dptr);
|
||||
static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
|
||||
static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
|
||||
static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
|
||||
// event management
|
||||
static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
|
||||
static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
|
||||
static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
|
||||
static hipError_t hipEventDestroy(hipEvent_t hEvent);
|
||||
|
||||
|
||||
|
||||
private:
|
||||
@@ -135,6 +190,7 @@ private:
|
||||
// Libraries
|
||||
static void* cuda_;
|
||||
static void* nvml_;
|
||||
static void* hip_;
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
@@ -194,9 +250,6 @@ private:
|
||||
static void* cuEventRecord_;
|
||||
static void* cuEventDestroy_v2_;
|
||||
|
||||
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* NVML
|
||||
* ------------------- */
|
||||
@@ -205,6 +258,55 @@ private:
|
||||
static void* nvmlDeviceGetClockInfo_;
|
||||
static void* nvmlDeviceGetMaxClockInfo_;
|
||||
static void* nvmlDeviceSetApplicationsClocks_;
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
// context management
|
||||
static void* hipInit_;
|
||||
static void* hipCtxDestroy_;
|
||||
static void* hipCtxCreate_;
|
||||
static void* hipCtxPushCurrent_;
|
||||
static void* hipCtxPopCurrent_;
|
||||
static void* hipCtxGetDevice_;
|
||||
static void* hipCtxEnablePeerAccess_;
|
||||
static void* hipDriverGetVersion_;
|
||||
// device management
|
||||
static void* hipGetDevice_;
|
||||
static void* hipDeviceGetName_;
|
||||
static void* hipDeviceGetPCIBusId_;
|
||||
static void* hipDeviceGetAttribute_;
|
||||
static void* hipGetDeviceCount_;
|
||||
// module management
|
||||
static void* hipModuleGetGlobal_;
|
||||
static void* hipModuleLoad_;
|
||||
static void* hipModuleLoadData_;
|
||||
static void* hipModuleUnload_;
|
||||
static void* hipModuleLoadDataEx_;
|
||||
static void* hipModuleGetFunction_;
|
||||
// stream management
|
||||
static void* hipStreamCreate_;
|
||||
static void* hipStreamSynchronize_;
|
||||
static void* hipStreamDestroy_;
|
||||
static void* hipModuleLaunchKernel_;
|
||||
// function management
|
||||
static void* hipFuncGetAttributes_;
|
||||
static void* hipFuncSetAttribute_;
|
||||
static void* hipFuncSetCacheConfig_;
|
||||
// memory management
|
||||
static void* hipMalloc_;
|
||||
static void* hipPointerGetAttribute_;
|
||||
static void* hipMemsetD8Async_;
|
||||
static void* hipMemcpyDtoH_;
|
||||
static void* hipFree_;
|
||||
static void* hipMemcpyDtoHAsync_;
|
||||
static void* hipMemcpyHtoDAsync_;
|
||||
static void* hipMemcpyHtoD_;
|
||||
// event management
|
||||
static void* hipEventCreate_;
|
||||
static void* hipEventElapsedTime_;
|
||||
static void* hipEventRecord_;
|
||||
static void* hipEventDestroy_;
|
||||
};
|
||||
|
||||
}
|
||||
|
@@ -141,6 +141,78 @@ namespace triton
|
||||
TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
namespace hip
|
||||
{
|
||||
class base: public std::exception{};
|
||||
|
||||
#define TRITON_CREATE_HIP_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "HIP: Error- " msg; } }
|
||||
|
||||
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_value ,"invalid value");
|
||||
TRITON_CREATE_HIP_EXCEPTION(out_of_memory ,"out of memory");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_initialized ,"not initialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(deinitialized ,"deinitialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_disabled ,"profiler disabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_already_started ,"profiler already started");
|
||||
TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(no_device ,"no device");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_device ,"invalid device");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_image ,"invalid image");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_context ,"invalid context");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_already_current ,"context already current");
|
||||
TRITON_CREATE_HIP_EXCEPTION(map_failed ,"map failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unmap_failed ,"unmap failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(array_is_mapped ,"array is mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(already_mapped ,"already mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
|
||||
TRITON_CREATE_HIP_EXCEPTION(already_acquired ,"already acquired");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped ,"not mapped");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array ,"not mapped as array");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
|
||||
TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unsupported_limit ,"unsupported limit");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_already_in_use ,"context already in use");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_ptx ,"invalid ptx");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_source ,"invalid source");
|
||||
TRITON_CREATE_HIP_EXCEPTION(file_not_found ,"file not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed ,"shared object init failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(operating_system ,"operating system");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_handle ,"invalid handle");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_found ,"not found");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_ready ,"not ready");
|
||||
TRITON_CREATE_HIP_EXCEPTION(illegal_address ,"illegal address");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources ,"launch out of resources");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_timeout ,"launch timeout");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
|
||||
TRITON_CREATE_HIP_EXCEPTION(primary_context_active ,"primary context active");
|
||||
TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed ,"context is destroyed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(assert_error ,"assert");
|
||||
TRITON_CREATE_HIP_EXCEPTION(too_many_peers ,"too many peers");
|
||||
TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered ,"host memory already registered");
|
||||
TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered       ,"host memory already registered");
TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered           ,"host memory not registered");
|
||||
TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error ,"hardware stack error");
|
||||
TRITON_CREATE_HIP_EXCEPTION(illegal_instruction ,"illegal instruction");
|
||||
TRITON_CREATE_HIP_EXCEPTION(misaligned_address ,"misaligned address");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_address_space ,"invalid address space");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_pc ,"invalid pc");
|
||||
TRITON_CREATE_HIP_EXCEPTION(launch_failed ,"launch failed");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_permitted ,"not permitted");
|
||||
TRITON_CREATE_HIP_EXCEPTION(not_supported ,"not supported");
|
||||
TRITON_CREATE_HIP_EXCEPTION(invalid_symbol ,"invalid symbol");
|
||||
TRITON_CREATE_HIP_EXCEPTION(unknown ,"unknown");
|
||||
|
||||
#undef TRITON_CREATE_HIP_EXCEPTION
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -1,146 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_HANDLE_H_
|
||||
#define _TRITON_DRIVER_HANDLE_H_
|
||||
|
||||
#include <memory>
|
||||
#include <map>
|
||||
#include <iostream>
|
||||
#include <functional>
|
||||
#include <type_traits>
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "llvm/ExecutionEngine/JITSymbol.h"
|
||||
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
|
||||
#include "llvm/ExecutionEngine/Orc/Core.h"
|
||||
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
|
||||
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
|
||||
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
|
||||
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "triton/tools/thread_pool.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class ExecutionEngine;
|
||||
class Function;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
enum backend_t {
|
||||
CUDA,
|
||||
Host
|
||||
};
|
||||
|
||||
// Host handles
|
||||
struct host_platform_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_device_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_context_t{
|
||||
|
||||
};
|
||||
|
||||
struct host_stream_t{
|
||||
std::shared_ptr<ThreadPool> pool;
|
||||
std::shared_ptr<std::vector<std::future<void>>> futures;
|
||||
std::vector<std::shared_ptr<char*>> args;
|
||||
};
|
||||
|
||||
struct host_module_t{
|
||||
std::string error;
|
||||
llvm::ExecutionEngine* engine;
|
||||
std::map<std::string, llvm::Function*> functions;
|
||||
void(*fn)(char**, int32_t, int32_t, int32_t);
|
||||
llvm::orc::ExecutionSession* ES;
|
||||
llvm::orc::RTDyldObjectLinkingLayer* ObjectLayer;
|
||||
llvm::orc::IRCompileLayer* CompileLayer;
|
||||
llvm::DataLayout* DL;
|
||||
llvm::orc::MangleAndInterner* Mangle;
|
||||
llvm::orc::ThreadSafeContext* Ctx;
|
||||
llvm::orc::JITDylib *MainJD;
|
||||
};
|
||||
|
||||
struct host_function_t{
|
||||
llvm::Function* fn;
|
||||
};
|
||||
|
||||
struct host_buffer_t{
|
||||
char* data;
|
||||
};
|
||||
|
||||
|
||||
// Extra CUDA handles
|
||||
struct cu_event_t{
|
||||
operator bool() const { return first && second; }
|
||||
CUevent first;
|
||||
CUevent second;
|
||||
};
|
||||
|
||||
struct CUPlatform{
|
||||
CUPlatform() : status_(dispatch::cuInit(0)) { }
|
||||
operator bool() const { return status_; }
|
||||
private:
|
||||
CUresult status_;
|
||||
};
|
||||
|
||||
template<class T, class CUType>
|
||||
class handle_interface{
|
||||
public:
|
||||
//Accessors
|
||||
operator CUType() const { return *(((T*)this)->cu().h_); }
|
||||
//Comparison
|
||||
bool operator==(handle_interface const & y) { return (CUType)(*this) == (CUType)(y); }
|
||||
bool operator!=(handle_interface const & y) { return (CUType)(*this) != (CUType)(y); }
|
||||
bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); }
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class handle{
|
||||
public:
|
||||
template<class, class> friend class handle_interface;
|
||||
public:
|
||||
//Constructors
|
||||
handle(T h, bool take_ownership = true);
|
||||
handle();
|
||||
~handle();
|
||||
T& operator*() { return *h_; }
|
||||
T const & operator*() const { return *h_; }
|
||||
T* operator->() const { return h_.get(); }
|
||||
|
||||
protected:
|
||||
std::shared_ptr<T> h_;
|
||||
bool has_ownership_;
|
||||
};
|
||||
|
||||
template<class CUType, class HostType>
|
||||
class polymorphic_resource {
|
||||
public:
|
||||
polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){}
|
||||
polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){}
|
||||
virtual ~polymorphic_resource() { }
|
||||
|
||||
handle<CUType> cu() { return cu_; }
|
||||
handle<HostType> hst() { return hst_; }
|
||||
const handle<CUType>& cu() const { return cu_; }
|
||||
const handle<HostType>& hst() const { return hst_; }
|
||||
backend_t backend() { return backend_; }
|
||||
|
||||
protected:
|
||||
handle<CUType> cu_;
|
||||
handle<HostType> hst_;
|
||||
backend_t backend_;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,53 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_KERNEL_H_
|
||||
#define _TRITON_DRIVER_KERNEL_H_
|
||||
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/handle.h"
|
||||
#include <memory>
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class GenericValue;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class cu_buffer;
|
||||
|
||||
// Base
|
||||
class kernel: public polymorphic_resource<CUfunction, host_function_t> {
|
||||
public:
|
||||
kernel(driver::module* program, CUfunction fn, bool has_ownership);
|
||||
kernel(driver::module* program, host_function_t fn, bool has_ownership);
|
||||
driver::module* module();
|
||||
static kernel* create(driver::module* program, const char* name);
|
||||
private:
|
||||
driver::module* program_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
host_kernel(driver::module* program, const char* name);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_kernel: public kernel {
|
||||
public:
|
||||
//Constructors
|
||||
cu_kernel(driver::module* program, const char * name);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
|
18
include/triton/driver/llvm.h
Normal file
@@ -0,0 +1,18 @@
#include <string>
#include "triton/driver/dispatch.h"

namespace llvm{
class Module;
}

namespace triton{
namespace driver{

void init_llvm();
std::string llir_to_ptx(llvm::Module* module, int cc, int version);
CUmodule ptx_to_cumodule(const std::string& ptx, int cc);
std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);
hipModule_t amdgpu_to_hipmodule(const std::string& path);

}
}
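As a rough illustration of how these new entry points fit together, a sketch under assumptions: the compute capability, the PTX version argument, and the "gfx908" target are placeholders, and chaining llir_to_amdgpu into amdgpu_to_hipmodule assumes the former returns the path the latter expects.

```cpp
#include <string>
#include "triton/driver/llvm.h"

// Hypothetical driver: lower an LLVM-IR module to PTX for NVIDIA devices or to an
// AMDGPU binary for ROCm, then load the result as a GPU module.
void lower_and_load(llvm::Module* mod, bool use_hip) {
  triton::driver::init_llvm();
  if (use_hip) {
    // Assumption: llir_to_amdgpu writes a code object and returns its path.
    std::string hsaco_path = triton::driver::llir_to_amdgpu(mod, "gfx908");
    hipModule_t hip_mod = triton::driver::amdgpu_to_hipmodule(hsaco_path);
    (void)hip_mod;
  } else {
    std::string ptx = triton::driver::llir_to_ptx(mod, /*cc=*/80, /*version=*/11000);
    CUmodule cu_mod = triton::driver::ptx_to_cumodule(ptx, /*cc=*/80);
    (void)cu_mod;
  }
}
```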
|
@@ -1,84 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_MODULE_H_
|
||||
#define _TRITON_DRIVER_MODULE_H_
|
||||
|
||||
#include <map>
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace llvm
|
||||
{
|
||||
class Module;
|
||||
template<class T>
|
||||
class SmallVectorImpl;
|
||||
}
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class cu_context;
|
||||
class cu_device;
|
||||
|
||||
// Base
|
||||
class module: public polymorphic_resource<CUmodule, host_module_t> {
|
||||
protected:
|
||||
void init_llvm();
|
||||
|
||||
enum file_type_t{
|
||||
Object,
|
||||
Assembly
|
||||
};
|
||||
|
||||
public:
|
||||
module(CUmodule mod, bool has_ownership);
|
||||
module(host_module_t mod, bool has_ownership);
|
||||
static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);
|
||||
void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
||||
const std::string &proc, std::string layout,
|
||||
llvm::SmallVectorImpl<char> &buffer,
|
||||
const std::string &features,
|
||||
file_type_t file_type);
|
||||
virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
|
||||
int spilled() const { return spilled_; }
|
||||
|
||||
protected:
|
||||
int spilled_;
|
||||
};
|
||||
|
||||
// CPU
|
||||
class host_module: public module{
|
||||
public:
|
||||
host_module(std::unique_ptr<llvm::Module> module);
|
||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_module: public module {
|
||||
std::string compile_llvm_module(llvm::Module* module, driver::device* device);
|
||||
void init_from_ptx(const std::string& ptx, cu_device *device);
|
||||
|
||||
public:
|
||||
cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);
|
||||
cu_module(driver::device* device, const std::string& source);
|
||||
std::unique_ptr<buffer> symbol(const char * name) const;
|
||||
std::string llir() const { return llir_; }
|
||||
const std::string& ptx() const { return ptx_; }
|
||||
const std::string& cubin() const { return cubin_; }
|
||||
|
||||
private:
|
||||
std::string ptx_;
|
||||
std::string cubin_;
|
||||
std::string llir_;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,58 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_PLATFORM_H_
|
||||
#define _TRITON_DRIVER_PLATFORM_H_
|
||||
|
||||
#include <vector>
|
||||
#include <string>
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class device;
|
||||
|
||||
class platform
|
||||
{
|
||||
public:
|
||||
// Constructor
|
||||
platform(const std::string& name): name_(name){ }
|
||||
// Accessors
|
||||
std::string name() const { return name_; }
|
||||
// Virtual methods
|
||||
virtual std::string version() const = 0;
|
||||
virtual void devices(std::vector<driver::device *> &devices) const = 0;
|
||||
private:
|
||||
std::string name_;
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_platform: public platform
|
||||
{
|
||||
public:
|
||||
cu_platform(): platform("CUDA") { }
|
||||
std::string version() const;
|
||||
void devices(std::vector<driver::device*> &devices) const;
|
||||
|
||||
private:
|
||||
handle<CUPlatform> cu_;
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_platform: public platform
|
||||
{
|
||||
public:
|
||||
host_platform(): platform("CPU") { }
|
||||
std::string version() const;
|
||||
void devices(std::vector<driver::device*> &devices) const;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@@ -1,68 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#ifndef _TRITON_DRIVER_STREAM_H_
|
||||
#define _TRITON_DRIVER_STREAM_H_
|
||||
|
||||
#include <map>
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
class kernel;
|
||||
class event;
|
||||
class Range;
|
||||
class cu_buffer;
|
||||
|
||||
// Base
|
||||
class stream: public polymorphic_resource<CUstream, host_stream_t> {
|
||||
public:
|
||||
stream(CUstream, bool has_ownership);
|
||||
stream(host_stream_t, bool has_ownership);
|
||||
// factory
|
||||
static driver::stream* create(backend_t backend);
|
||||
// methods
|
||||
virtual void synchronize() = 0;
|
||||
virtual void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem = 0) = 0;
|
||||
virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0;
|
||||
virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0;
|
||||
// template helpers
|
||||
template<class T> void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T> const & x)
|
||||
{ write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||
template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
|
||||
{ read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
|
||||
};
|
||||
|
||||
// Host
|
||||
class host_stream: public stream {
|
||||
public:
|
||||
host_stream();
|
||||
void synchronize();
|
||||
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
|
||||
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
|
||||
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
|
||||
};
|
||||
|
||||
// CUDA
|
||||
class cu_stream: public stream {
|
||||
public:
|
||||
cu_stream(CUstream str, bool take_ownership);
|
||||
cu_stream();
|
||||
void synchronize();
|
||||
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
|
||||
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
|
||||
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
1468
include/triton/external/CL/cl.h
vendored
File diff suppressed because it is too large
12947
include/triton/external/CL/cl.hpp
vendored
File diff suppressed because it is too large
9677
include/triton/external/CL/cl2.hpp
vendored
File diff suppressed because it is too large
131
include/triton/external/CL/cl_d3d10.h
vendored
@@ -1,131 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_D3D10_H
|
||||
#define __OPENCL_CL_D3D10_H
|
||||
|
||||
#include <d3d10.h>
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************
|
||||
* cl_khr_d3d10_sharing */
|
||||
#define cl_khr_d3d10_sharing 1
|
||||
|
||||
typedef cl_uint cl_d3d10_device_source_khr;
|
||||
typedef cl_uint cl_d3d10_device_set_khr;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_D3D10_DEVICE_KHR -1002
|
||||
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
|
||||
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
|
||||
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
|
||||
|
||||
/* cl_d3d10_device_source_nv */
|
||||
#define CL_D3D10_DEVICE_KHR 0x4010
|
||||
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
|
||||
|
||||
/* cl_d3d10_device_set_nv */
|
||||
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
|
||||
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
|
||||
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
|
||||
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_d3d10_device_source_khr d3d_device_source,
|
||||
void * d3d_object,
|
||||
cl_d3d10_device_set_khr d3d_device_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Buffer * resource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Texture2D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D10Texture3D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_D3D10_H */
|
||||
|
131
include/triton/external/CL/cl_d3d11.h
vendored
@@ -1,131 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_D3D11_H
|
||||
#define __OPENCL_CL_D3D11_H
|
||||
|
||||
#include <d3d11.h>
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************
|
||||
* cl_khr_d3d11_sharing */
|
||||
#define cl_khr_d3d11_sharing 1
|
||||
|
||||
typedef cl_uint cl_d3d11_device_source_khr;
|
||||
typedef cl_uint cl_d3d11_device_set_khr;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_D3D11_DEVICE_KHR -1006
|
||||
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
|
||||
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
|
||||
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
|
||||
|
||||
/* cl_d3d11_device_source */
|
||||
#define CL_D3D11_DEVICE_KHR 0x4019
|
||||
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
|
||||
|
||||
/* cl_d3d11_device_set */
|
||||
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
|
||||
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
|
||||
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
|
||||
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_d3d11_device_source_khr d3d_device_source,
|
||||
void * d3d_object,
|
||||
cl_d3d11_device_set_khr d3d_device_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Buffer * resource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Texture2D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
ID3D11Texture3D * resource,
|
||||
UINT subresource,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_D3D11_H */
|
||||
|
132
include/triton/external/CL/cl_dx9_media_sharing.h
vendored
@@ -1,132 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
|
||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
/* cl_khr_dx9_media_sharing */
|
||||
#define cl_khr_dx9_media_sharing 1
|
||||
|
||||
typedef cl_uint cl_dx9_media_adapter_type_khr;
|
||||
typedef cl_uint cl_dx9_media_adapter_set_khr;
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <d3d9.h>
|
||||
typedef struct _cl_dx9_surface_info_khr
|
||||
{
|
||||
IDirect3DSurface9 *resource;
|
||||
HANDLE shared_handle;
|
||||
} cl_dx9_surface_info_khr;
|
||||
#endif
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Error Codes */
|
||||
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
|
||||
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
|
||||
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
|
||||
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
|
||||
|
||||
/* cl_media_adapter_type_khr */
|
||||
#define CL_ADAPTER_D3D9_KHR 0x2020
|
||||
#define CL_ADAPTER_D3D9EX_KHR 0x2021
|
||||
#define CL_ADAPTER_DXVA_KHR 0x2022
|
||||
|
||||
/* cl_media_adapter_set_khr */
|
||||
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
|
||||
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
|
||||
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
|
||||
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
|
||||
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
|
||||
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
|
||||
cl_platform_id platform,
|
||||
cl_uint num_media_adapters,
|
||||
cl_dx9_media_adapter_type_khr * media_adapter_type,
|
||||
void * media_adapters,
|
||||
cl_dx9_media_adapter_set_khr media_adapter_set,
|
||||
cl_uint num_entries,
|
||||
cl_device_id * devices,
|
||||
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
|
||||
cl_context context,
|
||||
cl_mem_flags flags,
|
||||
cl_dx9_media_adapter_type_khr adapter_type,
|
||||
void * surface_info,
|
||||
cl_uint plane,
|
||||
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */
|
||||
|
@@ -1,182 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2016 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_dx9_media_sharing_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
|
||||
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
|
||||
|
||||
#include <CL/cl.h>
|
||||
#include <CL/cl_platform.h>
|
||||
#include <d3d9.h>
|
||||
#include <dxvahd.h>
|
||||
#include <wtypes.h>
|
||||
#include <d3d9types.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/***************************************
|
||||
* cl_intel_dx9_media_sharing extension *
|
||||
****************************************/
|
||||
|
||||
#define cl_intel_dx9_media_sharing 1
|
||||
|
||||
typedef cl_uint cl_dx9_device_source_intel;
|
||||
typedef cl_uint cl_dx9_device_set_intel;
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_DX9_DEVICE_INTEL -1010
|
||||
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
|
||||
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
|
||||
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
|
||||
|
||||
/* cl_dx9_device_source_intel */
|
||||
#define CL_D3D9_DEVICE_INTEL 0x4022
|
||||
#define CL_D3D9EX_DEVICE_INTEL 0x4070
|
||||
#define CL_DXVA_DEVICE_INTEL 0x4071
|
||||
|
||||
/* cl_dx9_device_set_intel */
|
||||
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
|
||||
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
|
||||
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
|
||||
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
|
||||
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
|
||||
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
|
||||
/******************************************************************************/
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceIDsFromDX9INTEL(
|
||||
cl_platform_id /* platform */,
|
||||
cl_dx9_device_source_intel /* dx9_device_source */,
|
||||
void* /* dx9_object */,
|
||||
cl_dx9_device_set_intel /* dx9_device_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
|
||||
cl_platform_id /* platform */,
|
||||
cl_dx9_device_source_intel /* dx9_device_source */,
|
||||
void* /* dx9_object */,
|
||||
cl_dx9_device_set_intel /* dx9_device_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromDX9MediaSurfaceINTEL(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
IDirect3DSurface9* /* resource */,
|
||||
HANDLE /* sharedHandle */,
|
||||
UINT /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
IDirect3DSurface9* /* resource */,
|
||||
HANDLE /* sharedHandle */,
|
||||
UINT /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireDX9ObjectsINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseDX9ObjectsINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */
include/triton/external/CL/cl_egl.h (136 lines, vendored)
@@ -1,136 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_EGL_H
|
||||
#define __OPENCL_CL_EGL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
|
||||
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
|
||||
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
|
||||
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
|
||||
|
||||
/* Error type for clCreateFromEGLImageKHR */
|
||||
#define CL_INVALID_EGL_OBJECT_KHR -1093
|
||||
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
|
||||
|
||||
/* CLeglImageKHR is an opaque handle to an EGLImage */
|
||||
typedef void* CLeglImageKHR;
|
||||
|
||||
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
|
||||
typedef void* CLeglDisplayKHR;
|
||||
|
||||
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
|
||||
typedef void* CLeglSyncKHR;
|
||||
|
||||
/* properties passed to clCreateFromEGLImageKHR */
|
||||
typedef intptr_t cl_egl_image_properties_khr;
|
||||
|
||||
|
||||
#define cl_khr_egl_image 1
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromEGLImageKHR(cl_context /* context */,
|
||||
CLeglDisplayKHR /* egldisplay */,
|
||||
CLeglImageKHR /* eglimage */,
|
||||
cl_mem_flags /* flags */,
|
||||
const cl_egl_image_properties_khr * /* properties */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
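
A minimal sketch of wrapping an existing EGLImage as a cl_mem with the function declared above; ctx, egl_display and egl_image are assumed, already-created handles.

cl_int err = CL_SUCCESS;
cl_mem cl_img = clCreateFromEGLImageKHR(ctx,
                                        (CLeglDisplayKHR)egl_display,
                                        (CLeglImageKHR)egl_image,
                                        CL_MEM_READ_ONLY,
                                        NULL,      /* no extra properties */
                                        &err);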
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
|
||||
cl_context context,
|
||||
CLeglDisplayKHR egldisplay,
|
||||
CLeglImageKHR eglimage,
|
||||
cl_mem_flags flags,
|
||||
const cl_egl_image_properties_khr * properties,
|
||||
cl_int * errcode_ret);
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
|
||||
cl_command_queue command_queue,
|
||||
cl_uint num_objects,
|
||||
const cl_mem * mem_objects,
|
||||
cl_uint num_events_in_wait_list,
|
||||
const cl_event * event_wait_list,
|
||||
cl_event * event);
|
||||
|
||||
|
||||
#define cl_khr_egl_event 1
|
||||
|
||||
extern CL_API_ENTRY cl_event CL_API_CALL
|
||||
clCreateEventFromEGLSyncKHR(cl_context /* context */,
|
||||
CLeglSyncKHR /* sync */,
|
||||
CLeglDisplayKHR /* display */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
|
||||
cl_context context,
|
||||
CLeglSyncKHR sync,
|
||||
CLeglDisplayKHR display,
|
||||
cl_int * errcode_ret);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_EGL_H */
include/triton/external/CL/cl_ext.h (670 lines, vendored)
@@ -1,670 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
|
||||
|
||||
/* cl_ext.h contains OpenCL extensions which don't have external */
|
||||
/* (OpenGL, D3D) dependencies. */
|
||||
|
||||
#ifndef __CL_EXT_H
|
||||
#define __CL_EXT_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#include <AvailabilityMacros.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
/* cl_khr_fp64 extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
|
||||
|
||||
/* cl_khr_fp16 extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
|
||||
|
||||
/* Memory object destruction
|
||||
*
|
||||
* Apple extension for use to manage externally allocated buffers used with cl_mem objects with CL_MEM_USE_HOST_PTR
|
||||
*
|
||||
* Registers a user callback function that will be called when the memory object is deleted and its resources
|
||||
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
|
||||
* stack associated with memobj. The registered user callback functions are called in the reverse order in
|
||||
* which they were registered. The user callback functions are called and then the memory object is deleted
|
||||
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
|
||||
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
|
||||
* the storage bits for the memory object, can be reused or freed.
|
||||
*
|
||||
* The application may not call CL api's with the cl_mem object passed to the pfn_notify.
|
||||
*
|
||||
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
|
||||
* before using.
|
||||
*/
|
||||
#define cl_APPLE_SetMemObjectDestructor 1
|
||||
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
|
||||
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
|
||||
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
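
A minimal sketch of the intended usage described above; ctx and size are assumed, and the callback simply frees the host allocation once the cl_mem is destroyed.

static void my_mem_destructor(cl_mem memobj, void *user_data)
{
    free(user_data);                     /* user_data carries the original host_ptr */
}

void  *host_ptr = malloc(size);
cl_int err      = CL_SUCCESS;
cl_mem buf      = clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR, size, host_ptr, &err);
clSetMemObjectDestructorAPPLE(buf, my_mem_destructor, host_ptr);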
|
||||
|
||||
|
||||
/* Context Logging Functions
|
||||
*
|
||||
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
|
||||
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
|
||||
* before using.
|
||||
*
|
||||
* clLogMessagesToSystemLog forwards on all log messages to the Apple System Logger
|
||||
*/
|
||||
#define cl_APPLE_ContextLoggingFunctions 1
|
||||
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
|
||||
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
|
||||
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
|
||||
const void * /* private_info */,
|
||||
size_t /* cb */,
|
||||
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
|
||||
|
||||
|
||||
/************************
|
||||
* cl_khr_icd extension *
|
||||
************************/
|
||||
#define cl_khr_icd 1
|
||||
|
||||
/* cl_platform_info */
|
||||
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
|
||||
|
||||
/* Additional Error Codes */
|
||||
#define CL_PLATFORM_NOT_FOUND_KHR -1001
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
|
||||
cl_platform_id * /* platforms */,
|
||||
cl_uint * /* num_platforms */);
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
|
||||
cl_uint /* num_entries */,
|
||||
cl_platform_id * /* platforms */,
|
||||
cl_uint * /* num_platforms */);
|
||||
|
||||
|
||||
/* Extension: cl_khr_image2D_buffer
|
||||
*
|
||||
* This extension allows a 2D image to be created from a cl_mem buffer without a copy.
|
||||
* The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
|
||||
* Both the sampler and sampler-less read_image built-in functions are supported for 2D images
|
||||
* and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
|
||||
* for 2D images created from a buffer.
|
||||
*
|
||||
* When the 2D image from buffer is created, the client must specify the width,
|
||||
* height, image format (i.e. channel order and channel data type) and optionally the row pitch
|
||||
*
|
||||
* The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
|
||||
* The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
|
||||
*/
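
A minimal sketch of the image-from-buffer path described above, using the core OpenCL 1.2 cl_image_desc whose buffer field enables it; ctx, buf, width, height and row_pitch are assumed.

cl_image_format fmt  = { CL_RGBA, CL_UNORM_INT8 };
cl_image_desc   desc = { 0 };
desc.image_type      = CL_MEM_OBJECT_IMAGE2D;
desc.image_width     = width;
desc.image_height    = height;
desc.image_row_pitch = row_pitch;   /* multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels */
desc.buffer          = buf;         /* the existing cl_mem buffer */

cl_int err;
cl_mem img = clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);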
|
||||
|
||||
/*************************************
|
||||
* cl_khr_initialize_memory extension *
|
||||
*************************************/
|
||||
|
||||
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
|
||||
|
||||
|
||||
/**************************************
|
||||
* cl_khr_terminate_context extension *
|
||||
**************************************/
|
||||
|
||||
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
|
||||
#define CL_CONTEXT_TERMINATE_KHR 0x2032
|
||||
|
||||
#define cl_khr_terminate_context 1
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
|
||||
/*
|
||||
* Extension: cl_khr_spir
|
||||
*
|
||||
* This extension adds support to create an OpenCL program object from a
|
||||
* Standard Portable Intermediate Representation (SPIR) instance
|
||||
*/
|
||||
|
||||
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
|
||||
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
|
||||
|
||||
|
||||
/*****************************************
|
||||
* cl_khr_create_command_queue extension *
|
||||
*****************************************/
|
||||
#define cl_khr_create_command_queue 1
|
||||
|
||||
typedef cl_bitfield cl_queue_properties_khr;
|
||||
|
||||
extern CL_API_ENTRY cl_command_queue CL_API_CALL
|
||||
clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
|
||||
cl_device_id /* device */,
|
||||
const cl_queue_properties_khr* /* properties */,
|
||||
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
typedef CL_API_ENTRY cl_command_queue
|
||||
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
|
||||
cl_device_id /* device */,
|
||||
const cl_queue_properties_khr* /* properties */,
|
||||
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
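
As with most entry points in this header, the KHR queue-creation call is usually resolved through its *_fn typedef at run time; a sketch, with platform, ctx and dev assumed handles and default queue properties.

clCreateCommandQueueWithPropertiesKHR_fn pCreateQueue =
    (clCreateCommandQueueWithPropertiesKHR_fn)
        clGetExtensionFunctionAddressForPlatform(platform,
                                                 "clCreateCommandQueueWithPropertiesKHR");
if (pCreateQueue) {
    cl_int err;
    cl_command_queue q = pCreateQueue(ctx, dev, NULL /* default properties */, &err);
}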
|
||||
|
||||
|
||||
/******************************************
|
||||
* cl_nv_device_attribute_query extension *
|
||||
******************************************/
|
||||
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
|
||||
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
|
||||
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
|
||||
#define CL_DEVICE_WARP_SIZE_NV 0x4003
|
||||
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
|
||||
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
|
||||
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
|
||||
|
||||
/*********************************
|
||||
* cl_amd_device_memory_flags *
|
||||
*********************************/
|
||||
#define cl_amd_device_memory_flags 1
|
||||
|
||||
#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6) // Alloc from GPU's CPU visible heap
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
|
||||
|
||||
/*********************************
|
||||
* cl_amd_device_attribute_query *
|
||||
*********************************/
|
||||
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
|
||||
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
|
||||
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
|
||||
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
|
||||
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
|
||||
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
|
||||
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
|
||||
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
|
||||
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
|
||||
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
|
||||
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
|
||||
|
||||
typedef union
|
||||
{
|
||||
struct { cl_uint type; cl_uint data[5]; } raw;
|
||||
struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
|
||||
} cl_device_topology_amd;
|
||||
|
||||
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1
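
A sketch of how the union above is typically queried; dev is an assumed cl_device_id on an AMD platform.

cl_device_topology_amd topo;
if (clGetDeviceInfo(dev, CL_DEVICE_TOPOLOGY_AMD, sizeof(topo), &topo, NULL) == CL_SUCCESS &&
    topo.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD) {
    printf("PCIe location %02x:%02x.%d\n",
           (unsigned)(cl_uchar)topo.pcie.bus,
           (unsigned)(cl_uchar)topo.pcie.device,
           (int)topo.pcie.function);
}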
|
||||
|
||||
|
||||
/**************************
|
||||
* cl_amd_offline_devices *
|
||||
**************************/
|
||||
#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F
|
||||
|
||||
/*********************************
|
||||
* cl_arm_printf extension
|
||||
*********************************/
|
||||
#define CL_PRINTF_CALLBACK_ARM 0x40B0
|
||||
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
|
||||
|
||||
#ifdef CL_VERSION_1_1
|
||||
/***********************************
|
||||
* cl_ext_device_fission extension *
|
||||
***********************************/
|
||||
#define cl_ext_device_fission 1
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef cl_ulong cl_device_partition_property_ext;
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
|
||||
const cl_device_partition_property_ext * /* properties */,
|
||||
cl_uint /*num_entries*/,
|
||||
cl_device_id * /*out_devices*/,
|
||||
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
|
||||
const cl_device_partition_property_ext * /* properties */,
|
||||
cl_uint /*num_entries*/,
|
||||
cl_device_id * /*out_devices*/,
|
||||
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
/* cl_device_partition_property_ext */
|
||||
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
|
||||
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
|
||||
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
|
||||
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
|
||||
|
||||
/* clDeviceGetInfo selectors */
|
||||
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
|
||||
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
|
||||
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
|
||||
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
|
||||
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
|
||||
|
||||
/* error codes */
|
||||
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
|
||||
#define CL_INVALID_PARTITION_COUNT_EXT -1058
|
||||
#define CL_INVALID_PARTITION_NAME_EXT -1059
|
||||
|
||||
/* CL_AFFINITY_DOMAINs */
|
||||
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
|
||||
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
|
||||
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
|
||||
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
|
||||
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
|
||||
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
|
||||
|
||||
/* cl_device_partition_property_ext list terminators */
|
||||
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
|
||||
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
|
||||
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
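
A sketch of equal partitioning with the EXT fission API above; parent_dev is an assumed cl_device_id, split into sub-devices of four compute units each.

cl_device_partition_property_ext props[] = {
    CL_DEVICE_PARTITION_EQUALLY_EXT, 4, CL_PROPERTIES_LIST_END_EXT };

cl_uint num_sub = 0;
clCreateSubDevicesEXT(parent_dev, props, 0, NULL, &num_sub);        /* query count */

cl_device_id *sub = (cl_device_id *)malloc(num_sub * sizeof(cl_device_id));
clCreateSubDevicesEXT(parent_dev, props, num_sub, sub, NULL);       /* create them */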
|
||||
|
||||
/* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
|
||||
* no extension #define since they have no functions
|
||||
*/
|
||||
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
|
||||
|
||||
/*********************************
|
||||
* cl_qcom_ext_host_ptr extension
|
||||
*********************************/
|
||||
|
||||
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
|
||||
|
||||
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
|
||||
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
|
||||
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
|
||||
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
|
||||
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
|
||||
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
|
||||
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
|
||||
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
|
||||
|
||||
typedef cl_uint cl_image_pitch_info_qcom;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceImageInfoQCOM(cl_device_id device,
|
||||
size_t image_width,
|
||||
size_t image_height,
|
||||
const cl_image_format *image_format,
|
||||
cl_image_pitch_info_qcom param_name,
|
||||
size_t param_value_size,
|
||||
void *param_value,
|
||||
size_t *param_value_size_ret);
|
||||
|
||||
typedef struct _cl_mem_ext_host_ptr
|
||||
{
|
||||
/* Type of external memory allocation. */
|
||||
/* Legal values will be defined in layered extensions. */
|
||||
cl_uint allocation_type;
|
||||
|
||||
/* Host cache policy for this external memory allocation. */
|
||||
cl_uint host_cache_policy;
|
||||
|
||||
} cl_mem_ext_host_ptr;
|
||||
|
||||
/*********************************
|
||||
* cl_qcom_ion_host_ptr extension
|
||||
*********************************/
|
||||
|
||||
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
|
||||
|
||||
typedef struct _cl_mem_ion_host_ptr
|
||||
{
|
||||
/* Type of external memory allocation. */
|
||||
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
|
||||
cl_mem_ext_host_ptr ext_host_ptr;
|
||||
|
||||
/* ION file descriptor */
|
||||
int ion_filedesc;
|
||||
|
||||
/* Host pointer to the ION allocated memory */
|
||||
void* ion_hostptr;
|
||||
|
||||
} cl_mem_ion_host_ptr;
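
A sketch of the usual pattern: the struct above is passed as the host_ptr of clCreateBuffer together with CL_MEM_EXT_HOST_PTR_QCOM. The names ion_fd, ion_va, size and ctx are assumed to come from an ION allocation made elsewhere.

cl_mem_ion_host_ptr ion = { 0 };
ion.ext_host_ptr.allocation_type   = CL_MEM_ION_HOST_PTR_QCOM;
ion.ext_host_ptr.host_cache_policy = CL_MEM_HOST_UNCACHED_QCOM;
ion.ion_filedesc = ion_fd;     /* fd returned by the ION allocator */
ion.ion_hostptr  = ion_va;     /* mmap'ed address of that allocation */

cl_int err;
cl_mem buf = clCreateBuffer(ctx,
                            CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                            size, &ion, &err);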
|
||||
|
||||
#endif /* CL_VERSION_1_1 */
|
||||
|
||||
#if defined(CL_VERSION_1_2)
|
||||
|
||||
/******************************************
|
||||
* cl_img_yuv_image extension *
|
||||
******************************************/
|
||||
|
||||
/* Image formats used in clCreateImage */
|
||||
#define CL_NV21_IMG 0x40D0
|
||||
#define CL_YV12_IMG 0x40D1
|
||||
|
||||
/******************************************
|
||||
* cl_img_cached_allocations extension *
|
||||
******************************************/
|
||||
|
||||
/* Flag values used by clCreateBuffer */
|
||||
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
|
||||
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
|
||||
|
||||
/******************************************
|
||||
* cl_img_use_gralloc_ptr extension *
|
||||
******************************************/
|
||||
|
||||
/* Flag values used by clCreateBuffer */
|
||||
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
|
||||
|
||||
/* To be used by clGetEventInfo: */
|
||||
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
|
||||
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
|
||||
|
||||
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
|
||||
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
#endif /* CL_VERSION_1_2 */
|
||||
|
||||
#ifdef CL_VERSION_2_0
|
||||
/*********************************
|
||||
* cl_khr_subgroups extension
|
||||
*********************************/
|
||||
#define cl_khr_subgroups 1
|
||||
|
||||
/* cl_kernel_sub_group_info is declared in CL.h. */
|
||||
|
||||
/* cl_kernel_sub_group_info */
|
||||
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
|
||||
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
|
||||
cl_device_id /*in_device*/,
|
||||
cl_kernel_sub_group_info /* param_name */,
|
||||
size_t /*input_value_size*/,
|
||||
const void * /*input_value*/,
|
||||
size_t /*param_value_size*/,
|
||||
void* /*param_value*/,
|
||||
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
|
||||
|
||||
typedef CL_API_ENTRY cl_int
|
||||
( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
|
||||
cl_device_id /*in_device*/,
|
||||
cl_kernel_sub_group_info /* param_name */,
|
||||
size_t /*input_value_size*/,
|
||||
const void * /*input_value*/,
|
||||
size_t /*param_value_size*/,
|
||||
void* /*param_value*/,
|
||||
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
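
A sketch of querying the maximum sub-group size for a given work-group shape with the call declared above; kernel and dev are assumed, and the entry point may equally be fetched through the *_fn typedef.

size_t local_size[3] = { 64, 1, 1 };
size_t max_sub_group = 0;
clGetKernelSubGroupInfoKHR(kernel, dev,
                           CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
                           sizeof(local_size), local_size,
                           sizeof(max_sub_group), &max_sub_group, NULL);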
|
||||
#endif /* CL_VERSION_2_0 */
|
||||
|
||||
#ifdef CL_VERSION_2_1
|
||||
/*********************************
|
||||
* cl_khr_priority_hints extension
|
||||
*********************************/
|
||||
#define cl_khr_priority_hints 1
|
||||
|
||||
typedef cl_uint cl_queue_priority_khr;
|
||||
|
||||
/* cl_command_queue_properties */
|
||||
#define CL_QUEUE_PRIORITY_KHR 0x1096
|
||||
|
||||
/* cl_queue_priority_khr */
|
||||
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
|
||||
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
|
||||
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
|
||||
|
||||
#endif /* CL_VERSION_2_1 */
|
||||
|
||||
#ifdef CL_VERSION_2_1
|
||||
/*********************************
|
||||
* cl_khr_throttle_hints extension
|
||||
*********************************/
|
||||
#define cl_khr_throttle_hints 1
|
||||
|
||||
typedef cl_uint cl_queue_throttle_khr;
|
||||
|
||||
/* cl_command_queue_properties */
|
||||
#define CL_QUEUE_THROTTLE_KHR 0x1097
|
||||
|
||||
/* cl_queue_throttle_khr */
|
||||
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
|
||||
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
|
||||
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
|
||||
|
||||
#endif /* CL_VERSION_2_1 */
|
||||
|
||||
#ifdef CL_VERSION_2_2
|
||||
/*********************************
|
||||
* cl_khr_subgroup_named_barrier
|
||||
*********************************/
|
||||
#define cl_khr_subgroup_named_barrier 1
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
|
||||
|
||||
#endif /* CL_VERSION_2_2 */
|
||||
|
||||
/**********************************
|
||||
* cl_arm_import_memory extension *
|
||||
**********************************/
|
||||
|
||||
#ifdef CL_VERSION_1_0
|
||||
|
||||
typedef intptr_t cl_import_properties_arm;
|
||||
|
||||
/* Default and valid property names for cl_arm_import_memory */
|
||||
#define CL_IMPORT_TYPE_ARM 0x40B2
|
||||
|
||||
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
|
||||
|
||||
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
|
||||
|
||||
/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
|
||||
#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5
|
||||
|
||||
/* This extension adds a new function that allows for direct memory import into
|
||||
* OpenCL via the clImportMemoryARM function.
|
||||
*
|
||||
* Memory imported through this interface will be mapped into the device's page
|
||||
* tables directly, providing zero copy access. It will never fall back to copy
|
||||
* operations and aliased buffers.
|
||||
*
|
||||
* Types of memory supported for import are specified as additional extension
|
||||
* strings.
|
||||
*
|
||||
* This extension produces cl_mem allocations which are compatible with all other
|
||||
* users of cl_mem in the standard API.
|
||||
*
|
||||
* This extension maps pages with the same properties as the normal buffer creation
|
||||
* function clCreateBuffer.
|
||||
*/
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clImportMemoryARM( cl_context context,
|
||||
cl_mem_flags flags,
|
||||
const cl_import_properties_arm *properties,
|
||||
void *memory,
|
||||
size_t size,
|
||||
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
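
A sketch of importing an existing host allocation zero-copy, following the description above; ctx, host_mem and host_size are assumed.

const cl_import_properties_arm props[] = {
    CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM, 0 };
cl_int err;
cl_mem imported = clImportMemoryARM(ctx, CL_MEM_READ_WRITE,
                                    props, host_mem, host_size, &err);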
|
||||
|
||||
|
||||
#endif /* CL_VERSION_1_0 */
|
||||
|
||||
/******************************************
|
||||
* cl_arm_shared_virtual_memory extension *
|
||||
******************************************/
|
||||
|
||||
#ifdef CL_VERSION_1_2
|
||||
|
||||
/* Used by clGetDeviceInfo */
|
||||
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
|
||||
|
||||
/* Used by clGetMemObjectInfo */
|
||||
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
|
||||
|
||||
/* Used by clSetKernelExecInfoARM: */
|
||||
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
|
||||
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
|
||||
|
||||
/* To be used by clGetEventInfo: */
|
||||
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
|
||||
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
|
||||
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
|
||||
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
|
||||
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
|
||||
|
||||
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
|
||||
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
|
||||
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
|
||||
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
|
||||
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
|
||||
|
||||
/* Flag values used by clSVMAllocARM: */
|
||||
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
|
||||
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
|
||||
|
||||
typedef cl_bitfield cl_svm_mem_flags_arm;
|
||||
typedef cl_uint cl_kernel_exec_info_arm;
|
||||
typedef cl_bitfield cl_device_svm_capabilities_arm;
|
||||
|
||||
extern CL_API_ENTRY void * CL_API_CALL
|
||||
clSVMAllocARM(cl_context /* context */,
|
||||
cl_svm_mem_flags_arm /* flags */,
|
||||
size_t /* size */,
|
||||
cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY void CL_API_CALL
|
||||
clSVMFreeARM(cl_context /* context */,
|
||||
void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMFreeARM(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_svm_pointers */,
|
||||
void *[] /* svm_pointers[] */,
|
||||
void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
|
||||
cl_uint /* num_svm_pointers */,
|
||||
void *[] /* svm_pointers[] */,
|
||||
void * /* user_data */),
|
||||
void * /* user_data */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */,
|
||||
cl_bool /* blocking_copy */,
|
||||
void * /* dst_ptr */,
|
||||
const void * /* src_ptr */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */,
|
||||
void * /* svm_ptr */,
|
||||
const void * /* pattern */,
|
||||
size_t /* pattern_size */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMMapARM(cl_command_queue /* command_queue */,
|
||||
cl_bool /* blocking_map */,
|
||||
cl_map_flags /* flags */,
|
||||
void * /* svm_ptr */,
|
||||
size_t /* size */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */,
|
||||
void * /* svm_ptr */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clSetKernelArgSVMPointerARM(cl_kernel /* kernel */,
|
||||
cl_uint /* arg_index */,
|
||||
const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clSetKernelExecInfoARM(cl_kernel /* kernel */,
|
||||
cl_kernel_exec_info_arm /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
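
A sketch of the coarse-grain SVM flow these ARM entry points mirror from core OpenCL 2.0; ctx, kernel and nbytes are assumed.

void *svm = clSVMAllocARM(ctx, CL_MEM_READ_WRITE, nbytes, 0 /* default alignment */);
if (svm) {
    clSetKernelArgSVMPointerARM(kernel, 0, svm);
    /* ... enqueue the kernel, then map/unmap or memcpy via the calls above ... */
    clSVMFreeARM(ctx, svm);
}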
|
||||
|
||||
#endif /* CL_VERSION_1_2 */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#endif /* __CL_EXT_H */
include/triton/external/CL/cl_ext_intel.h (429 lines, vendored)
@@ -1,429 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2017 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_ext_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
#ifndef __CL_EXT_INTEL_H
|
||||
#define __CL_EXT_INTEL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#include <OpenCL/cl_platform.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/***************************************
|
||||
* cl_intel_thread_local_exec extension *
|
||||
****************************************/
|
||||
|
||||
#define cl_intel_thread_local_exec 1
|
||||
|
||||
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
|
||||
|
||||
/***********************************************
|
||||
* cl_intel_device_partition_by_names extension *
|
||||
************************************************/
|
||||
|
||||
#define cl_intel_device_partition_by_names 1
|
||||
|
||||
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
|
||||
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
|
||||
|
||||
/************************************************
|
||||
* cl_intel_accelerator extension *
|
||||
* cl_intel_motion_estimation extension *
|
||||
* cl_intel_advanced_motion_estimation extension *
|
||||
*************************************************/
|
||||
|
||||
#define cl_intel_accelerator 1
|
||||
#define cl_intel_motion_estimation 1
|
||||
#define cl_intel_advanced_motion_estimation 1
|
||||
|
||||
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
|
||||
typedef cl_uint cl_accelerator_type_intel;
|
||||
typedef cl_uint cl_accelerator_info_intel;
|
||||
|
||||
typedef struct _cl_motion_estimation_desc_intel {
|
||||
cl_uint mb_block_type;
|
||||
cl_uint subpixel_mode;
|
||||
cl_uint sad_adjust_mode;
|
||||
cl_uint search_path_type;
|
||||
} cl_motion_estimation_desc_intel;
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_ACCELERATOR_INTEL -1094
|
||||
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
|
||||
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
|
||||
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
|
||||
|
||||
/* cl_accelerator_type_intel */
|
||||
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
|
||||
|
||||
/* cl_accelerator_info_intel */
|
||||
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
|
||||
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
|
||||
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
|
||||
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
|
||||
|
||||
/* cl_motion_detect_desc_intel flags */
|
||||
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
|
||||
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
|
||||
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
|
||||
|
||||
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
|
||||
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
|
||||
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
|
||||
|
||||
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
|
||||
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
|
||||
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
|
||||
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
|
||||
|
||||
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
|
||||
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
|
||||
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
|
||||
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
|
||||
|
||||
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
|
||||
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
|
||||
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
|
||||
|
||||
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
|
||||
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
|
||||
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
|
||||
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
|
||||
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
|
||||
|
||||
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
|
||||
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
|
||||
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
|
||||
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
|
||||
|
||||
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
|
||||
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
|
||||
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
|
||||
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
|
||||
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
|
||||
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
|
||||
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
|
||||
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
|
||||
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
|
||||
|
||||
/* cl_device_info */
|
||||
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
|
||||
|
||||
#define CL_ME_VERSION_LEGACY_INTEL 0x0
|
||||
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
|
||||
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
|
||||
|
||||
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
|
||||
clCreateAcceleratorINTEL(
|
||||
cl_context /* context */,
|
||||
cl_accelerator_type_intel /* accelerator_type */,
|
||||
size_t /* descriptor_size */,
|
||||
const void* /* descriptor */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_accelerator_type_intel /* accelerator_type */,
|
||||
size_t /* descriptor_size */,
|
||||
const void* /* descriptor */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
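
A sketch of creating a motion-estimation accelerator from the descriptor type above; ctx is an assumed cl_context.

cl_motion_estimation_desc_intel desc = {
    CL_ME_MB_TYPE_16x16_INTEL,
    CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
    CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
    CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL };

cl_int err;
cl_accelerator_intel acc = clCreateAcceleratorINTEL(
    ctx, CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
    sizeof(desc), &desc, &err);
/* ... use with the built-in motion estimation kernels ... */
clReleaseAcceleratorINTEL(acc);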
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetAcceleratorInfoINTEL(
|
||||
cl_accelerator_intel /* accelerator */,
|
||||
cl_accelerator_info_intel /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void* /* param_value */,
|
||||
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */,
|
||||
cl_accelerator_info_intel /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void* /* param_value */,
|
||||
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clRetainAcceleratorINTEL(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clReleaseAcceleratorINTEL(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
|
||||
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
/******************************************
|
||||
* cl_intel_simultaneous_sharing extension *
|
||||
*******************************************/
|
||||
|
||||
#define cl_intel_simultaneous_sharing 1
|
||||
|
||||
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
|
||||
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
|
||||
|
||||
/***********************************
|
||||
* cl_intel_egl_image_yuv extension *
|
||||
************************************/
|
||||
|
||||
#define cl_intel_egl_image_yuv 1
|
||||
|
||||
#define CL_EGL_YUV_PLANE_INTEL 0x4107
|
||||
|
||||
/********************************
|
||||
* cl_intel_packed_yuv extension *
|
||||
*********************************/
|
||||
|
||||
#define cl_intel_packed_yuv 1
|
||||
|
||||
#define CL_YUYV_INTEL 0x4076
|
||||
#define CL_UYVY_INTEL 0x4077
|
||||
#define CL_YVYU_INTEL 0x4078
|
||||
#define CL_VYUY_INTEL 0x4079
|
||||
|
||||
/********************************************
|
||||
* cl_intel_required_subgroup_size extension *
|
||||
*********************************************/
|
||||
|
||||
#define cl_intel_required_subgroup_size 1
|
||||
|
||||
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
|
||||
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
|
||||
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
|
||||
|
||||
/****************************************
|
||||
* cl_intel_driver_diagnostics extension *
|
||||
*****************************************/
|
||||
|
||||
#define cl_intel_driver_diagnostics 1
|
||||
|
||||
typedef cl_uint cl_diagnostics_verbose_level;
|
||||
|
||||
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
|
||||
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
|
||||
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
|
||||
|
||||
/********************************
|
||||
* cl_intel_planar_yuv extension *
|
||||
*********************************/
|
||||
|
||||
#define CL_NV12_INTEL 0x410E
|
||||
|
||||
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
|
||||
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
|
||||
|
||||
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
|
||||
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
|
||||
|
||||
/*******************************************************
|
||||
* cl_intel_device_side_avc_motion_estimation extension *
|
||||
********************************************************/
|
||||
|
||||
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
|
||||
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
|
||||
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
|
||||
|
||||
#define CL_AVC_ME_VERSION_0_INTEL 0x0 // No support.
#define CL_AVC_ME_VERSION_1_INTEL 0x1 // First supported version.
|
||||
|
||||
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
|
||||
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
|
||||
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
|
||||
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
|
||||
|
||||
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
|
||||
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
|
||||
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
|
||||
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
|
||||
|
||||
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
|
||||
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
|
||||
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
|
||||
|
||||
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
|
||||
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
|
||||
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
|
||||
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
|
||||
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
|
||||
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
|
||||
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
|
||||
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa

#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2

#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3

#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3

#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30

#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8

#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000

#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )

#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80

#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2

#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3

#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4

#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3

#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3

#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2

#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1

#ifdef __cplusplus
}
#endif

#endif /* __CL_EXT_INTEL_H */
include/triton/external/CL/cl_gl.h (vendored, 167 lines removed)
@@ -1,167 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
#ifndef __OPENCL_CL_GL_H
|
||||
#define __OPENCL_CL_GL_H
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl.h>
|
||||
#else
|
||||
#include "cl.h"
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef cl_uint cl_gl_object_type;
|
||||
typedef cl_uint cl_gl_texture_info;
|
||||
typedef cl_uint cl_gl_platform_info;
|
||||
typedef struct __GLsync *cl_GLsync;
|
||||
|
||||
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
|
||||
#define CL_GL_OBJECT_BUFFER 0x2000
|
||||
#define CL_GL_OBJECT_TEXTURE2D 0x2001
|
||||
#define CL_GL_OBJECT_TEXTURE3D 0x2002
|
||||
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
|
||||
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
|
||||
#define CL_GL_OBJECT_TEXTURE1D 0x200F
|
||||
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
|
||||
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
|
||||
|
||||
/* cl_gl_texture_info */
|
||||
#define CL_GL_TEXTURE_TARGET 0x2004
|
||||
#define CL_GL_MIPMAP_LEVEL 0x2005
|
||||
#define CL_GL_NUM_SAMPLES 0x2012
|
||||
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLBuffer(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLuint /* bufobj */,
|
||||
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromGLRenderbuffer(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLuint /* renderbuffer */,
|
||||
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLObjectInfo(cl_mem /* memobj */,
|
||||
cl_gl_object_type * /* gl_object_type */,
|
||||
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLTextureInfo(cl_mem /* memobj */,
|
||||
cl_gl_texture_info /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void * /* param_value */,
|
||||
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem * /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event * /* event_wait_list */,
|
||||
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
|
||||
/* Deprecated OpenCL 1.1 APIs */
|
||||
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture2D(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
||||
|
||||
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
|
||||
clCreateFromGLTexture3D(cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
cl_GLenum /* target */,
|
||||
cl_GLint /* miplevel */,
|
||||
cl_GLuint /* texture */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
|
||||
|
||||
/* cl_khr_gl_sharing extension */
|
||||
|
||||
#define cl_khr_gl_sharing 1
|
||||
|
||||
typedef cl_uint cl_gl_context_info;
|
||||
|
||||
/* Additional Error Codes */
|
||||
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
|
||||
|
||||
/* cl_gl_context_info */
|
||||
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
|
||||
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
|
||||
|
||||
/* Additional cl_context_properties */
|
||||
#define CL_GL_CONTEXT_KHR 0x2008
|
||||
#define CL_EGL_DISPLAY_KHR 0x2009
|
||||
#define CL_GLX_DISPLAY_KHR 0x200A
|
||||
#define CL_WGL_HDC_KHR 0x200B
|
||||
#define CL_CGL_SHAREGROUP_KHR 0x200C
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
|
||||
cl_gl_context_info /* param_name */,
|
||||
size_t /* param_value_size */,
|
||||
void * /* param_value */,
|
||||
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
|
||||
const cl_context_properties * properties,
|
||||
cl_gl_context_info param_name,
|
||||
size_t param_value_size,
|
||||
void * param_value,
|
||||
size_t * param_value_size_ret);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_GL_H */
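
/* For orientation only: the vendored cl_gl.h above declared the OpenCL/OpenGL sharing
 * entry points that are being dropped from the tree. A minimal, hypothetical interop
 * sequence using exactly the functions declared above would look like this; the
 * `context`, `queue` and `gl_buffer` handles are assumed to already exist and are not
 * part of this diff.
 *
 *   cl_int err;
 *   cl_mem mem = clCreateFromGLBuffer(context, CL_MEM_READ_WRITE, gl_buffer, &err);
 *
 *   // A GL object must be acquired before OpenCL may touch it, and released afterwards.
 *   err = clEnqueueAcquireGLObjects(queue, 1, &mem, 0, NULL, NULL);
 *   // ... enqueue kernels that read or write `mem` ...
 *   err = clEnqueueReleaseGLObjects(queue, 1, &mem, 0, NULL, NULL);
 */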
|
include/triton/external/CL/cl_gl_ext.h (vendored, 74 lines removed)
@@ -1,74 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
|
||||
/* OpenGL dependencies. */
|
||||
|
||||
#ifndef __OPENCL_CL_GL_EXT_H
|
||||
#define __OPENCL_CL_GL_EXT_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
#include <OpenCL/cl_gl.h>
|
||||
#else
|
||||
#include "cl_gl.h"
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For each extension, follow this template
|
||||
* cl_VEN_extname extension */
|
||||
/* #define cl_VEN_extname 1
|
||||
* ... define new types, if any
|
||||
* ... define new tokens, if any
|
||||
* ... define new APIs, if any
|
||||
*
|
||||
* If you need GLtypes here, mirror them with a cl_GLtype, rather than including a GL header
|
||||
* This allows us to avoid having to decide whether to include GL headers or GLES here.
|
||||
*/
|
||||
|
||||
/*
|
||||
* cl_khr_gl_event extension
|
||||
* See section 9.9 in the OpenCL 1.1 spec for more information
|
||||
*/
|
||||
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
|
||||
|
||||
extern CL_API_ENTRY cl_event CL_API_CALL
|
||||
clCreateEventFromGLsyncKHR(cl_context /* context */,
|
||||
cl_GLsync /* cl_GLsync */,
|
||||
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_GL_EXT_H */
|
include/triton/external/CL/cl_platform.h (vendored, 1458 lines; diff suppressed because it is too large)
include/triton/external/CL/cl_va_api_media_sharing_intel.h (vendored, 172 lines removed)
@@ -1,172 +0,0 @@
|
||||
/**********************************************************************************
|
||||
* Copyright (c) 2008-2016 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
**********************************************************************************/
|
||||
/*****************************************************************************\
|
||||
|
||||
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
|
||||
|
||||
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
|
||||
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
||||
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
|
||||
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
File Name: cl_va_api_media_sharing_intel.h
|
||||
|
||||
Abstract:
|
||||
|
||||
Notes:
|
||||
|
||||
\*****************************************************************************/
|
||||
|
||||
|
||||
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
|
||||
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_platform.h"
|
||||
#include <va/va.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/******************************************
|
||||
* cl_intel_va_api_media_sharing extension *
|
||||
*******************************************/
|
||||
|
||||
#define cl_intel_va_api_media_sharing 1
|
||||
|
||||
/* error codes */
|
||||
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
|
||||
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
|
||||
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
|
||||
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
|
||||
|
||||
/* cl_va_api_device_source_intel */
|
||||
#define CL_VA_API_DISPLAY_INTEL 0x4094
|
||||
|
||||
/* cl_va_api_device_set_intel */
|
||||
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
|
||||
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
|
||||
|
||||
/* cl_context_info */
|
||||
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
|
||||
|
||||
/* cl_mem_info */
|
||||
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
|
||||
|
||||
/* cl_image_info */
|
||||
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
|
||||
|
||||
/* cl_command_type */
|
||||
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
|
||||
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
|
||||
|
||||
typedef cl_uint cl_va_api_device_source_intel;
|
||||
typedef cl_uint cl_va_api_device_set_intel;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
|
||||
cl_platform_id /* platform */,
|
||||
cl_va_api_device_source_intel /* media_adapter_type */,
|
||||
void* /* media_adapter */,
|
||||
cl_va_api_device_set_intel /* media_adapter_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
|
||||
cl_platform_id /* platform */,
|
||||
cl_va_api_device_source_intel /* media_adapter_type */,
|
||||
void* /* media_adapter */,
|
||||
cl_va_api_device_set_intel /* media_adapter_set */,
|
||||
cl_uint /* num_entries */,
|
||||
cl_device_id* /* devices */,
|
||||
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_mem CL_API_CALL
|
||||
clCreateFromVA_APIMediaSurfaceINTEL(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
VASurfaceID* /* surface */,
|
||||
cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
|
||||
cl_context /* context */,
|
||||
cl_mem_flags /* flags */,
|
||||
VASurfaceID* /* surface */,
|
||||
cl_uint /* plane */,
|
||||
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
extern CL_API_ENTRY cl_int CL_API_CALL
|
||||
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
|
||||
cl_command_queue /* command_queue */,
|
||||
cl_uint /* num_objects */,
|
||||
const cl_mem* /* mem_objects */,
|
||||
cl_uint /* num_events_in_wait_list */,
|
||||
const cl_event* /* event_wait_list */,
|
||||
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */
|
||||
|
include/triton/external/CL/opencl.h (vendored, 59 lines removed)
@@ -1,59 +0,0 @@
|
||||
/*******************************************************************************
|
||||
* Copyright (c) 2008-2015 The Khronos Group Inc.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and/or associated documentation files (the
|
||||
* "Materials"), to deal in the Materials without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Materials, and to
|
||||
* permit persons to whom the Materials are furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included
|
||||
* in all copies or substantial portions of the Materials.
|
||||
*
|
||||
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
|
||||
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
|
||||
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
|
||||
* https://www.khronos.org/registry/
|
||||
*
|
||||
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
|
||||
******************************************************************************/
|
||||
|
||||
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
|
||||
|
||||
#ifndef __OPENCL_H
|
||||
#define __OPENCL_H
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
||||
#include <OpenCL/cl.h>
|
||||
#include <OpenCL/cl_gl.h>
|
||||
#include <OpenCL/cl_gl_ext.h>
|
||||
#include <OpenCL/cl_ext.h>
|
||||
|
||||
#else
|
||||
|
||||
#include "cl.h"
|
||||
#include "cl_gl.h"
|
||||
#include "cl_gl_ext.h"
|
||||
#include "cl_ext.h"
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __OPENCL_H */
|
||||
|
include/triton/external/hip.h (vendored, new file, 288 lines added)
@@ -0,0 +1,288 @@
/*
 * @brief hipError_t
 * @enum
 * @ingroup Enumerations
 */
// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
// the NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in the NVCC path.

// Ignoring error-code return values from hip APIs is discouraged. On C++17,
// we can make that yield a warning.

#include <cstddef>
|
||||
|
||||
typedef enum hipError_t {
|
||||
hipSuccess = 0, ///< Successful completion.
|
||||
hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL
|
||||
///< or not in an acceptable range.
|
||||
hipErrorOutOfMemory = 2,
|
||||
// Deprecated
|
||||
hipErrorMemoryAllocation = 2, ///< Memory allocation error.
|
||||
hipErrorNotInitialized = 3,
|
||||
// Deprecated
|
||||
hipErrorInitializationError = 3,
|
||||
hipErrorDeinitialized = 4,
|
||||
hipErrorProfilerDisabled = 5,
|
||||
hipErrorProfilerNotInitialized = 6,
|
||||
hipErrorProfilerAlreadyStarted = 7,
|
||||
hipErrorProfilerAlreadyStopped = 8,
|
||||
hipErrorInvalidConfiguration = 9,
|
||||
hipErrorInvalidPitchValue = 12,
|
||||
hipErrorInvalidSymbol = 13,
|
||||
hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer
|
||||
hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction
|
||||
hipErrorInsufficientDriver = 35,
|
||||
hipErrorMissingConfiguration = 52,
|
||||
hipErrorPriorLaunchFailure = 53,
|
||||
hipErrorInvalidDeviceFunction = 98,
|
||||
hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices
|
||||
hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices.
|
||||
hipErrorInvalidImage = 200,
|
||||
hipErrorInvalidContext = 201, ///< Produced when input context is invalid.
|
||||
hipErrorContextAlreadyCurrent = 202,
|
||||
hipErrorMapFailed = 205,
|
||||
// Deprecated
|
||||
hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr.
|
||||
hipErrorUnmapFailed = 206,
|
||||
hipErrorArrayIsMapped = 207,
|
||||
hipErrorAlreadyMapped = 208,
|
||||
hipErrorNoBinaryForGpu = 209,
|
||||
hipErrorAlreadyAcquired = 210,
|
||||
hipErrorNotMapped = 211,
|
||||
hipErrorNotMappedAsArray = 212,
|
||||
hipErrorNotMappedAsPointer = 213,
|
||||
hipErrorECCNotCorrectable = 214,
|
||||
hipErrorUnsupportedLimit = 215,
|
||||
hipErrorContextAlreadyInUse = 216,
|
||||
hipErrorPeerAccessUnsupported = 217,
|
||||
hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
|
||||
hipErrorInvalidGraphicsContext = 219,
|
||||
hipErrorInvalidSource = 300,
|
||||
hipErrorFileNotFound = 301,
|
||||
hipErrorSharedObjectSymbolNotFound = 302,
|
||||
hipErrorSharedObjectInitFailed = 303,
|
||||
hipErrorOperatingSystem = 304,
|
||||
hipErrorInvalidHandle = 400,
|
||||
// Deprecated
|
||||
hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid.
|
||||
hipErrorNotFound = 500,
|
||||
hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not
|
||||
///< ready. This is not actually an error, but is used to distinguish
|
||||
///< from hipSuccess (which indicates completion). APIs that return
|
||||
///< this error include hipEventQuery and hipStreamQuery.
|
||||
hipErrorIllegalAddress = 700,
|
||||
hipErrorLaunchOutOfResources = 701, ///< Out of resources error.
|
||||
hipErrorLaunchTimeOut = 702,
|
||||
hipErrorPeerAccessAlreadyEnabled =
|
||||
704, ///< Peer access was already enabled from the current device.
|
||||
hipErrorPeerAccessNotEnabled =
|
||||
705, ///< Peer access was never enabled from the current device.
|
||||
hipErrorSetOnActiveProcess = 708,
|
||||
hipErrorAssert = 710, ///< Produced when the kernel calls assert.
|
||||
hipErrorHostMemoryAlreadyRegistered =
|
||||
712, ///< Produced when trying to lock a page-locked memory.
|
||||
hipErrorHostMemoryNotRegistered =
|
||||
713, ///< Produced when trying to unlock a non-page-locked memory.
|
||||
hipErrorLaunchFailure =
|
||||
719, ///< An exception occurred on the device while executing a kernel.
|
||||
hipErrorCooperativeLaunchTooLarge =
|
||||
720, ///< This error indicates that the number of blocks launched per grid for a kernel
|
||||
///< that was launched via cooperative launch APIs exceeds the maximum number of
|
||||
///< allowed blocks for the current device
|
||||
hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented
|
||||
hipErrorUnknown = 999, //< Unknown error.
|
||||
// HSA Runtime Error Codes start here.
|
||||
hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen
|
||||
///< in production systems.
|
||||
hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically
|
||||
///< not seen in production systems.
|
||||
hipErrorTbd ///< Marker that more error codes are needed.
|
||||
} hipError_t;
|
||||
|
||||
|
||||
typedef struct ihipCtx_t* hipCtx_t;
|
||||
|
||||
// Note many APIs also use integer deviceIds as an alternative to the device pointer:
|
||||
typedef int hipDevice_t;
|
||||
|
||||
typedef enum hipDeviceP2PAttr {
|
||||
hipDevP2PAttrPerformanceRank = 0,
|
||||
hipDevP2PAttrAccessSupported,
|
||||
hipDevP2PAttrNativeAtomicSupported,
|
||||
hipDevP2PAttrHipArrayAccessSupported
|
||||
} hipDeviceP2PAttr;
|
||||
|
||||
typedef struct ihipStream_t* hipStream_t;
|
||||
|
||||
#define hipIpcMemLazyEnablePeerAccess 0
|
||||
|
||||
#define HIP_IPC_HANDLE_SIZE 64
|
||||
|
||||
typedef struct hipIpcMemHandle_st {
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
} hipIpcMemHandle_t;
|
||||
|
||||
typedef struct hipIpcEventHandle_st {
|
||||
char reserved[HIP_IPC_HANDLE_SIZE];
|
||||
} hipIpcEventHandle_t;
|
||||
|
||||
typedef struct ihipModule_t* hipModule_t;
|
||||
|
||||
typedef struct ihipModuleSymbol_t* hipFunction_t;
|
||||
|
||||
typedef struct hipFuncAttributes {
|
||||
int binaryVersion;
|
||||
int cacheModeCA;
|
||||
size_t constSizeBytes;
|
||||
size_t localSizeBytes;
|
||||
int maxDynamicSharedSizeBytes;
|
||||
int maxThreadsPerBlock;
|
||||
int numRegs;
|
||||
int preferredShmemCarveout;
|
||||
int ptxVersion;
|
||||
size_t sharedSizeBytes;
|
||||
} hipFuncAttributes;
|
||||
|
||||
typedef struct ihipEvent_t* hipEvent_t;
|
||||
|
||||
/*
|
||||
* @brief hipDeviceAttribute_t
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
typedef enum hipDeviceAttribute_t {
|
||||
hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
|
||||
hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
|
||||
hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
|
||||
hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
|
||||
hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
|
||||
hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in
|
||||
///< bytes.
|
||||
hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
|
||||
hipDeviceAttributeWarpSize, ///< Warp size in threads.
|
||||
hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a
|
||||
///< thread block. This number is shared by all thread
|
||||
///< blocks simultaneously resident on a
|
||||
///< multiprocessor.
|
||||
hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
|
||||
hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz.
|
||||
hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits.
|
||||
hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
|
||||
hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
|
||||
hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
|
||||
///< cache.
|
||||
hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per
|
||||
///< multiprocessor.
|
||||
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
|
||||
hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
|
||||
hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels
|
||||
///< concurrently.
|
||||
hipDeviceAttributePciBusId, ///< PCI Bus ID.
|
||||
hipDeviceAttributePciDeviceId, ///< PCI Device ID.
|
||||
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per
|
||||
///< Multiprocessor.
|
||||
hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
|
||||
hipDeviceAttributeIntegrated, ///< iGPU
|
||||
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
|
||||
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
|
||||
hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
|
||||
hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
|
||||
hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimensions height of 3D images in image elements
|
||||
hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimensions depth of 3D images in image elements
|
||||
|
||||
hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
|
||||
hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
|
||||
|
||||
hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
|
||||
hipDeviceAttributeTextureAlignment, ///<Alignment requirement for textures
|
||||
hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
|
||||
hipDeviceAttributeKernelExecTimeout, ///<Run time limit for kernels executed on the device
|
||||
hipDeviceAttributeCanMapHostMemory, ///<Device can map host memory into device address space
|
||||
hipDeviceAttributeEccEnabled, ///<Device has ECC support enabled
|
||||
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched functions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched grid dimensions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched block dimensions
|
||||
hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
|
||||
///devices with unmatched shared memories
|
||||
hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device
|
||||
hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system
|
||||
hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
|
||||
/// the device without migration
|
||||
hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory
|
||||
/// concurrently with the CPU
|
||||
hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory
|
||||
/// without calling hipHostRegister on it
|
||||
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
|
||||
/// the host's page tables
|
||||
hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
|
||||
///< hipStreamWaitValue64() , '0' otherwise.
|
||||
|
||||
} hipDeviceAttribute_t;
|
||||
|
||||
typedef void* hipDeviceptr_t;
|
||||
|
||||
/*
|
||||
* @brief hipJitOption
|
||||
* @enum
|
||||
* @ingroup Enumerations
|
||||
*/
|
||||
typedef enum hipJitOption {
|
||||
hipJitOptionMaxRegisters = 0,
|
||||
hipJitOptionThreadsPerBlock,
|
||||
hipJitOptionWallTime,
|
||||
hipJitOptionInfoLogBuffer,
|
||||
hipJitOptionInfoLogBufferSizeBytes,
|
||||
hipJitOptionErrorLogBuffer,
|
||||
hipJitOptionErrorLogBufferSizeBytes,
|
||||
hipJitOptionOptimizationLevel,
|
||||
hipJitOptionTargetFromContext,
|
||||
hipJitOptionTarget,
|
||||
hipJitOptionFallbackStrategy,
|
||||
hipJitOptionGenerateDebugInfo,
|
||||
hipJitOptionLogVerbose,
|
||||
hipJitOptionGenerateLineInfo,
|
||||
hipJitOptionCacheMode,
|
||||
hipJitOptionSm3xOpt,
|
||||
hipJitOptionFastCompile,
|
||||
hipJitOptionNumOptions
|
||||
} hipJitOption;
|
||||
|
||||
/**
|
||||
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
|
||||
*/
|
||||
typedef enum hipFuncAttribute {
|
||||
hipFuncAttributeMaxDynamicSharedMemorySize = 8,
|
||||
hipFuncAttributePreferredSharedMemoryCarveout = 9,
|
||||
hipFuncAttributeMax
|
||||
} hipFuncAttribute;
|
||||
|
||||
/**
|
||||
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
|
||||
*/
|
||||
typedef enum hipFuncCache_t {
|
||||
hipFuncCachePreferNone, ///< no preference for shared memory or L1 (default)
|
||||
hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache
|
||||
hipFuncCachePreferL1, ///< prefer larger L1 cache and smaller shared memory
|
||||
hipFuncCachePreferEqual, ///< prefer equal size L1 cache and shared memory
|
||||
} hipFuncCache_t;
|
||||
|
||||
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
|
||||
#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
|
||||
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
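
For orientation, the sketch below shows how a driver-style runtime typically consumes these declarations, which is the kind of usage this PR targets (running an empty kernel on an AMD device). It is illustrative only: hipDeviceGetAttribute, hipModuleLoad, hipModuleGetFunction, hipModuleLaunchKernel and hipDeviceSynchronize come from the full HIP runtime rather than from this trimmed header, and the code-object name "my_kernel.hsaco" and kernel name "empty" are made up.

#include <cstdio>
#include <hip/hip_runtime.h>  // assumption: the real HIP runtime headers, not the excerpt above

// Minimal error check built on hipError_t / hipSuccess from the enum above.
#define HIP_CHECK(call)                                                              \
  do {                                                                               \
    hipError_t err_ = (call);                                                        \
    if (err_ != hipSuccess) {                                                        \
      std::fprintf(stderr, "HIP error %d at %s:%d\n", (int)err_, __FILE__, __LINE__); \
      return 1;                                                                      \
    }                                                                                \
  } while (0)

int main() {
  // Query a device attribute using the hipDeviceAttribute_t enum declared above.
  int shared_per_block = 0;
  HIP_CHECK(hipDeviceGetAttribute(&shared_per_block,
                                  hipDeviceAttributeMaxSharedMemoryPerBlock, /*deviceId=*/0));

  // Load a pre-compiled code object and launch a kernel through the module API,
  // packing arguments with the HIP_LAUNCH_PARAM_* markers defined above.
  hipModule_t mod;
  hipFunction_t fn;
  HIP_CHECK(hipModuleLoad(&mod, "my_kernel.hsaco"));   // hypothetical file name
  HIP_CHECK(hipModuleGetFunction(&fn, mod, "empty"));  // hypothetical kernel name

  char args[1];                      // an empty kernel takes no real arguments
  size_t args_size = sizeof(args);
  void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, args,
                    HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
                    HIP_LAUNCH_PARAM_END};
  HIP_CHECK(hipModuleLaunchKernel(fn, 1, 1, 1, 64, 1, 1, /*sharedMemBytes=*/0,
                                  /*stream=*/nullptr, /*kernelParams=*/nullptr,
                                  /*extra=*/config));
  HIP_CHECK(hipDeviceSynchronize());
  return 0;
}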
|
@@ -13,45 +13,40 @@
#include "triton/codegen/transform/peephole.h"
#include "triton/codegen/transform/pipeline.h"
#include "triton/codegen/transform/prefetch.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/module.h"
#include "triton/ir/function.h"
#include "triton/ir/module.h"
#include "triton/ir/print.h"
#include "llvm/IR/Module.h"

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
namespace triton {
namespace codegen {

// TODO:
// There should be a proper pass manager there!
void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache,
                            driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) {
std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
                                                     int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
  // generate llvm code
  llvm::LLVMContext ctx;
  std::string name = ir.get_function_list()[0]->get_name();
  std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
  // optimizations
  std::unique_ptr<codegen::target> target = dev->make_target();
  bool cts_use_async = target->as_nvidia()->sm() >= 80;
  bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
  // create passes
  codegen::analysis::align align;
  codegen::analysis::axes axes;
  codegen::transform::cts cts(cts_use_async);
  codegen::transform::pipeline pipeline(cts_use_async, num_stages);
  codegen::transform::disassociate disassociate;
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get());
  codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
  codegen::analysis::liveness liveness(&layouts);
  codegen::analysis::swizzle swizzle(&layouts, target.get());
  codegen::analysis::swizzle swizzle(&layouts, target);
  codegen::analysis::allocation allocation(&liveness);
  codegen::transform::dce dce;
  codegen::transform::peephole peephole(target.get(), &layouts);
  // codegen::transform::reassociate reassociate;
  codegen::transform::peephole peephole(target, &layouts);
  codegen::transform::coalesce coalesce(&align, &layouts);
  codegen::transform::prefetch prefetch_s(target.get());
  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get());
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache);
  codegen::transform::prefetch prefetch_s(target);
  codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
  codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
  // run passes
  dce.run(ir);
  peephole.run(ir);
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  layouts.run(ir);
  coalesce.run(ir);
  dce.run(ir);
  // exit(1);

  align.run(ir);
  dce.run(ir);
  if (target->is_gpu())
    cts.run(ir);
  dce.run(ir);
  align.run(ir);
  // ir::print(ir, std::cout);
  axes.run(ir);
  layouts.run(ir);
  peephole.run(ir);
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
  allocation.run(ir);
  prefetch_s.run(ir);
  barriers.run(ir);
  // ir.print(std::cout);
  isel.visit(ir, *llvm);
  mod = driver::module::create(dev, std::move(llvm));
  ker = driver::kernel::create(&*mod, name.c_str());
  shared_mem = allocation.allocated_size();
  shared_static = allocation.allocated_size();
  return llvm;
}

} // namespace codegen
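
With this change, add_passes_to_emit_bin no longer talks to the removed driver layer: it only lowers Triton IR to an llvm::Module and reports the statically allocated shared memory. A hypothetical caller-side sketch follows; `ir` (a populated triton::ir::module) and `target` (a codegen::target* obtained from the surrounding runtime, NVIDIA or AMD) are assumptions, not part of this diff.

// Sketch only, not the actual call site in this PR.
llvm::LLVMContext ctx;
int shared_static = 0;
std::unique_ptr<llvm::Module> llvm_mod = triton::codegen::add_passes_to_emit_bin(
    ir, ctx, target, /*cc=*/80, /*num_warps=*/4, /*num_stages=*/3,
    /*force_nc_cache=*/false, shared_static);
// The caller is now responsible for turning `llvm_mod` into PTX or AMDGPU code and for
// loading and launching the resulting binary; driver::module / driver::kernel creation
// no longer happens inside this function.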
@@ -1,231 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <stdexcept>
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/backend.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Platforms ------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::platforms::init() {
|
||||
if(!cache_.empty())
|
||||
return;
|
||||
//if CUDA is here
|
||||
if(dispatch::cuinit()){
|
||||
cache_.push_back(new cu_platform());
|
||||
}
|
||||
//if host should be added
|
||||
bool host_visible = true;
|
||||
if(host_visible){
|
||||
cache_.push_back(new host_platform());
|
||||
}
|
||||
|
||||
// //if OpenCL is here
|
||||
// if(dispatch::clinit()){
|
||||
// cl_uint num_platforms;
|
||||
// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
|
||||
// std::vector<cl_platform_id> ids(num_platforms);
|
||||
// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
|
||||
// for(cl_platform_id id: ids)
|
||||
// cache_.push_back(new cl_platform(id));
|
||||
// }
|
||||
|
||||
if(cache_.empty())
|
||||
throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
|
||||
}
|
||||
|
||||
void backend::platforms::get(std::vector<platform *> &results) {
|
||||
std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
|
||||
}
|
||||
|
||||
std::vector<driver::platform*> backend::platforms::cache_;
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Devices --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::devices::init(std::vector<platform*> const & platforms) {
|
||||
if(!cache_.empty())
|
||||
return;
|
||||
for(driver::platform* pf: platforms)
|
||||
pf->devices(cache_);
|
||||
if(cache_.empty())
|
||||
throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
|
||||
}
|
||||
|
||||
void backend::devices::get(std::vector<device*> &devs) {
|
||||
std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
|
||||
}
|
||||
|
||||
std::vector<driver::device*> backend::devices::cache_;
|
||||
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//---------- Modules ----------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::modules::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x.second;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
std::map<std::tuple<driver::stream*, std::string>, driver::module*> backend::modules::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//----------- Kernels --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::kernels::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x.second;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
|
||||
std::tuple<driver::module*, std::string> key(mod, name);
|
||||
if(cache_.find(key)==cache_.end()){
|
||||
return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
|
||||
}
|
||||
return cache_.at(key);
|
||||
}
|
||||
|
||||
std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ Queues --------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::streams::init(std::list<driver::context*> const & contexts){
|
||||
for(driver::context* ctx : contexts)
|
||||
if(cache_.find(ctx)==cache_.end())
|
||||
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
|
||||
}
|
||||
|
||||
void backend::streams::release(){
|
||||
for(auto & x: cache_)
|
||||
for(auto & y: x.second)
|
||||
delete y;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::stream* backend::streams::get_default()
|
||||
{ return get(contexts::get_default(), 0); }
|
||||
|
||||
driver::stream* backend::streams::get(driver::context* context, unsigned int id){
|
||||
init(std::list<driver::context*>(1,context));
|
||||
for(auto & x : cache_)
|
||||
if(x.first==context)
|
||||
return x.second[id];
|
||||
throw;
|
||||
}
|
||||
|
||||
void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
|
||||
init(std::list<driver::context*>(1,context));
|
||||
queues = cache_.at(context);
|
||||
}
|
||||
|
||||
std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ Contexts ------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::contexts::init(std::vector<driver::device*> const & devices){
|
||||
for(driver::device* dvc: devices)
|
||||
cache_.push_back(driver::context::create(dvc));
|
||||
}
|
||||
|
||||
void backend::contexts::release(){
|
||||
for(auto & x: cache_)
|
||||
delete x;
|
||||
cache_.clear();
|
||||
}
|
||||
|
||||
driver::context* backend::contexts::get_default(){
|
||||
backend::init();
|
||||
auto it = cache_.begin();
|
||||
std::advance(it, default_device);
|
||||
return *it;
|
||||
}
|
||||
|
||||
void backend::contexts::get(std::list<driver::context*> & contexts){
|
||||
backend::init();
|
||||
contexts = cache_;
|
||||
}
|
||||
|
||||
std::list<driver::context*> backend::contexts::cache_;
|
||||
|
||||
|
||||
|
||||
/*-----------------------------------*/
|
||||
//------------ General -------------*/
|
||||
/*-----------------------------------*/
|
||||
|
||||
void backend::synchronize(driver::context* context){
|
||||
for(driver::stream * queue: streams::cache_.at(context))
|
||||
queue->synchronize();
|
||||
}
|
||||
|
||||
|
||||
void backend::release(){
|
||||
backend::kernels::release();
|
||||
// backend::programs::release();
|
||||
backend::streams::release();
|
||||
backend::contexts::release();
|
||||
}
|
||||
|
||||
|
||||
void backend::init(){
|
||||
if(!contexts::cache_.empty())
|
||||
return;
|
||||
// initialize platforms
|
||||
backend::platforms::init();
|
||||
// initialize devices
|
||||
backend::devices::init(platforms::cache_);
|
||||
// initialize contexts
|
||||
backend::contexts::init(devices::cache_);
|
||||
// initialize streams
|
||||
streams::init(contexts::cache_);
|
||||
}
|
||||
|
||||
unsigned int backend::default_device = 0;
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,90 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/dispatch.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
//
|
||||
|
||||
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||
: polymorphic_resource(cu, take_ownership), size_(size) { }
|
||||
|
||||
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
|
||||
: polymorphic_resource(hst, take_ownership), size_(size) { }
|
||||
|
||||
size_t buffer::size() {
|
||||
return size_;
|
||||
}
|
||||
|
||||
uintptr_t buffer::addr_as_uintptr_t() {
|
||||
switch(backend_){
|
||||
case CUDA: return *cu_;
|
||||
case Host: return (uintptr_t)hst_->data;
|
||||
default: return 0;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
buffer* buffer::create(driver::context* ctx, size_t size) {
|
||||
switch(ctx->backend()){
|
||||
case CUDA: return new cu_buffer(size);
|
||||
case Host: return new host_buffer(size);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
//
|
||||
|
||||
host_buffer::host_buffer(size_t size)
|
||||
: buffer(size, host_buffer_t(), true){
|
||||
hst_->data = new char[size];
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
|
||||
cu_buffer::cu_buffer(size_t size)
|
||||
: buffer(size, CUdeviceptr(), true) {
|
||||
dispatch::cuMemAlloc(&*cu_, size);
|
||||
}
|
||||
|
||||
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
|
||||
: buffer(size, cu, take_ownership){
|
||||
}
|
||||
|
||||
void cu_buffer::set_zero(driver::stream* queue, size_t size){
|
||||
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,118 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// BASE //
|
||||
/* ------------------------ */
|
||||
|
||||
context::context(driver::device *dev, CUcontext cu, bool take_ownership):
|
||||
polymorphic_resource(cu, take_ownership),
|
||||
dev_(dev), cache_path_(get_cache_path()) {
|
||||
}
|
||||
|
||||
context::context(driver::device *dev, host_context_t hst, bool take_ownership):
|
||||
polymorphic_resource(hst, take_ownership),
|
||||
dev_(dev), cache_path_(get_cache_path()){
|
||||
}
|
||||
|
||||
context* context::create(driver::device *dev){
|
||||
switch(dev->backend()){
|
||||
case CUDA: return new cu_context(dev);
|
||||
case Host: return new host_context(dev);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
driver::device* context::device() const {
|
||||
return dev_;
|
||||
}
|
||||
|
||||
std::string context::get_cache_path(){
|
||||
//user-specified cache path
|
||||
std::string result = tools::getenv("TRITON_CACHE_PATH");
|
||||
if(!result.empty()){
|
||||
if(tools::mkpath(result)==0)
|
||||
return result;
|
||||
}
|
||||
//create in home
|
||||
result = tools::getenv("HOME");
|
||||
if(!result.empty())
|
||||
{
|
||||
result = result + "/.triton/cache/";
|
||||
if(tools::mkpath(result)==0)
|
||||
return result;
|
||||
}
|
||||
//couldn't find a directory
|
||||
return "";
|
||||
}
|
||||
|
||||
std::string const & context::cache_path() const{
|
||||
return cache_path_;
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
|
||||
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
// import CUdevice
|
||||
CUdevice cu_context::get_device_of(CUcontext context){
|
||||
dispatch::cuCtxPushCurrent_v2(context);
|
||||
CUdevice res;
|
||||
dispatch::cuCtxGetDevice(&res);
|
||||
dispatch::cuCtxPopCurrent_v2(NULL);
|
||||
return res;
|
||||
}
|
||||
|
||||
// wrapper for cuda context
|
||||
cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
|
||||
context, take_ownership) {
|
||||
}
|
||||
|
||||
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
|
||||
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
|
||||
// dispatch::cuCtxPopCurrent_v2(NULL);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,192 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <map>
|
||||
#include <algorithm>
|
||||
#include <sstream>
|
||||
#include <cstring>
|
||||
#include <memory>
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/codegen/target.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
std::unique_ptr<codegen::target> host_device::make_target() const {
|
||||
return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
// information query
|
||||
template<CUdevice_attribute attr>
|
||||
int cu_device::cuGetInfo() const{
|
||||
int res;
|
||||
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
|
||||
return res;
|
||||
}
|
||||
|
||||
// convert to nvml
|
||||
nvmlDevice_t cu_device::nvml_device() const{
|
||||
std::map<std::string, nvmlDevice_t> map;
|
||||
std::string key = pci_bus_id();
|
||||
if(map.find(key)==map.end()){
|
||||
nvmlDevice_t device;
|
||||
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
|
||||
return map.insert(std::make_pair(key, device)).first->second;
|
||||
}
|
||||
return map.at(key);
|
||||
}
|
||||
|
||||
// number of address bits
|
||||
size_t cu_device::address_bits() const{
|
||||
return sizeof(size_t)*8;
|
||||
}
|
||||
|
||||
// name
|
||||
std::string cu_device::name() const {
|
||||
char tmp[128];
|
||||
dispatch::cuDeviceGetName(tmp, 128, *cu_);
|
||||
return std::string(tmp);
|
||||
}
|
||||
|
||||
// PCI bus ID
|
||||
std::string cu_device::pci_bus_id() const{
|
||||
char tmp[128];
|
||||
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
|
||||
return std::string(tmp);
|
||||
}
|
||||
|
||||
// force the device to be interpreted as a particular cc
|
||||
void cu_device::interpret_as(int cc){
|
||||
interpreted_as_ = std::make_shared<int>(cc);
|
||||
}
|
||||
|
||||
// compute capability
|
||||
int cu_device::compute_capability() const {
|
||||
if(interpreted_as_)
|
||||
return *interpreted_as_;
|
||||
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
|
||||
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
|
||||
return major*10 + minor;
|
||||
}
|
||||
|
||||
// maximum number of threads per block
|
||||
size_t cu_device::max_threads_per_block() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
|
||||
}
|
||||
|
||||
// maximum amount of shared memory per block
|
||||
size_t cu_device::max_shared_memory() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
|
||||
}
|
||||
|
||||
// warp size
|
||||
size_t cu_device::warp_size() const {
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
|
||||
}
|
||||
|
||||
|
||||
// maximum block dimensions
|
||||
std::vector<size_t> cu_device::max_block_dim() const {
|
||||
std::vector<size_t> result(3);
|
||||
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
|
||||
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
|
||||
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
|
||||
return result;
|
||||
}
|
||||
|
||||
// current SM clock
|
||||
size_t cu_device::current_sm_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// max SM clock
|
||||
size_t cu_device::max_sm_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// current memory clock
|
||||
size_t cu_device::current_mem_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// max memory clock
|
||||
size_t cu_device::max_mem_clock() const{
|
||||
unsigned int result;
|
||||
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
|
||||
return result;
|
||||
}
|
||||
|
||||
// set application clocks to their maximum values
|
||||
void cu_device::set_max_clock() {
|
||||
dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
|
||||
}
|
||||
|
||||
void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
|
||||
CUcontext context;
|
||||
dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
|
||||
try {
|
||||
dispatch::cuCtxEnablePeerAccess(context, 0);
|
||||
} catch (exception::cuda::peer_access_already_enabled) {}
|
||||
}
|
||||
|
||||
// print infos
|
||||
std::string cu_device::infos() const{
|
||||
std::ostringstream oss;
|
||||
std::vector<size_t> max_wi_sizes = max_block_dim();
|
||||
oss << "Platform: CUDA" << std::endl;
|
||||
oss << "Name: " << name() << std::endl;
|
||||
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
|
||||
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
|
||||
oss << "Local memory size: " << max_shared_memory() << std::endl;
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
// target
|
||||
std::unique_ptr<codegen::target> cu_device::make_target() const {
|
||||
return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -21,7 +21,6 @@
|
||||
*/
|
||||
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
|
||||
namespace triton
|
||||
@@ -31,65 +30,65 @@ namespace driver
|
||||
|
||||
//Helpers for function definition
|
||||
#define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
|
||||
{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
|
||||
void* dispatch::fname ## _;
|
||||
|
||||
//Specialized helpers for CUDA
|
||||
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
|
||||
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
|
||||
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
|
||||
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
|
||||
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
|
||||
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
|
||||
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
|
||||
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
|
||||
|
||||
/* ------------------- *
|
||||
* CUDA
|
||||
* ------------------- */
|
||||
|
||||
bool dispatch::cuinit(){
|
||||
if(cuda_==nullptr){
|
||||
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
|
||||
return true;
|
||||
}
|
||||
|
||||
#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
|
||||
#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
|
||||
#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
|
||||
#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
|
||||
#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
// context management
|
||||
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
|
||||
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
|
||||
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
|
||||
// device management
|
||||
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
|
||||
|
||||
// link management
|
||||
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
|
||||
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
|
||||
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
|
||||
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
|
||||
// module management
|
||||
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
|
||||
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
|
||||
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
|
||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||
// stream management
|
||||
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
|
||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
|
||||
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
|
||||
// function management
|
||||
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
|
||||
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
|
||||
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
|
||||
// memory management
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
|
||||
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
|
||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
|
||||
// event management
|
||||
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
|
||||
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
|
||||
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
||||
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* NVML
|
||||
* ------------------- */
|
||||
bool dispatch::nvmlinit(){
|
||||
if(nvml_==nullptr)
|
||||
nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
|
||||
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
|
||||
return res;
|
||||
}
|
||||
|
||||
//CUDA
|
||||
CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
|
||||
CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
|
||||
CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
|
||||
CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
|
||||
CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
|
||||
CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
|
||||
CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
|
||||
CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
|
||||
CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
|
||||
CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
|
||||
|
||||
CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
|
||||
CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
|
||||
CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
|
||||
CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
|
||||
CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
|
||||
CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
|
||||
CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
|
||||
CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
|
||||
CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
|
||||
CUDA_DEFINE1(CUresult, cuInit, unsigned int)
|
||||
CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
|
||||
CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
|
||||
CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
|
||||
CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
|
||||
CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
|
||||
CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
|
||||
CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
|
||||
CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
|
||||
CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
|
||||
CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
|
||||
CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
|
||||
CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
|
||||
CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
|
||||
CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
|
||||
CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
|
||||
CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
|
||||
CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
|
||||
#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
|
||||
#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
|
||||
#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
|
||||
#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
|
||||
|
||||
NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
|
||||
NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
|
||||
|
||||
/* ------------------- *
|
||||
* HIP
|
||||
* ------------------- */
|
||||
bool dispatch::hipinit(){
|
||||
if(hip_==nullptr)
|
||||
hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
|
||||
if(hip_ == nullptr)
|
||||
return false;
|
||||
hipError_t (*fptr)();
|
||||
hipInit_ = dlsym(hip_, "hipInit");
|
||||
*reinterpret_cast<void **>(&fptr) = hipInit_;
|
||||
hipError_t res = (*fptr)();
|
||||
check(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
|
||||
#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
|
||||
#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
|
||||
#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
|
||||
#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
|
||||
#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
|
||||
#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
|
||||
#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
|
||||
#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
|
||||
#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
|
||||
#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
|
||||
|
||||
// context management
|
||||
HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
|
||||
HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
|
||||
HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
|
||||
HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
|
||||
HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
|
||||
HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipInit, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
|
||||
// device management
|
||||
HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
|
||||
HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
|
||||
HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
|
||||
// module management
|
||||
HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
|
||||
HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
|
||||
HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
|
||||
HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
|
||||
HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
|
||||
HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
|
||||
// stream management
|
||||
HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
|
||||
HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
|
||||
HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
|
||||
HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
|
||||
// function management
|
||||
HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
|
||||
HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
|
||||
// memory management
|
||||
HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
|
||||
HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
|
||||
HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
|
||||
HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
|
||||
HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
|
||||
HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
|
||||
// event management
|
||||
HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
|
||||
HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
|
||||
HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
|
||||
HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
|
||||
|
||||
|
||||
/* ------------------- *
|
||||
* COMMON
|
||||
* ------------------- */
|
||||
|
||||
// Release
|
||||
void dispatch::release(){
|
||||
@@ -190,61 +291,9 @@ void dispatch::release(){
|
||||
|
||||
void* dispatch::cuda_;
|
||||
void* dispatch::nvml_;
|
||||
|
||||
//CUDA
|
||||
void* dispatch::cuCtxGetCurrent_;
|
||||
void* dispatch::cuCtxSetCurrent_;
|
||||
void* dispatch::cuCtxDestroy_v2_;
|
||||
void* dispatch::cuEventCreate_;
|
||||
void* dispatch::cuDeviceGet_;
|
||||
void* dispatch::cuMemcpyDtoH_v2_;
|
||||
void* dispatch::cuStreamCreate_;
|
||||
void* dispatch::cuEventElapsedTime_;
|
||||
void* dispatch::cuMemFree_v2_;
|
||||
void* dispatch::cuMemcpyDtoHAsync_v2_;
|
||||
void* dispatch::cuDriverGetVersion_;
|
||||
void* dispatch::cuDeviceGetName_;
|
||||
void* dispatch::cuDeviceGetPCIBusId_;
|
||||
void* dispatch::cuModuleGetGlobal_v2_;
|
||||
|
||||
void* dispatch::cuLinkAddData_v2_;
|
||||
void* dispatch::cuLinkCreate_v2_;
|
||||
void* dispatch::cuLinkDestroy_;
|
||||
void* dispatch::cuModuleLoadData_;
|
||||
void* dispatch::cuLinkComplete_;
|
||||
|
||||
void* dispatch::cuMemcpyHtoDAsync_v2_;
|
||||
void* dispatch::cuModuleLoad_;
|
||||
void* dispatch::cuLaunchKernel_;
|
||||
void* dispatch::cuModuleUnload_;
|
||||
void* dispatch::cuModuleLoadDataEx_;
|
||||
void* dispatch::cuDeviceGetAttribute_;
|
||||
void* dispatch::cuDeviceGetCount_;
|
||||
void* dispatch::cuMemcpyHtoD_v2_;
|
||||
void* dispatch::cuInit_;
|
||||
void* dispatch::cuEventRecord_;
|
||||
void* dispatch::cuCtxCreate_v2_;
|
||||
void* dispatch::cuModuleGetFunction_;
|
||||
void* dispatch::cuStreamSynchronize_;
|
||||
void* dispatch::cuStreamDestroy_v2_;
|
||||
void* dispatch::cuStreamGetCtx_;
|
||||
void* dispatch::cuEventDestroy_v2_;
|
||||
void* dispatch::cuMemAlloc_v2_;
|
||||
void* dispatch::cuPointerGetAttribute_;
|
||||
void* dispatch::cuCtxGetDevice_;
|
||||
void* dispatch::cuMemsetD8Async_;
|
||||
void* dispatch::cuCtxPushCurrent_v2_;
|
||||
void* dispatch::cuCtxPopCurrent_v2_;
|
||||
void* dispatch::cuFuncGetAttribute_;
|
||||
void* dispatch::cuFuncSetAttribute_;
|
||||
void* dispatch::cuFuncSetCacheConfig_;
|
||||
void* dispatch::cuCtxEnablePeerAccess_;
|
||||
|
||||
void* dispatch::nvmlInit_v2_;
|
||||
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
|
||||
void* dispatch::nvmlDeviceGetClockInfo_;
|
||||
void* dispatch::nvmlDeviceGetMaxClockInfo_;
|
||||
void* dispatch::nvmlDeviceSetApplicationsClocks_;
|
||||
void* dispatch::hip_;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -94,6 +94,73 @@ void check(CUresult err)
|
||||
}
|
||||
}
|
||||
|
||||
void check(hipError_t error) {
|
||||
using namespace exception::hip;
|
||||
switch(error)
|
||||
{
|
||||
case hipSuccess : break;
|
||||
case hipErrorInvalidValue : throw invalid_value();
|
||||
case hipErrorMemoryAllocation : throw out_of_memory();
|
||||
case hipErrorNotInitialized : throw not_initialized();
|
||||
case hipErrorDeinitialized : throw deinitialized();
|
||||
case hipErrorProfilerDisabled : throw profiler_disabled();
|
||||
case hipErrorProfilerNotInitialized : throw profiler_not_initialized();
|
||||
case hipErrorProfilerAlreadyStarted : throw profiler_already_started();
|
||||
case hipErrorProfilerAlreadyStopped : throw profiler_already_stopped();
|
||||
case hipErrorNoDevice : throw no_device();
|
||||
case hipErrorInvalidSymbol : throw invalid_symbol();
|
||||
case hipErrorInvalidDevice : throw invalid_device();
|
||||
case hipErrorInvalidImage : throw invalid_image();
|
||||
case hipErrorInvalidContext : throw invalid_context();
|
||||
case hipErrorContextAlreadyCurrent : throw context_already_current();
|
||||
case hipErrorMapFailed : throw map_failed();
|
||||
case hipErrorUnmapFailed : throw unmap_failed();
|
||||
case hipErrorArrayIsMapped : throw array_is_mapped();
|
||||
case hipErrorAlreadyMapped : throw already_mapped();
|
||||
case hipErrorNoBinaryForGpu : throw no_binary_for_gpu();
|
||||
case hipErrorAlreadyAcquired : throw already_acquired();
|
||||
case hipErrorNotMapped : throw not_mapped();
|
||||
case hipErrorNotMappedAsArray : throw not_mapped_as_array();
|
||||
case hipErrorNotMappedAsPointer : throw not_mapped_as_pointer();
|
||||
case hipErrorECCNotCorrectable : throw ecc_uncorrectable();
|
||||
case hipErrorUnsupportedLimit : throw unsupported_limit();
|
||||
case hipErrorContextAlreadyInUse : throw context_already_in_use();
|
||||
case hipErrorPeerAccessUnsupported : throw peer_access_unsupported();
|
||||
case hipErrorInvalidKernelFile : throw invalid_ptx();
|
||||
case hipErrorInvalidGraphicsContext : throw invalid_graphics_context();
|
||||
case hipErrorInvalidSource : throw invalid_source();
|
||||
case hipErrorFileNotFound : throw file_not_found();
|
||||
case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
|
||||
case hipErrorSharedObjectInitFailed : throw shared_object_init_failed();
|
||||
case hipErrorOperatingSystem : throw operating_system();
|
||||
case hipErrorInvalidResourceHandle : throw invalid_handle();
|
||||
case hipErrorNotFound : throw not_found();
|
||||
case hipErrorNotReady : throw not_ready();
|
||||
case hipErrorIllegalAddress : throw illegal_address();
|
||||
case hipErrorLaunchOutOfResources : throw launch_out_of_resources();
|
||||
case hipErrorLaunchTimeOut : throw launch_timeout();
|
||||
// case hipErrorLaunchIncompatibleTexturing : throw launch_incompatible_texturing();
|
||||
case hipErrorPeerAccessAlreadyEnabled : throw peer_access_already_enabled();
|
||||
case hipErrorPeerAccessNotEnabled : throw peer_access_not_enabled();
|
||||
// case hipErrorPrimaryContextActive : throw primary_context_active();
|
||||
// case hipErrorContextIsDestroyed : throw context_is_destroyed();
|
||||
case hipErrorAssert : throw assert_error();
|
||||
// case hipErrorTooManyPeers : throw too_many_peers();
|
||||
case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
|
||||
case hipErrorHostMemoryNotRegistered : throw host_memory_not_registered();
|
||||
// case hipErrorHardwareStackError : throw hardware_stack_error();
|
||||
// case hipErrorIllegalInstruction : throw illegal_instruction();
|
||||
// case hipErrorMisalignedAddress : throw misaligned_address();
|
||||
// case hipErrorInvalidAddressSpace : throw invalid_address_space();
|
||||
// case hipErrorInvalidPc : throw invalid_pc();
|
||||
case hipErrorLaunchFailure : throw launch_failed();
|
||||
// case hipErrorNotPermitted : throw not_permitted();
|
||||
case hipErrorNotSupported : throw not_supported();
|
||||
case hipErrorUnknown : throw unknown();
|
||||
default : throw unknown();
|
||||
}
|
||||
}
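Call sites can route every dispatched HIP return code through this overload so failures surface as the typed exceptions declared in error.h, mirroring the CUresult overload above. A hypothetical example:

  hipDeviceptr_t ptr;
  triton::driver::check(triton::driver::dispatch::hipMalloc(&ptr, 4096));  // throws exception::hip::out_of_memory on allocation failure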
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -1,91 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "triton/driver/handle.h"
|
||||
#include "triton/driver/error.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
//Host
|
||||
inline void _delete(host_platform_t) { }
|
||||
inline void _delete(host_device_t) { }
|
||||
inline void _delete(host_context_t) { }
|
||||
inline void _delete(host_module_t) { }
|
||||
inline void _delete(host_stream_t) { }
|
||||
inline void _delete(host_buffer_t x) { if(x.data) delete[] x.data; }
|
||||
inline void _delete(host_function_t) { }
|
||||
|
||||
//CUDA
|
||||
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
|
||||
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
|
||||
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
|
||||
inline void _delete(CUdevice) { }
|
||||
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
|
||||
inline void _delete(CUfunction) { }
|
||||
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
|
||||
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
|
||||
inline void _delete(CUPlatform){}
|
||||
|
||||
//Constructor
|
||||
template<class T>
|
||||
handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
|
||||
{ }
|
||||
|
||||
template<class T>
|
||||
handle<T>::handle(): has_ownership_(false){ }
|
||||
|
||||
|
||||
template<class T>
|
||||
handle<T>::~handle(){
|
||||
try{
|
||||
if(has_ownership_ && h_ && h_.unique())
|
||||
_delete(*h_);
|
||||
}catch(const exception::cuda::base&){
|
||||
// order of destruction for global variables
|
||||
// is not guaranteed
|
||||
}
|
||||
}
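Release is reference counted through the shared_ptr member: the wrapped object is destroyed exactly once, by the last remaining copy, and only if the handle was built with take_ownership=true. A hypothetical illustration (`stream` is an existing CUstream):

  {
    triton::driver::handle<CUstream> owner(stream, /*take_ownership=*/true);
    triton::driver::handle<CUstream> alias = owner;  // copies share the same shared_ptr and ownership flag
  } // cuStreamDestroy runs once here, when the last copy goes out of scope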
|
||||
|
||||
template class handle<CUdeviceptr>;
|
||||
template class handle<CUstream>;
|
||||
template class handle<CUcontext>;
|
||||
template class handle<CUdevice>;
|
||||
template class handle<cu_event_t>;
|
||||
template class handle<CUfunction>;
|
||||
template class handle<CUmodule>;
|
||||
template class handle<CUPlatform>;
|
||||
|
||||
template class handle<host_platform_t>;
|
||||
template class handle<host_device_t>;
|
||||
template class handle<host_context_t>;
|
||||
template class handle<host_module_t>;
|
||||
template class handle<host_stream_t>;
|
||||
template class handle<host_buffer_t>;
|
||||
template class handle<host_function_t>;
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,94 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
|
||||
polymorphic_resource(fn, has_ownership), program_(program){
|
||||
}
|
||||
|
||||
|
||||
kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
|
||||
polymorphic_resource(fn, has_ownership), program_(program){
|
||||
}
|
||||
|
||||
kernel* kernel::create(driver::module* program, const char* name) {
|
||||
switch(program->backend()){
|
||||
case CUDA: return new cu_kernel(program, name);
|
||||
case Host: return new host_kernel(program, name);
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
driver::module* kernel::module() {
|
||||
return program_;
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) {
|
||||
hst_->fn = program->hst()->functions.at(name);
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) {
|
||||
dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
|
||||
dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);
|
||||
// properties
|
||||
int shared_total, shared_optin, shared_static;
|
||||
int n_spills, n_reg;
|
||||
CUdevice dev;
|
||||
dispatch::cuCtxGetDevice(&dev);
|
||||
dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
|
||||
dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
|
||||
dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
|
||||
dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, *cu_);
|
||||
dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
|
||||
// std::cout << n_reg << std::endl;
|
||||
if (shared_optin > 49152){
|
||||
// std::cout << "dynamic shared memory " << shared_optin << " " << shared_static << std::endl;
|
||||
dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
|
||||
}
|
||||
}
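The 49152-byte threshold is the 48 KiB static shared-memory limit; when the device's opt-in limit is larger, the constructor exposes the remainder as dynamic shared memory. A worked example with assumed numbers (illustration only, not queried from a real device):

  int shared_optin  = 166912;  // device allowing roughly 163 KiB per block via opt-in
  int shared_static = 2048;    // shared memory the kernel already declares statically
  // shared_optin > 49152, so the opt-in branch runs:
  int max_dynamic = shared_optin - shared_static;  // 164864 bytes usable as dynamic shared memory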
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
lib/driver/llvm.cc (new file, 324 lines)
@@ -0,0 +1,324 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include "triton/driver/llvm.h"
|
||||
#include "triton/driver/dispatch.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/tools/sha1.hpp"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
#include "triton/tools/sys/exec.hpp"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IR/IRPrintingPasses.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/CodeGen.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
|
||||
// begin AMD stuff
|
||||
#include "llvm/Support/FileSystem.h"
|
||||
#include "llvm/Support/FormattedStream.h"
|
||||
#include "llvm/Support/Program.h"
|
||||
#include "llvm/Support/ToolOutputFile.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
#include "llvm/Analysis/TargetLibraryInfo.h"
|
||||
// end AMD stuff
|
||||
|
||||
namespace triton{
|
||||
namespace driver{
|
||||
|
||||
void init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
LLVMInitializeAMDGPUTargetInfo();
|
||||
LLVMInitializeAMDGPUTarget();
|
||||
LLVMInitializeAMDGPUTargetMC();
|
||||
LLVMInitializeAMDGPUAsmPrinter();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
|
||||
size_t start_replace = str.find(begin);
|
||||
size_t end_replace = str.find(end, start_replace);
|
||||
if(start_replace == std::string::npos)
|
||||
return false;
|
||||
str.replace(start_replace, end_replace + 1 - start_replace, target);
|
||||
return true;
|
||||
}
|
||||
|
||||
int vptx(int version){
|
||||
if(version >= 11030) return 73;
|
||||
if(version >= 11020) return 72;
|
||||
if(version >= 11010) return 71;
|
||||
if(version >= 11000) return 70;
|
||||
if(version >= 10020) return 65;
|
||||
if(version >= 10010) return 64;
|
||||
if(version >= 10000) return 63;
|
||||
throw std::runtime_error("Triton requires CUDA 10+");
|
||||
}
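vptx() maps the driver-reported CUDA version (e.g. 11020 for CUDA 11.2) to the newest PTX ISA that toolchain understands; llir_to_ptx then splits the result into the digits of the .version directive. For instance:

  int isa = vptx(11020);     // 72
  int ptx_major = isa / 10;  // 7
  int ptx_minor = isa % 10;  // 2  -> ".version 7.2" in the emitted PTX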
|
||||
|
||||
std::string llir_to_ptx(llvm::Module* module, int cc, int version){
|
||||
// LLVM version in use may not officially support target hardware
|
||||
int max_nvvm_cc = 75;
|
||||
int max_nvvm_ptx = 64;
|
||||
// options
|
||||
auto options = llvm::cl::getRegisteredOptions();
|
||||
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
|
||||
assert(short_ptr);
|
||||
short_ptr->setValue(true);
|
||||
// compute capability
|
||||
std::string sm = "sm_" + std::to_string(cc);
|
||||
// max PTX version
|
||||
int ptx = vptx(version);
|
||||
int ptx_major = ptx / 10;
|
||||
int ptx_minor = ptx % 10;
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "nvptx64-nvidia-cuda";
|
||||
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
|
||||
std::string layout = "";
|
||||
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
|
||||
init_llvm();
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
|
||||
pass.run(*module);
|
||||
|
||||
// post-process
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
|
||||
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
|
||||
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
|
||||
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
|
||||
// JIT compile source-code
|
||||
try{
|
||||
// use ptxas if present in PATH. Otherwise, use JIT from the driver
|
||||
std::string ptxas = "ptxas";
|
||||
std::string version;
|
||||
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
|
||||
|
||||
// Use PTXAS via system call
|
||||
if(use_system_ptxas){
|
||||
// compile ptx with ptxas
|
||||
char _fsrc[] = "/tmp/triton_k_XXXXXX";
|
||||
char _flog[] = "/tmp/triton_l_XXXXXX";
|
||||
mkstemp(_fsrc);
|
||||
mkstemp(_flog);
|
||||
std::string fsrc = _fsrc;
|
||||
std::string flog = _flog;
|
||||
std::ofstream ofs(fsrc);
|
||||
ofs << ptx;
|
||||
ofs.close();
|
||||
std::string cmd;
|
||||
int err;
|
||||
cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
|
||||
err = system(cmd.c_str());
|
||||
CUmodule ret;
|
||||
dispatch::cuModuleLoad(&ret, (fsrc + ".o").c_str());
|
||||
unlink(_fsrc);
|
||||
unlink(_flog);
|
||||
return ret;
|
||||
}
|
||||
|
||||
// Use PTXAS included in driver
|
||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_LOG_VERBOSE};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
|
||||
CUmodule ret;
|
||||
dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
|
||||
return ret;
|
||||
}
|
||||
catch(exception::cuda::invalid_ptx const &){
|
||||
std::cout << ptx << std::endl;
|
||||
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
||||
throw;
|
||||
}
|
||||
}
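Putting the two NVIDIA steps together, a hypothetical caller compiles the LLVM module to PTX and then loads it as a CUmodule (the surrounding names are assumed for illustration):

  // `mod` is a std::unique_ptr<llvm::Module> produced by Triton's codegen passes.
  int cc = 80;                                                     // e.g. sm_80
  std::string ptx = triton::driver::llir_to_ptx(mod.get(), cc, 11020);
  CUmodule cumod  = triton::driver::ptx_to_cumodule(ptx, cc);      // uses ptxas from PATH if available, driver JIT otherwise
  CUfunction fn;
  triton::driver::dispatch::cuModuleGetFunction(&fn, cumod, "kernel0");  // "kernel0" is a placeholder name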
|
||||
|
||||
/* ------------------------ */
|
||||
// HIP //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
|
||||
init_llvm();
|
||||
|
||||
// proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
|
||||
// features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
|
||||
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "amdgcn-amd-amdhsa";
|
||||
std::string layout = "";
|
||||
std::string features;
|
||||
std::string proc = "gfx908";
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None,
|
||||
llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
|
||||
// create dump files
|
||||
std::string module_name = module->getModuleIdentifier();
|
||||
std::error_code ec;
|
||||
|
||||
// Save GCN ISA binary.
|
||||
std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
|
||||
std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
|
||||
new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
|
||||
if (ec)
|
||||
{
|
||||
std::cout << isabin_path << " was not created. error code: " << ec << std::endl;
|
||||
}
|
||||
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile);
|
||||
pass.run(*module);
|
||||
// Save GCN ISA.
|
||||
std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
std::ofstream amdgcn(amdgcn_path);
|
||||
amdgcn << result;
|
||||
amdgcn.close();
|
||||
|
||||
// generate HASCO file
|
||||
std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
|
||||
std::string error_message;
|
||||
int lld_result =
|
||||
llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
|
||||
{"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
|
||||
llvm::None, {}, 0, 0, &error_message);
|
||||
if (lld_result)
|
||||
{
|
||||
std::cout << "ld.lld execute fail: " << std::endl;
|
||||
std::cout << error_message << std::endl;
|
||||
std::cout << lld_result << std::endl;
|
||||
}
|
||||
|
||||
return hsaco_path;
|
||||
}
|
||||
|
||||
|
||||
hipModule_t amdgpu_to_hipmodule(const std::string& path) {
|
||||
// Read HSACO.
|
||||
std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
|
||||
std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
|
||||
|
||||
std::vector<unsigned char> hsaco(hsaco_file_size);
|
||||
hsaco_file.seekg(0, std::ios::beg);
|
||||
hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
|
||||
hsaco_file.close();
|
||||
hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
|
||||
hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
|
||||
hipJitOptionLogVerbose};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize,
|
||||
(void*)_err, (void*)(uintptr_t)logbufsize,
|
||||
(void*)_log, (void*)1};
|
||||
hipModule_t ret;
|
||||
dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
|
||||
return ret;
|
||||
}
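The AMD path mirrors the CUDA one: the module is lowered to a GCN object, linked into an hsaco with ld.lld, and loaded through the HIP dispatcher. A hypothetical end-to-end sketch (the gfx target is currently hard-coded to gfx908 above):

  // `mod` is a std::unique_ptr<llvm::Module> produced by Triton's codegen passes.
  std::string hsaco_path = triton::driver::llir_to_amdgpu(mod.get(), "gfx908");
  hipModule_t hmod = triton::driver::amdgpu_to_hipmodule(hsaco_path);
  hipFunction_t hfn;
  triton::driver::dispatch::hipModuleGetFunction(&hfn, hmod, "kernel0");  // "kernel0" is a placeholder name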
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -1,375 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#include <fstream>
|
||||
#include <unistd.h>
|
||||
#include <memory>
|
||||
#include <regex>
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/tools/sha1.hpp"
|
||||
#include "triton/tools/sys/getenv.hpp"
|
||||
#include "triton/tools/sys/mkdir.hpp"
|
||||
#include "triton/tools/sys/exec.hpp"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
#include "llvm/IR/IRPrintingPasses.h"
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/Support/CodeGen.h"
|
||||
#include "llvm/Support/CommandLine.h"
|
||||
#include "llvm/Support/SourceMgr.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "llvm/Support/TargetRegistry.h"
|
||||
#include "llvm/Support/TargetSelect.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
#include "llvm/Target/TargetOptions.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
|
||||
#include "llvm/Transforms/Utils/Cloning.h"
|
||||
|
||||
std::string exec(const char* cmd) {
|
||||
std::array<char, 128> buffer;
|
||||
std::string result;
|
||||
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
|
||||
if (!pipe) {
|
||||
throw std::runtime_error("popen() failed!");
|
||||
}
|
||||
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
|
||||
result += buffer.data();
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
void LLVMInitializeNVPTXTargetInfo();
|
||||
void LLVMInitializeNVPTXTarget();
|
||||
void LLVMInitializeNVPTXTargetMC();
|
||||
void LLVMInitializeNVPTXAsmPrinter();
|
||||
void LLVMInitializeNVPTXAsmParser();
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
|
||||
void module::init_llvm() {
|
||||
static bool init = false;
|
||||
if(!init){
|
||||
LLVMInitializeNVPTXTargetInfo();
|
||||
LLVMInitializeNVPTXTarget();
|
||||
LLVMInitializeNVPTXTargetMC();
|
||||
LLVMInitializeNVPTXAsmPrinter();
|
||||
init = true;
|
||||
}
|
||||
}
|
||||
|
||||
module::module(CUmodule mod, bool has_ownership)
|
||||
: polymorphic_resource(mod, has_ownership), spilled_(0) {
|
||||
}
|
||||
|
||||
module::module(host_module_t mod, bool has_ownership)
|
||||
: polymorphic_resource(mod, has_ownership), spilled_(0) {
|
||||
}
|
||||
|
||||
|
||||
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
|
||||
switch(device->backend()){
|
||||
case CUDA: return new cu_module(device, std::move(src));
|
||||
case Host: return new host_module(std::move(src));
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
|
||||
const std::string &proc, std::string layout,
|
||||
llvm::SmallVectorImpl<char> &buffer,
|
||||
const std::string& features,
|
||||
file_type_t ft) {
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
|
||||
throw std::runtime_error("CPU unsupported");
|
||||
// init_llvm();
|
||||
// // create kernel wrapper
|
||||
// llvm::LLVMContext &ctx = src->getContext();
|
||||
// llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
|
||||
// llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
|
||||
// llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
|
||||
// std::vector<llvm::Type*> tys = {args_ty, int32_ty, int32_ty, int32_ty};
|
||||
// llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, tys, false);
|
||||
// llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "_main", &*src);
|
||||
// llvm::Function* fn = &*src->getFunctionList().begin();
|
||||
// llvm::FunctionType *fn_ty = fn->getFunctionType();
|
||||
// std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
|
||||
// std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
|
||||
// llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
|
||||
// llvm::IRBuilder<> ir_builder(ctx);
|
||||
// ir_builder.SetInsertPoint(entry);
|
||||
// auto get_size = [](llvm::Type* ty) { return ty->isPointerTy() ? sizeof(char*) : ty->getPrimitiveSizeInBits() / 8; };
|
||||
// llvm::Value* base = main->arg_begin();
|
||||
// llvm::Value* args_base = ir_builder.CreateBitCast(base, base->getType()->getPointerElementType());
|
||||
|
||||
// size_t offset = 0;
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++){
|
||||
// ptrs[i] = ir_builder.CreateGEP(args_base, ir_builder.getInt32(offset));
|
||||
// size_t nbytes = get_size(fn_ty->getParamType(i));
|
||||
// offset += nbytes;
|
||||
// if(i < ptrs.size() - 1){
|
||||
// size_t np1bytes = get_size(fn_ty->getParamType(i+1));
|
||||
// offset = (offset + np1bytes - 1) / np1bytes * np1bytes;
|
||||
// }
|
||||
// }
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++)
|
||||
// ptrs[i] = ir_builder.CreateBitCast(ptrs[i], fn_ty->getParamType(i)->getPointerTo());
|
||||
// for(unsigned i = 0; i < ptrs.size(); i++)
|
||||
// fn_args[i] = ir_builder.CreateLoad(ptrs[i]);
|
||||
|
||||
// fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
|
||||
// fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
|
||||
// fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
|
||||
// ir_builder.CreateCall(fn, fn_args);
|
||||
// ir_builder.CreateRetVoid();
|
||||
|
||||
//// llvm::legacy::PassManager pm;
|
||||
//// pm.add(llvm::createPrintModulePass(llvm::outs()));
|
||||
//// pm.add(llvm::createVerifierPass());
|
||||
//// pm.run(*src);
|
||||
|
||||
//// create execution engine
|
||||
// for(llvm::Function& fn: src->functions())
|
||||
// hst_->functions[fn.getName().str()] = &fn;
|
||||
|
||||
//// llvm::orc::JITTargetMachineBuilder JTMB = *llvm::orc::JITTargetMachineBuilder::detectHost();
|
||||
//// auto DL = JTMB.getDefaultDataLayoutForTarget();
|
||||
//// auto CIRC = std::unique_ptr<llvm::orc::ConcurrentIRCompiler>(new llvm::orc::ConcurrentIRCompiler(JTMB));
|
||||
//// hst_->ES = new llvm::orc::ExecutionSession();
|
||||
//// hst_->ObjectLayer = new llvm::orc::RTDyldObjectLinkingLayer(*hst_->ES, []() { return std::unique_ptr<llvm::SectionMemoryManager>(new llvm::SectionMemoryManager()); });
|
||||
//// hst_->CompileLayer = new llvm::orc::IRCompileLayer(*hst_->ES, *hst_->ObjectLayer, *CIRC);
|
||||
//// hst_->DL = new llvm::DataLayout(std::move(*DL));
|
||||
//// hst_->Mangle = new llvm::orc::MangleAndInterner(*hst_->ES, *hst_->DL);
|
||||
//// hst_->Ctx = new llvm::orc::ThreadSafeContext(std::unique_ptr<llvm::LLVMContext>(new llvm::LLVMContext()));
|
||||
//// hst_->MainJD = &hst_->ES->createJITDylib("<main>");
|
||||
//// hst_->MainJD->setGenerator(llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
|
||||
//// hst_->DL->getGlobalPrefix())));
|
||||
//// llvm::cantFail(hst_->CompileLayer->add(*hst_->MainJD, llvm::orc::ThreadSafeModule(std::move(src), *hst_->Ctx)));
|
||||
//// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->ES->lookup({hst_->MainJD}, (*hst_->Mangle)("_main"))->getAddress());
|
||||
|
||||
|
||||
|
||||
// llvm::EngineBuilder builder(std::move(src));
|
||||
// builder.setErrorStr(&hst_->error);
|
||||
// builder.setMCJITMemoryManager(std::make_unique<llvm::SectionMemoryManager>());
|
||||
// builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
|
||||
// builder.setEngineKind(llvm::EngineKind::JIT);
|
||||
// hst_->engine = builder.create();
|
||||
// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->engine->getFunctionAddress("_main"));
|
||||
}
|
||||
|
||||
std::unique_ptr<buffer> host_module::symbol(const char *name) const {
|
||||
throw std::runtime_error("not implemented");
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
|
||||
size_t start_replace = str.find(begin);
|
||||
size_t end_replace = str.find(end, start_replace);
|
||||
if(start_replace == std::string::npos)
|
||||
return false;
|
||||
str.replace(start_replace, end_replace + 1 - start_replace, target);
|
||||
return true;
|
||||
}
|
||||
|
||||
//static std::map<int, int> vptx = {
|
||||
// {10000, 63},
|
||||
// {10010, 64},
|
||||
// {10020, 65},
|
||||
// {11000, 70},
|
||||
// {11010, 71},
|
||||
// {11020, 72},
|
||||
// {11030, 73},
|
||||
// {11040, 73}
|
||||
//};
|
||||
|
||||
int vptx(int version){
|
||||
if(version >= 11030) return 73;
|
||||
if(version >= 11020) return 72;
|
||||
if(version >= 11010) return 71;
|
||||
if(version >= 11000) return 70;
|
||||
if(version >= 10020) return 65;
|
||||
if(version >= 10010) return 64;
|
||||
if(version >= 10000) return 63;
|
||||
throw std::runtime_error("Triton requires CUDA 10+");
|
||||
}
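The mapping above ties the installed CUDA driver version to the newest PTX ISA that driver can ingest. Purely as an illustration of the same table (this mirrors vptx(), it adds nothing new):

def ptx_isa_for_cuda(driver_version: int) -> int:
    # newest PTX ISA accepted by a given CUDA driver version, same table as vptx()
    table = [(11030, 73), (11020, 72), (11010, 71), (11000, 70),
             (10020, 65), (10010, 64), (10000, 63)]
    for min_version, isa in table:
        if driver_version >= min_version:
            return isa
    raise RuntimeError("Triton requires CUDA 10+")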
|
||||
|
||||
std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
|
||||
// LLVM version in use may not officially support target hardware
|
||||
int max_nvvm_cc = 75;
|
||||
int max_nvvm_ptx = 64;
|
||||
// options
|
||||
auto options = llvm::cl::getRegisteredOptions();
|
||||
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
|
||||
assert(short_ptr);
|
||||
short_ptr->setValue(true);
|
||||
// compute capability
|
||||
int cc = ((driver::cu_device*)device)->compute_capability();
|
||||
std::string sm = "sm_" + std::to_string(cc);
|
||||
// driver version
|
||||
int version;
|
||||
dispatch::cuDriverGetVersion(&version);
|
||||
int ptx = vptx(version);
|
||||
int ptx_major = ptx / 10;
|
||||
int ptx_minor = ptx % 10;
|
||||
// create
|
||||
llvm::SmallVector<char, 0> buffer;
|
||||
std::string triple = "nvptx64-nvidia-cuda";
|
||||
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
|
||||
std::string layout = "";
|
||||
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
|
||||
init_llvm();
|
||||
// verify and store llvm
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
|
||||
llvm::TargetOptions opt;
|
||||
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
|
||||
opt.UnsafeFPMath = false;
|
||||
opt.NoInfsFPMath = false;
|
||||
opt.NoNaNsFPMath = true;
|
||||
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
|
||||
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
|
||||
// set data layout
|
||||
if(layout.empty())
|
||||
module->setDataLayout(machine->createDataLayout());
|
||||
else
|
||||
module->setDataLayout(layout);
|
||||
// emit machine code
|
||||
for (llvm::Function &f : module->functions())
|
||||
f.addFnAttr(llvm::Attribute::AlwaysInline);
|
||||
llvm::legacy::PassManager pass;
|
||||
llvm::raw_svector_ostream stream(buffer);
|
||||
// emit
|
||||
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
|
||||
pass.run(*module);
|
||||
|
||||
// post-process
|
||||
std::string result(buffer.begin(), buffer.end());
|
||||
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
|
||||
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
|
||||
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
|
||||
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
|
||||
return result;
|
||||
}
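The post-processing at the end of compile_llvm_module only rewrites the .version/.target directives that LLVM emitted and strips the inline-asm marker comments. A hedged Python equivalent of that string surgery, for illustration only (the exact directive formats are assumptions about LLVM's PTX output):

import re

def patch_ptx(ptx: str, ptx_major: int, ptx_minor: int, sm: str) -> str:
    # retarget the PTX header at the ISA/arch actually supported by the driver
    ptx = re.sub(r"\.version \d+\.\d+", f".version {ptx_major}.{ptx_minor}", ptx, count=1)
    ptx = re.sub(r"\.target \S+", f".target {sm}", ptx, count=1)
    # drop the "// begin/end inline asm" marker lines
    ptx = re.sub(r"[ \t]*// (?:begin|end) inline asm\n", "", ptx)
    return ptx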
|
||||
|
||||
void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
|
||||
// JIT compile source-code
|
||||
try{
|
||||
// use ptxas if present in PATH. Otherwise, use JIT from the driver
|
||||
std::string ptxas = "ptxas";
|
||||
std::string version;
|
||||
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
|
||||
|
||||
// Use PTXAS via system call
|
||||
if(use_system_ptxas){
|
||||
// compile ptx with ptxas
|
||||
char _fsrc[] = "/tmp/triton_k_XXXXXX";
|
||||
char _flog[] = "/tmp/triton_l_XXXXXX";
|
||||
mkstemp(_fsrc);
|
||||
mkstemp(_flog);
|
||||
std::string fsrc = _fsrc;
|
||||
std::string flog = _flog;
|
||||
std::ofstream ofs(fsrc);
|
||||
ofs << ptx;
|
||||
ofs.close();
|
||||
std::string cmd;
|
||||
int err;
|
||||
std::string cc = std::to_string(device->compute_capability());
|
||||
cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
|
||||
err = system(cmd.c_str());
|
||||
dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
|
||||
unlink(_fsrc);
|
||||
unlink(_flog);
|
||||
return;
|
||||
}
|
||||
|
||||
// Use PTXAS included in driver
|
||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_LOG_VERBOSE};
|
||||
unsigned int errbufsize = 8192;
|
||||
unsigned int logbufsize = 8192;
|
||||
char _err[errbufsize];
|
||||
char _log[logbufsize];
|
||||
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
|
||||
dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
|
||||
}
|
||||
catch(exception::cuda::invalid_ptx const &){
|
||||
//#ifdef TRITON_LOG_PTX_ERROR
|
||||
std::cout << ptx << std::endl;
|
||||
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
||||
// exit(1);
|
||||
//#endif
|
||||
throw;
|
||||
}
|
||||
}
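When a system ptxas is found, the path above amounts to "write the PTX to a temp file, shell out to ptxas, load the resulting cubin". A rough Python sketch of that flow (illustrative only; the flags are copied from the command line built in init_from_ptx above):

import os
import subprocess
import tempfile

def assemble_with_ptxas(ptx: str, compute_capability: int) -> str:
    fd, src = tempfile.mkstemp(suffix=".ptx")
    with os.fdopen(fd, "w") as f:
        f.write(ptx)
    out = src + ".o"
    # same invocation as the cmd string assembled in init_from_ptx
    subprocess.check_call(["ptxas", "-v", f"--gpu-name=sm_{compute_capability}", src, "-o", out])
    return out  # cubin path, subsequently loaded with cuModuleLoad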
|
||||
|
||||
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
|
||||
llvm::raw_string_ostream oss(llir_);
|
||||
oss << *ll_module;
|
||||
oss.flush();
|
||||
ptx_ = compile_llvm_module(ll_module.get(), device);
|
||||
init_from_ptx(ptx_, (driver::cu_device*)device);
|
||||
}
|
||||
|
||||
cu_module::cu_module(driver::device* device, std::string const & source) : module(CUmodule(), true), ptx_(source){
|
||||
init_from_ptx(ptx_, (driver::cu_device*)device);
|
||||
}
|
||||
|
||||
std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
|
||||
CUdeviceptr handle;
|
||||
size_t size;
|
||||
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
|
||||
std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
|
||||
return std::move(res);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
@@ -1,68 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <string>
|
||||
#include "triton/driver/platform.h"
|
||||
#include "triton/driver/device.h"
|
||||
|
||||
|
||||
namespace triton
|
||||
{
|
||||
namespace driver
|
||||
{
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string cu_platform::version() const{
|
||||
int version;
|
||||
dispatch::cuDriverGetVersion(&version);
|
||||
return std::to_string(version);
|
||||
}
|
||||
|
||||
void cu_platform::devices(std::vector<device *> &devices) const{
|
||||
int N;
|
||||
dispatch::cuDeviceGetCount(&N);
|
||||
for(int i = 0 ; i < N ; ++i){
|
||||
CUdevice dvc;
|
||||
dispatch::cuDeviceGet(&dvc, i);
|
||||
devices.push_back(new driver::cu_device(dvc));
|
||||
}
|
||||
}
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
std::string host_platform::version() const {
|
||||
return "1.0";
|
||||
}
|
||||
|
||||
void host_platform::devices(std::vector<driver::device*> &devices) const {
|
||||
devices.push_back(new driver::host_device());
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -1,142 +0,0 @@
|
||||
/* Copyright 2015-2017 Philippe Tillet
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files
|
||||
* (the "Software"), to deal in the Software without restriction,
|
||||
* including without limitation the rights to use, copy, modify, merge,
|
||||
* publish, distribute, sublicense, and/or sell copies of the Software,
|
||||
* and to permit persons to whom the Software is furnished to do so,
|
||||
* subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <cassert>
|
||||
#include <unistd.h>
|
||||
#include <array>
|
||||
#include "triton/driver/backend.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/driver/context.h"
|
||||
#include "triton/driver/device.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/buffer.h"
|
||||
#include "llvm/ExecutionEngine/ExecutionEngine.h"
|
||||
#include "llvm/ExecutionEngine/GenericValue.h"
|
||||
|
||||
namespace triton
|
||||
{
|
||||
|
||||
namespace driver
|
||||
{
|
||||
|
||||
/* ------------------------ */
|
||||
// Base //
|
||||
/* ------------------------ */
|
||||
|
||||
stream::stream(CUstream cu, bool has_ownership)
|
||||
: polymorphic_resource(cu, has_ownership) {
|
||||
}
|
||||
|
||||
|
||||
stream::stream(host_stream_t cl, bool has_ownership)
|
||||
: polymorphic_resource(cl, has_ownership) {
|
||||
}
|
||||
|
||||
driver::stream* stream::create(backend_t backend) {
|
||||
switch(backend){
|
||||
case CUDA: return new cu_stream();
|
||||
case Host: return new host_stream();
|
||||
default: throw std::runtime_error("unknown backend");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// Host //
|
||||
/* ------------------------ */
|
||||
|
||||
host_stream::host_stream(): stream(host_stream_t(), true) {
|
||||
hst_->pool.reset(new ThreadPool(1));
|
||||
hst_->futures.reset(new std::vector<std::future<void>>());
|
||||
}
|
||||
|
||||
void host_stream::synchronize() {
|
||||
for(auto& x: *hst_->futures)
|
||||
x.wait();
|
||||
hst_->futures->clear();
|
||||
hst_->args.clear();
|
||||
}
|
||||
|
||||
void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
|
||||
auto hst = kernel->module()->hst();
|
||||
hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
|
||||
char* params = new char[args_size];
|
||||
std::memcpy((void*)params, (void*)args, args_size);
|
||||
for(size_t i = 0; i < grid[0]; i++)
|
||||
for(size_t j = 0; j < grid[1]; j++)
|
||||
for(size_t k = 0; k < grid[2]; k++)
|
||||
hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
|
||||
}
|
||||
|
||||
void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
||||
std::memcpy((void*)buffer->hst()->data, ptr, size);
|
||||
}
|
||||
|
||||
void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
||||
std::memcpy(ptr, (const void*)buffer->hst()->data, size);
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------ */
|
||||
// CUDA //
|
||||
/* ------------------------ */
|
||||
|
||||
|
||||
cu_stream::cu_stream(CUstream str, bool take_ownership):
|
||||
stream(str, take_ownership) {
|
||||
}
|
||||
|
||||
cu_stream::cu_stream(): stream(CUstream(), true) {
|
||||
dispatch::cuStreamCreate(&*cu_, 0);
|
||||
}
|
||||
|
||||
void cu_stream::synchronize() {
|
||||
dispatch::cuStreamSynchronize(*cu_);
|
||||
}
|
||||
|
||||
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
|
||||
void *config[] = {
|
||||
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
CU_LAUNCH_PARAM_END
|
||||
};
|
||||
dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem, *cu_, nullptr, config);
|
||||
}
|
||||
|
||||
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
|
||||
if(blocking)
|
||||
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
|
||||
else
|
||||
dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
|
||||
}
|
||||
|
||||
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
|
||||
if(blocking)
|
||||
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
|
||||
else
|
||||
dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
}
|
@@ -1,7 +1,7 @@
|
||||
#include "triton/codegen/pass.h"
|
||||
#include "triton/driver/kernel.h"
|
||||
#include "triton/driver/module.h"
|
||||
#include "triton/driver/stream.h"
|
||||
#include "triton/codegen/target.h"
|
||||
#include "triton/driver/error.h"
|
||||
#include "triton/driver/llvm.h"
|
||||
#include "triton/ir/builder.h"
|
||||
#include "triton/ir/dispatch.h"
|
||||
#include "triton/ir/enums.h"
|
||||
@@ -15,7 +15,9 @@
|
||||
#include <pybind11/stl.h>
|
||||
#include <regex>
|
||||
#include <string>
|
||||
#include <sstream>
|
||||
#include "llvm/IR/Module.h"
|
||||
#include "llvm/IR/LegacyPassManager.h"
|
||||
#include "llvm/IR/Verifier.h"
|
||||
|
||||
namespace py = pybind11;
|
||||
namespace ir = triton::ir;
|
||||
@@ -24,72 +26,213 @@ namespace drv = triton::driver;
|
||||
/*****************************************************************************/
|
||||
/* Python bindings for triton::driver */
|
||||
/*****************************************************************************/
|
||||
// information query
|
||||
template<CUdevice_attribute attr>
|
||||
int cuGetInfo(CUdevice device) {
|
||||
int res;
|
||||
drv::dispatch::cuDeviceGetAttribute(&res, attr, device);
|
||||
return res;
|
||||
}
|
||||
|
||||
void init_triton_driver(py::module &&m) {
|
||||
// base device
|
||||
py::class_<drv::device>(m, "device");
|
||||
// cuda device
|
||||
py::class_<drv::cu_device, drv::device>(m, "cu_device")
|
||||
.def(py::init([](int dev_id, bool take_ownership) {
|
||||
CUdevice handle;
|
||||
drv::dispatch::cuDeviceGet(&handle, dev_id);
|
||||
return new drv::cu_device(handle, take_ownership);
|
||||
}))
|
||||
.def("max_shared_memory", [](drv::cu_device *self) {
|
||||
return self->max_shared_memory();
|
||||
})
|
||||
.def("enable_peer_access", [](drv::cu_device *self, unsigned long long int peer_mem_ptr) {
|
||||
self->enable_peer_access(peer_mem_ptr);
|
||||
});
|
||||
// host device
|
||||
py::class_<drv::host_device, drv::device>(m, "host_device")
|
||||
.def(py::init<>());
|
||||
template<hipDeviceAttribute_t attr>
|
||||
int hipGetInfo(hipDevice_t device) {
|
||||
int res;
|
||||
drv::dispatch::hipDeviceGetAttribute(&res, attr, device);
|
||||
return res;
|
||||
}
|
||||
|
||||
// base stream
|
||||
py::class_<drv::stream>(m, "stream");
|
||||
// host stream
|
||||
py::class_<drv::host_stream, drv::stream>(m, "host_stream")
|
||||
.def(py::init<>());
|
||||
// cuda stream
|
||||
py::class_<drv::cu_stream, drv::stream>(m, "cu_stream")
|
||||
// py doesn't support opaque pointer (e.g., CUstream) so
|
||||
// we assume it has been converted to uint64_t
|
||||
.def(py::init([](uint64_t handle, bool take_ownership) {
|
||||
return std::unique_ptr<drv::cu_stream>(new drv::cu_stream((CUstream)handle, take_ownership));
|
||||
}))
|
||||
.def("enqueue", [](drv::cu_stream *self, drv::kernel *kernel,
|
||||
size_t grid_0, size_t grid_1, size_t grid_2,
|
||||
size_t block_0, size_t block_1, size_t block_2,
|
||||
const std::string &args,
|
||||
size_t shared_mem) {
|
||||
return self->enqueue(kernel, {grid_0, grid_1, grid_2}, {block_0, block_1, block_2},
|
||||
(void *)args.data(), args.size(), shared_mem);
|
||||
enum backend_t {
|
||||
HOST,
|
||||
CUDA,
|
||||
ROCM,
|
||||
};
|
||||
|
||||
void cu_enable_peer_access(uint64_t peer_ptr){
|
||||
CUcontext context;
|
||||
drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_ptr);
|
||||
try {
|
||||
drv::dispatch::cuCtxEnablePeerAccess(context, 0);
|
||||
} catch (drv::exception::cuda::peer_access_already_enabled) {}
|
||||
}
|
||||
|
||||
void host_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem){
|
||||
throw std::runtime_error("unsupported");
|
||||
// auto hst = kernel->module()->hst();
|
||||
// hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
|
||||
// char* params = new char[args_size];
|
||||
// std::memcpy((void*)params, (void*)args, args_size);
|
||||
// for(size_t i = 0; i < grid[0]; i++)
|
||||
// for(size_t j = 0; j < grid[1]; j++)
|
||||
// for(size_t k = 0; k < grid[2]; k++)
|
||||
// hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
|
||||
}
|
||||
|
||||
void cu_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem){
|
||||
void *config[] = {
|
||||
CU_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
|
||||
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
CU_LAUNCH_PARAM_END
|
||||
};
|
||||
drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
|
||||
block_0, block_1, block_2,
|
||||
shared_mem, (CUstream)stream, nullptr, config);
|
||||
}
|
||||
|
||||
void hip_enqueue(uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
void* args_ptr, size_t args_size, int64_t shared_mem) {
|
||||
void *config[] = {
|
||||
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
|
||||
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
|
||||
HIP_LAUNCH_PARAM_END
|
||||
};
|
||||
drv::dispatch::hipModuleLaunchKernel((hipFunction_t)kernel, grid_0, grid_1, grid_2,
|
||||
block_0, block_1, block_2,
|
||||
shared_mem, (hipStream_t)stream, nullptr, config);
|
||||
|
||||
}
|
||||
|
||||
void init_triton_runtime(py::module &&m) {
|
||||
|
||||
// wrap backend_t
|
||||
py::enum_<backend_t>(m, "backend")
|
||||
.value("HOST", HOST)
|
||||
.value("CUDA", CUDA)
|
||||
.value("ROCM", ROCM)
|
||||
.export_values();
|
||||
|
||||
// enable peer-to-peer
|
||||
m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
|
||||
if (backend != CUDA)
|
||||
throw std::runtime_error("P2P only supported on CUDA devices!");
|
||||
cu_enable_peer_access(peer_ptr);
|
||||
}
|
||||
);
|
||||
|
||||
// query maximum shared memory
|
||||
m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
|
||||
if (backend == HOST)
|
||||
return 0;
|
||||
if(backend == CUDA)
|
||||
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(device);
|
||||
if(backend == ROCM)
|
||||
return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
|
||||
return -1;
|
||||
});
|
||||
|
||||
py::class_<drv::module>(m, "module");
|
||||
// enqueue
|
||||
m.def("enqueue", [](backend_t backend, uint64_t stream, uint64_t kernel,
|
||||
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
|
||||
uint64_t block_0, uint64_t block_1, uint64_t block_2,
|
||||
const std::string &args, int64_t shared_mem){
|
||||
void* args_ptr = (void*)args.data();
|
||||
size_t args_size = args.size();
|
||||
if(backend == HOST)
|
||||
host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
if(backend == CUDA)
|
||||
cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
if(backend == ROCM)
|
||||
hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
|
||||
});
|
||||
|
||||
py::class_<drv::cu_module, drv::module>(m, "cu_module")
|
||||
.def("ptx", &drv::cu_module::ptx)
|
||||
.def("cubin", [](drv::cu_module *self) { return py::bytes(self->cubin()); })
|
||||
.def("llir", &drv::cu_module::llir);
|
||||
|
||||
py::class_<drv::kernel>(m, "kernel");
|
||||
}
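Seen from Python, this runtime submodule is deliberately low-level: streams and kernel handles travel as plain integers, and scalar arguments arrive as a single packed byte string. A minimal usage sketch under stated assumptions (the import path and the two-argument kernel signature are hypothetical; the real packing and backend selection live in code_gen.py further down):

import struct

import torch
import triton._C.libtriton.triton as _triton  # import path assumed from the Python package


def launch(fun, x: torch.Tensor, n_elements: int, grid_0: int,
           num_warps: int = 4, shared_mem: int = 0):
    # pick the backend the same way code_gen.py does below
    backend = (_triton.runtime.backend.CUDA if torch.version.hip is None
               else _triton.runtime.backend.ROCM)
    # hypothetical kernel taking (pointer, int32); real kernels pack every scalar argument
    args = struct.pack("Pi", x.data_ptr(), n_elements)
    stream = torch.cuda.current_stream().cuda_stream
    _triton.runtime.enqueue(backend, stream, fun,
                            grid_0, 1, 1,           # grid
                            num_warps * 32, 1, 1,   # block
                            args, shared_mem)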
|
||||
|
||||
/*****************************************************************************/
|
||||
/* Python bindings for triton::codegen */
|
||||
/*****************************************************************************/
|
||||
typedef std::map<std::string, std::string> asm_map_t;
|
||||
|
||||
|
||||
std::tuple<uint64_t, uint64_t> cu_compile_llir(const std::string& name, size_t n_shared_bytes, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map, int cc, int version){
|
||||
// LLVM-IR -> PTX
|
||||
std::string ptx = drv::llir_to_ptx(llvm, cc, version);
|
||||
asm_map["ptx"] = ptx;
|
||||
// PTX -> Binary
|
||||
CUmodule mod = drv::ptx_to_cumodule(ptx, cc);
|
||||
// Handle to the kernel
|
||||
CUfunction fun;
|
||||
drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
|
||||
// Dynamic shared memory
|
||||
int shared_optin;
|
||||
drv::dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
|
||||
if(n_shared_bytes > 49152 && shared_optin > 49152){
|
||||
drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
|
||||
int shared_total, shared_static;
|
||||
int n_spills, n_reg;
|
||||
drv::dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
|
||||
drv::dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
|
||||
drv::dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
|
||||
drv::dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, fun);
|
||||
drv::dispatch::cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
|
||||
}
|
||||
|
||||
// record asm
|
||||
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
|
||||
}
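The shared-memory handling above is the subtle part: the PREFER_SHARED cache config and the larger dynamic limit are only requested when the kernel actually needs more than the default 48KB, since opting in unconditionally can cost L1 capacity. Restated as a small Python sketch of the decision rule (illustrative only, not part of the patch):

DEFAULT_SMEM = 49152  # 48KB: per-block shared memory available without opting in

def dynamic_smem_limit(n_shared_bytes: int, shared_optin: int, shared_static: int):
    # mirror of the condition in cu_compile_llir: opt in only when both the kernel
    # and the device exceed the 48KB default
    if n_shared_bytes > DEFAULT_SMEM and shared_optin > DEFAULT_SMEM:
        # request CU_FUNC_CACHE_PREFER_SHARED and raise the dynamic limit
        return shared_optin - shared_static
    return None  # otherwise leave the driver defaults untouched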
|
||||
|
||||
std::tuple<uint64_t, uint64_t> hip_compile_llir(const std::string& name, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map){
|
||||
// LLVM-IR -> HSA-CO
|
||||
std::string path = drv::llir_to_amdgpu(llvm, "gfx908");
|
||||
// HSA-CO -> hipModule
|
||||
hipModule_t mod = drv::amdgpu_to_hipmodule(path);
|
||||
// Handle to the kernel
|
||||
hipFunction_t fun;
|
||||
drv::dispatch::hipModuleGetFunction(&fun, mod, name.c_str());
|
||||
// record asm
|
||||
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
|
||||
}
|
||||
|
||||
void init_triton_codegen(py::module &&m) {
|
||||
m.def(
|
||||
"add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps, int num_stages, bool force_nc_cache) {
|
||||
drv::module *mod;
|
||||
drv::kernel *ker;
|
||||
size_t shared_mem;
|
||||
triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, num_stages, force_nc_cache, mod, ker, shared_mem);
|
||||
std::stringstream ss;
|
||||
ir::print(ir, ss);
|
||||
return std::make_tuple(mod, ker, shared_mem, ss.str());
|
||||
"compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages, bool force_nc_cache) {
|
||||
std::string name = ir.get_function_list()[0]->get_name();
|
||||
// record asm as we generate
|
||||
asm_map_t asm_map;
|
||||
std::ostringstream ttir;
|
||||
ir::print(ir, ttir);
|
||||
asm_map["ttir"] = ttir.str();
|
||||
llvm::LLVMContext ctx;
|
||||
if(backend == CUDA){
|
||||
// device properties
|
||||
CUdevice dev = (CUdevice)device;
|
||||
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
|
||||
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
|
||||
size_t cc = major*10 + minor;
|
||||
int version;
|
||||
drv::dispatch::cuDriverGetVersion(&version);
|
||||
// Triton-IR -> NVPTX LLVM-IR
|
||||
triton::codegen::nvidia_cu_target target(cc);
|
||||
int n_shared_bytes;
|
||||
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc, num_warps, num_stages, force_nc_cache, n_shared_bytes);
|
||||
llvm::raw_string_ostream llir(asm_map["llir"]);
|
||||
llir << *llvm;
|
||||
llir.flush();
|
||||
// LLVM-IR -> Bin
|
||||
uint64_t mod, fun;
|
||||
std::tie(mod, fun) = cu_compile_llir(name, n_shared_bytes, &*llvm, device, asm_map, cc, version);
|
||||
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
|
||||
}
|
||||
if(backend == ROCM){
|
||||
// Triton-IR -> AMDGPU LLVM-IR
|
||||
triton::codegen::amd_cl_target target;
|
||||
int n_shared_bytes;
|
||||
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70, num_warps, num_stages, force_nc_cache, n_shared_bytes);
|
||||
llvm::raw_string_ostream llir(asm_map["llir"]);
|
||||
llir << *llvm;
|
||||
llir.flush();
|
||||
// LLVM-IR -> Bin
|
||||
uint64_t mod, fun;
|
||||
std::tie(mod, fun) = hip_compile_llir(name, &*llvm, device, asm_map);
|
||||
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
|
||||
}
|
||||
},
|
||||
py::return_value_policy::take_ownership);
|
||||
}
|
||||
@@ -302,7 +445,7 @@ void init_triton_ir(py::module &&m) {
|
||||
void init_triton(py::module &m) {
|
||||
py::module subm = m.def_submodule("triton");
|
||||
init_triton_codegen(std::move(subm.def_submodule("code_gen")));
|
||||
init_triton_driver(std::move(subm.def_submodule("driver")));
|
||||
init_triton_runtime(std::move(subm.def_submodule("runtime")));
|
||||
init_triton_ir(std::move(subm.def_submodule("ir")));
|
||||
init_triton_frontend(std::move(subm.def_submodule("frontend")));
|
||||
}
|
||||
|
@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
|
||||
return kernel
|
||||
|
||||
|
||||
|
||||
|
||||
# generic test functions
|
||||
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
|
||||
SIZE = 128
|
||||
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
|
||||
# compare
|
||||
triton.testing.assert_almost_equal(z_tri, z_ref)
|
||||
# parse ptx to make sure ld/st are vectorized
|
||||
ptx = pgm.asm('ptx')
|
||||
ptx = pgm.asm['ptx']
|
||||
assert 'ld.global.v4' in ptx
|
||||
assert 'st.global.v4' in ptx
|
||||
|
||||
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
|
||||
z_ref += z[0,:][None, :]
|
||||
z_ref = z_ref.to(torch.float16)
|
||||
# compare
|
||||
ptx = pgm.asm('ptx')
|
||||
ptx = pgm.asm['ptx']
|
||||
# print(ptx)
|
||||
triton.testing.assert_almost_equal(z_tri, z_ref)
|
||||
# make sure ld/st are vectorized
|
||||
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
|
||||
# ---------------
|
||||
# test while
|
||||
# ---------------
|
||||
|
||||
# ---------------
# test noop
# ---------------
def test_noop(device='cuda'):
    @triton.jit
    def kernel(**meta):
        pass
    x = triton.testing.random((1,), dtype=torch.int32, device=device)
    kernel[(1, )](x)
|
@@ -411,9 +411,9 @@ class CodeGenerator(ast.NodeVisitor):
|
||||
|
||||
|
||||
class Binary:
|
||||
def __init__(self, module, kernel, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm):
|
||||
def __init__(self, backend, module, kernel, asm, num_warps, num_stages, force_nc_cache, shared_mem):
|
||||
# cache ir asm
|
||||
self.ir_asm = ir_asm
|
||||
self.asm = asm
|
||||
self.module = module
|
||||
self.kernel = kernel
|
||||
self.shared_mem = shared_mem
|
||||
@@ -421,29 +421,13 @@ class Binary:
|
||||
self.num_stages = num_stages
|
||||
self.force_nc_cache = force_nc_cache
|
||||
self.sass = None
|
||||
|
||||
def asm(self, mode):
|
||||
if mode == 'ttir':
|
||||
return self.ir_asm
|
||||
if mode == 'ptx':
|
||||
return self.module.ptx()
|
||||
if mode == 'sass':
|
||||
if self.sass is None:
|
||||
cubin = self.module.cubin()
|
||||
# get a temporary file name
|
||||
fd, path = tempfile.mkstemp(suffix='.cubin')
|
||||
f = open(path, 'wb')
|
||||
f.write(cubin)
|
||||
f.close()
|
||||
# extract SASS from cubin
|
||||
self.sass = extract(path, None)
|
||||
return self.sass
|
||||
if mode == 'llir':
|
||||
return self.module.llir()
|
||||
raise ValueError('Unsupported mode ' + mode)
|
||||
self.backend = backend
|
||||
|
||||
def __call__(self, stream, args, grid_0, grid_1=1, grid_2=1):
|
||||
stream.enqueue(self.kernel, grid_0, grid_1, grid_2, self.num_warps * 32, 1, 1, args, self.shared_mem)
|
||||
_triton.runtime.enqueue(self.backend, stream, self.kernel,
|
||||
grid_0, grid_1, grid_2,
|
||||
self.num_warps * 32, 1, 1,
|
||||
args, self.shared_mem)
|
||||
|
||||
|
||||
class CompilationError(Exception):
|
||||
@@ -548,10 +532,15 @@ class Kernel:
|
||||
raise e
|
||||
raise CompilationError(self.fn.src, node, e)
|
||||
# Compile to machine code
|
||||
mod, ker, shared_mem, ir_asm = _triton.code_gen.add_passes_to_emit_bin(generator.module, device, num_warps, num_stages, force_nc_cache)
|
||||
if shared_mem > device.max_shared_memory():
|
||||
raise OutOfResources(shared_mem, device.max_shared_memory(), "shared memory")
|
||||
return Binary(mod, ker, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm)
|
||||
if torch.version.hip is None:
|
||||
backend = _triton.runtime.backend.CUDA
|
||||
else:
|
||||
backend = _triton.runtime.backend.ROCM
|
||||
mod, ker, asm, shared_mem = _triton.code_gen.compile_ttir(backend, generator.module, device, num_warps, num_stages, force_nc_cache)
|
||||
max_shared_memory = _triton.runtime.max_shared_memory(backend, device)
|
||||
if shared_mem > max_shared_memory:
|
||||
raise OutOfResources(shared_mem, max_shared_memory, "shared memory")
|
||||
return Binary(backend, mod, ker, asm, num_warps, num_stages, force_nc_cache, shared_mem)
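With the asm dict stored on the Binary, callers now read binary.asm['ttir'], ['llir'] or ['ptx'] directly instead of calling the removed asm(mode) method, and SASS is no longer extracted lazily here. A small illustrative helper, assuming `pgm` is the Binary returned by a kernel launch as in the tests above:

def dump_asm(pgm):
    # `pgm` is the Binary returned by a launch, e.g. pgm = kernel[grid](x, ...)
    print(pgm.asm["ttir"])      # Triton-IR, recorded for every backend
    print(pgm.asm["llir"])      # LLVM-IR after the codegen passes
    if "ptx" in pgm.asm:        # only recorded on the CUDA path
        print(pgm.asm["ptx"])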
|
||||
|
||||
def __call__(self, *wargs, grid, num_warps=4, num_stages=2, force_nc_cache=False, **meta):
|
||||
# device inference
|
||||
@@ -571,19 +560,20 @@ class Kernel:
|
||||
" Only CUDA is supported at the moment")
|
||||
|
||||
device = torch.device('cuda', torch.cuda.current_device())
|
||||
tt_device = _triton.driver.cu_device(device.index, False)
|
||||
if len(set(device_ids)) != 1 or device_ids[0] != device.index:
|
||||
device_ty = device.type
|
||||
device_idx = device.index
|
||||
if len(set(device_ids)) != 1 or device_ids[0] != device_idx:
|
||||
# try to enable P2P communication
|
||||
for arg_idx, dst_idx in zip(tensor_idxs, device_ids):
|
||||
if dst_idx != device.index:
|
||||
if dst_idx != device_idx:
|
||||
try:
|
||||
tt_device.enable_peer_access(wargs[arg_idx].data_ptr())
|
||||
_triton.runtime.enable_peer_access(self.backend, wargs[arg_idx].data_ptr())
|
||||
except RuntimeError as e:
|
||||
raise RuntimeError("Cannot enable P2P access from device {} to device {}: {}"
|
||||
.format(device.index, dst_idx, str(e)))
|
||||
.format(device_idx, dst_idx, str(e)))
|
||||
|
||||
# enqueue kernel on the current device
|
||||
torch.cuda.set_device(device.index)
|
||||
torch.cuda.set_device(device_idx)
|
||||
# attributes
|
||||
args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
|
||||
attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) if isinstance(a, int)}
|
||||
@@ -594,12 +584,12 @@ class Kernel:
|
||||
attr_key = frozenset(attributes.items())
|
||||
meta_key = frozenset(meta.items())
|
||||
const_key = frozenset(constants.items())
|
||||
key = (device.type, device.index, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
|
||||
key = (device_ty, device_idx, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
|
||||
cache = self.fn.cache
|
||||
if key not in cache:
|
||||
# compile and cache configuration if necessary
|
||||
cache[key] = self._compile(
|
||||
*wargs, device=tt_device, attributes=attributes,
|
||||
*wargs, device=device_idx, attributes=attributes,
|
||||
num_warps=num_warps, num_stages=num_stages, force_nc_cache=force_nc_cache,
|
||||
constants=constants, **meta
|
||||
)
|
||||
@@ -608,8 +598,7 @@ class Kernel:
|
||||
params = struct.pack(fmt, *args)
|
||||
# enqueue cached function into stream
|
||||
binary = cache[key]
|
||||
cu_stream = torch.cuda.current_stream(device.index).cuda_stream
|
||||
stream = _triton.driver.cu_stream(cu_stream, False)
|
||||
stream = torch.cuda.current_stream(device_idx).cuda_stream
|
||||
grid = grid(meta) if hasattr(grid, '__call__') else grid
|
||||
binary(stream, params, *grid)
|
||||
return binary
|
||||
|
@@ -64,7 +64,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
|
||||
# - each torch.tensor object is implicitly converted into a pointer to its first element.
|
||||
# - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
# - don't forget to pass meta-parameters as keyword arguments
|
||||
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
|
||||
pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
|
||||
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
|
||||
# running asynchronously at this point.
|
||||
return output
|
||||
@@ -85,6 +85,7 @@ print(
|
||||
f'The maximum difference between torch and triton is '
|
||||
f'{torch.max(torch.abs(output_torch - output_triton))}'
|
||||
)
|
||||
exit()
|
||||
|
||||
# %%
|
||||
# Seems like we're good to go!
|