[GENERAL] Removed deprecated driver files and added basic compatibility with ROCm (#268)

- Removed the driver module -- the accelerator runtime is now handled by PyTorch
- Added basic support for ROCm based on @micmelesse's PR -- Triton can now execute an empty kernel on AMD devices without any compile-time changes
- Now only using PREFER_SHARED for kernels when shared-memory usage is greater than 49k; otherwise L1 performance can suffer for broadcast tensors (see the sketch below)
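
For reference, a minimal sketch of the PREFER_SHARED heuristic from the last bullet, written against the CUDA driver API. The helper name and the 49152-byte constant are illustrative assumptions, not the verbatim patch:

#include <cuda.h>

// Hypothetical helper: only kernels that actually need a large shared-memory
// carveout opt in; all other kernels keep the default cache config so the
// full L1 stays available for broadcast-heavy workloads.
void configure_cache(CUfunction fn, size_t shared_mem_bytes) {
  if (shared_mem_bytes > 49152)  // ~49k of shared memory
    cuFuncSetCacheConfig(fn, CU_FUNC_CACHE_PREFER_SHARED);
}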
Philippe Tillet
2021-09-09 00:04:28 -07:00
committed by GitHub
parent 8bedcce9be
commit 94c83d30ce
47 changed files with 1376 additions and 30232 deletions

View File

@@ -31,7 +31,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_FORMAT_MACROS -std=gnu++17")
# LLVM
##########
if("${LLVM_LIBRARY_DIR}" STREQUAL "")
-find_package(LLVM 11 REQUIRED COMPONENTS "nvptx")
+find_package(LLVM 11 REQUIRED COMPONENTS "nvptx;amdgpu")
message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}")
if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "10.14")
@@ -39,14 +39,52 @@ if("${LLVM_LIBRARY_DIR}" STREQUAL "")
# sometimes we don't want to use llvm-config, since it may have been downloaded for some specific linux distros
else()
set(LLVM_LDFLAGS "-L${LLVM_LIBRARY_DIR}")
-set(LLVM_LIBRARIES libLLVMNVPTXCodeGen.a libLLVMSelectionDAG.a libLLVMipo.a libLLVMInstrumentation.a
-    libLLVMVectorize.a libLLVMLinker.a libLLVMIRReader.a libLLVMAsmParser.a libLLVMFrontendOpenMP.a
-    libLLVMAsmPrinter.a libLLVMDebugInfoDWARF.a libLLVMCodeGen.a libLLVMTarget.a libLLVMScalarOpts.a
-    libLLVMInstCombine.a libLLVMAggressiveInstCombine.a libLLVMTransformUtils.a libLLVMBitWriter.a
-    libLLVMAnalysis.a libLLVMProfileData.a libLLVMObject.a libLLVMTextAPI.a libLLVMMCParser.a
-    libLLVMBitReader.a libLLVMCore.a libLLVMRemarks.a libLLVMBitstreamReader.a libLLVMNVPTXDesc.a
-    libLLVMMC.a libLLVMDebugInfoCodeView.a libLLVMDebugInfoMSF.a libLLVMBinaryFormat.a libLLVMNVPTXInfo.a
-    libLLVMSupport.a libLLVMDemangle.a)
+set(LLVM_LIBRARIES
+    libLLVMNVPTXCodeGen.a
+    libLLVMNVPTXDesc.a
+    libLLVMNVPTXInfo.a
+    libLLVMAMDGPUDisassembler.a
+    libLLVMMCDisassembler.a
+    libLLVMAMDGPUCodeGen.a
+    libLLVMMIRParser.a
+    libLLVMGlobalISel.a
+    libLLVMSelectionDAG.a
+    libLLVMipo.a
+    libLLVMInstrumentation.a
+    libLLVMVectorize.a
+    libLLVMLinker.a
+    libLLVMIRReader.a
+    libLLVMAsmParser.a
+    libLLVMFrontendOpenMP.a
+    libLLVMAsmPrinter.a
+    libLLVMDebugInfoDWARF.a
+    libLLVMCodeGen.a
+    libLLVMTarget.a
+    libLLVMScalarOpts.a
+    libLLVMInstCombine.a
+    libLLVMAggressiveInstCombine.a
+    libLLVMTransformUtils.a
+    libLLVMBitWriter.a
+    libLLVMAnalysis.a
+    libLLVMProfileData.a
+    libLLVMObject.a
+    libLLVMTextAPI.a
+    libLLVMBitReader.a
+    libLLVMAMDGPUAsmParser.a
+    libLLVMMCParser.a
+    libLLVMAMDGPUDesc.a
+    libLLVMAMDGPUUtils.a
+    libLLVMMC.a
+    libLLVMDebugInfoCodeView.a
+    libLLVMDebugInfoMSF.a
+    libLLVMCore.a
+    libLLVMRemarks.a
+    libLLVMBitstreamReader.a
+    libLLVMBinaryFormat.a
+    libLLVMAMDGPUInfo.a
+    libLLVMSupport.a
+    libLLVMDemangle.a
+)
endif()
include_directories("${LLVM_INCLUDE_DIRS}")
@@ -82,4 +120,4 @@ if(BUILD_PYTHON_MODULE)
set(PYTHON_LDFLAGS "-undefined dynamic_lookup -flto")
endif()
target_link_libraries(triton ${CUTLASS_LIBRARIES} ${PYTHON_LDFLAGS})
endif()

View File

@@ -4,8 +4,17 @@
#include <memory>
namespace llvm{
class Module;
class LLVMContext;
}
namespace triton{
namespace codegen {
class target;
}
namespace ir{
class module;
}
@@ -21,8 +30,10 @@ namespace codegen{
// TODO:
// There should be a proper pass manager there!
-void add_passes_to_emit_bin(ir::module &ir, driver::device* dev, int num_warps, int num_stages, bool force_nc_cache,
-                            driver::module*& mod, driver::kernel*& ker, size_t& shared_mem);
+std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx,
+                                                     codegen::target* target,
+                                                     int sm, int num_warps,
+                                                     int num_stages, bool force_nc_cache, int &shared_static);
}
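
A hedged sketch of how a caller might drive the new signature; the surrounding glue (IR module, target construction, parameter values) is assumed, since the diff only shows the header:

// Assumed caller-side flow: codegen now hands back an llvm::Module, and the
// backend-specific compile/load step happens outside this function.
void compile(triton::ir::module &ir_mod, triton::codegen::target *tgt) {
  llvm::LLVMContext ctx;
  int shared_static = 0;
  std::unique_ptr<llvm::Module> mod = triton::codegen::add_passes_to_emit_bin(
      ir_mod, ctx, tgt, /*sm=*/80, /*num_warps=*/4,
      /*num_stages=*/3, /*force_nc_cache=*/false, shared_static);
  // 'mod' is then lowered to PTX or AMDGPU ISA and loaded by the runtime.
}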

View File

@@ -1,137 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_BACKEND_H_
#define _TRITON_DRIVER_BACKEND_H_
#include <map>
#include <list>
#include <vector>
#include "triton/driver/context.h"
namespace llvm
{
class Module;
}
namespace triton
{
namespace driver
{
class buffer;
class stream;
class device;
class context;
class platform;
class module;
class kernel;
struct backend
{
// platforms
class platforms
{
friend class backend;
private:
static void init();
public:
static void get(std::vector<driver::platform*> &results);
private:
static std::vector<driver::platform*> cache_;
};
// devices
class devices
{
friend class backend;
private:
static void init(const std::vector<platform *> &platforms);
public:
static void get(std::vector<driver::device*>& devs);
private:
static std::vector<driver::device*> cache_;
};
// modules
class modules
{
friend class backend;
public:
static void release();
private:
static std::map<std::tuple<driver::stream*, std::string>, driver::module*> cache_;
};
// kernels
class kernels
{
friend class backend;
public:
static void release();
static driver::kernel* get(driver::module* mod, const std::string & name);
private:
static std::map<std::tuple<module*, std::string>, driver::kernel*> cache_;
};
// contexts
class contexts
{
friend class backend;
private:
static void init(const std::vector<device *> &);
static void release();
public:
static driver::context* get_default();
static driver::context* import(CUcontext ctx)
{
for(driver::context* x: cache_){
driver::cu_context* cu_x = (driver::cu_context*)x;
if(*cu_x->cu()==ctx)
return x;
}
cache_.emplace_back(new driver::cu_context(ctx, false));
return cache_.back();
}
static void get(std::list<driver::context*> &);
private:
static std::list<driver::context*> cache_;
};
// streams
class streams
{
friend class backend;
private:
static void init(std::list<context*> const &);
static void release();
public:
static void get(driver::context*, std::vector<driver::stream *> &streams);
static driver::stream* get(driver::context*, unsigned int id = 0);
static driver::stream* get_default();
private:
static std::map<driver::context*, std::vector<driver::stream*> > cache_;
};
static void init();
static void release();
static void synchronize(triton::driver::context *);
static unsigned int default_device;
};
}
}
#endif

View File

@@ -1,48 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_BUFFER_H_
#define _TRITON_DRIVER_BUFFER_H_
#include "triton/driver/handle.h"
#include "triton/driver/context.h"
namespace triton
{
namespace driver
{
class stream;
// Base
class buffer : public polymorphic_resource<CUdeviceptr, host_buffer_t> {
public:
buffer(size_t size, CUdeviceptr cl, bool take_ownership);
buffer(size_t size, host_buffer_t hst, bool take_ownership);
uintptr_t addr_as_uintptr_t();
static buffer* create(driver::context* ctx, size_t size);
size_t size();
protected:
size_t size_;
};
// CPU
class host_buffer: public buffer
{
public:
host_buffer(size_t size);
};
// CUDA
class cu_buffer: public buffer
{
public:
cu_buffer(size_t size);
cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership);
void set_zero(triton::driver::stream *queue, size_t size);
};
}
}
#endif

View File

@@ -1,50 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_CONTEXT_H_
#define _TRITON_DRIVER_CONTEXT_H_
#include "triton/driver/device.h"
#include "triton/driver/handle.h"
namespace triton
{
namespace driver
{
class context: public polymorphic_resource<CUcontext, host_context_t>{
protected:
static std::string get_cache_path();
public:
context(driver::device *dev, CUcontext cu, bool take_ownership);
context(driver::device *dev, host_context_t hst, bool take_ownership);
driver::device* device() const;
std::string const & cache_path() const;
// factory methods
static context* create(driver::device *dev);
protected:
driver::device* dev_;
std::string cache_path_;
};
// Host
class host_context: public context {
public:
host_context(driver::device* dev);
};
// CUDA
class cu_context: public context {
private:
static CUdevice get_device_of(CUcontext);
public:
//Constructors
cu_context(CUcontext cu, bool take_ownership = true);
cu_context(driver::device* dev);
};
}
}
#endif

View File

@@ -1,82 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_DEVICE_H_
#define _TRITON_DRIVER_DEVICE_H_
#include "triton/driver/platform.h"
#include "triton/driver/handle.h"
namespace triton
{
namespace codegen
{
class target;
}
namespace driver
{
class context;
// Base device
class device: public polymorphic_resource<CUdevice, host_device_t>{
public:
using polymorphic_resource::polymorphic_resource;
virtual size_t max_threads_per_block() const = 0;
virtual size_t max_shared_memory() const = 0;
virtual std::unique_ptr<codegen::target> make_target() const = 0;
};
// Host device
class host_device: public device {
public:
host_device(): device(host_device_t(), true){ }
size_t max_threads_per_block() const { return 1; }
size_t max_shared_memory() const { return 0; }
std::unique_ptr<codegen::target> make_target() const;
};
// CUDA device
class cu_device: public device {
private:
//Metaprogramming helper to get cuda info from attribute
template<CUdevice_attribute attr>
int cuGetInfo() const;
inline nvmlDevice_t nvml_device() const;
public:
cu_device(CUdevice cu = CUdevice(), bool take_ownership = true): device(cu, take_ownership){}
// Informations
std::string infos() const;
size_t address_bits() const;
std::vector<size_t> max_block_dim() const;
size_t warp_size() const;
// Compute Capability
void interpret_as(int cc);
int compute_capability() const;
// Identifier
std::string name() const;
std::string pci_bus_id() const;
// Clocks
size_t current_sm_clock() const;
size_t current_mem_clock() const;
size_t max_threads_per_block() const;
size_t max_shared_memory() const;
size_t max_sm_clock() const;
size_t max_mem_clock() const;
void set_max_clock();
void enable_peer_access(CUdeviceptr peer_mem_ptr) const;
// Target
std::unique_ptr<codegen::target> make_target() const;
private:
std::shared_ptr<int> interpreted_as_;
};
}
}
#endif

View File

@@ -10,6 +10,10 @@
#include "triton/external/CUDA/cuda.h" #include "triton/external/CUDA/cuda.h"
#include "triton/external/CUDA/nvml.h" #include "triton/external/CUDA/nvml.h"
//// HIP backend
//#define __HIP_PLATFORM_AMD__
#include "triton/external/hip.h"
//Exceptions
#include <iostream>
#include <stdexcept>
@@ -28,6 +32,7 @@ class cu_context;
template<class T> void check(T){}
void check(CUresult err);
void check(hipError_t err);
class dispatch
{
@@ -58,17 +63,18 @@ protected:
}
public:
static void release();
// Nvidia
static bool nvmlinit();
static bool cuinit();
// AMD
static bool hipinit();
/* ------------------- *
 * CUDA
 * ------------------- */
// context management
static CUresult cuInit(unsigned int Flags);
static CUresult cuCtxGetCurrent(CUcontext *pctx);
static CUresult cuCtxSetCurrent(CUcontext ctx);
static CUresult cuCtxDestroy_v2(CUcontext ctx);
static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
@@ -128,6 +134,55 @@ public:
static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
/* ------------------- *
* HIP
* ------------------- */
// context management
static hipError_t hipInit(unsigned int Flags);
static hipError_t hipCtxDestroy(hipCtx_t ctx);
static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
static hipError_t hipCtxGetDevice(hipDevice_t* result);
static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
static hipError_t hipDriverGetVersion(int *driverVersion);
// device management
static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
static hipError_t hipGetDeviceCount(int *count);
// module management
static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
static hipError_t hipModuleUnload(hipModule_t hmod);
static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
// stream management
static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
static hipError_t hipStreamSynchronize(hipStream_t hStream);
static hipError_t hipStreamDestroy(hipStream_t hStream);
static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
// function management
static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
// memory management
static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
static hipError_t hipFree(hipDeviceptr_t dptr);
static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
// event management
static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
static hipError_t hipEventDestroy(hipEvent_t hEvent);
private:
@@ -135,6 +190,7 @@ private:
// Libraries
static void* cuda_;
static void* nvml_;
static void* hip_;
/* ------------------- *
@@ -194,9 +250,6 @@ private:
static void* cuEventRecord_;
static void* cuEventDestroy_v2_;
/* ------------------- *
 * NVML
 * ------------------- */
@@ -205,6 +258,55 @@ private:
static void* nvmlDeviceGetClockInfo_;
static void* nvmlDeviceGetMaxClockInfo_;
static void* nvmlDeviceSetApplicationsClocks_;
/* ------------------- *
* HIP
* ------------------- */
// context management
static void* hipInit_;
static void* hipCtxDestroy_;
static void* hipCtxCreate_;
static void* hipCtxPushCurrent_;
static void* hipCtxPopCurrent_;
static void* hipCtxGetDevice_;
static void* hipCtxEnablePeerAccess_;
static void* hipDriverGetVersion_;
// device management
static void* hipGetDevice_;
static void* hipDeviceGetName_;
static void* hipDeviceGetPCIBusId_;
static void* hipDeviceGetAttribute_;
static void* hipGetDeviceCount_;
// module management
static void* hipModuleGetGlobal_;
static void* hipModuleLoad_;
static void* hipModuleLoadData_;
static void* hipModuleUnload_;
static void* hipModuleLoadDataEx_;
static void* hipModuleGetFunction_;
// stream management
static void* hipStreamCreate_;
static void* hipStreamSynchronize_;
static void* hipStreamDestroy_;
static void* hipModuleLaunchKernel_;
// function management
static void* hipFuncGetAttributes_;
static void* hipFuncSetAttribute_;
static void* hipFuncSetCacheConfig_;
// memory management
static void* hipMalloc_;
static void* hipPointerGetAttribute_;
static void* hipMemsetD8Async_;
static void* hipMemcpyDtoH_;
static void* hipFree_;
static void* hipMemcpyDtoHAsync_;
static void* hipMemcpyHtoDAsync_;
static void* hipMemcpyHtoD_;
// event management
static void* hipEventCreate_;
static void* hipEventElapsedTime_;
static void* hipEventRecord_;
static void* hipEventDestroy_;
};
}
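
The hip* declarations above follow the same lazy-binding pattern that dispatch already uses for the cuda_ and nvml_ handles; a generic, hedged illustration of that pattern (the library name and helper are assumptions, not the verbatim implementation):

#include <dlfcn.h>

static void* hip_ = nullptr;

// Load the HIP runtime on first use.
static bool hipinit() {
  if (!hip_)
    hip_ = dlopen("libamdhip64.so", RTLD_LAZY);  // assumed library name
  return hip_ != nullptr;
}

// Resolve a symbol once, cache the raw pointer, and hand it back at the
// declared signature -- mirroring the cuda_/nvml_ dispatch slots.
template <typename FnT>
static FnT hip_symbol(void*& cache, const char* name) {
  if (!cache && hipinit())
    cache = dlsym(hip_, name);
  return reinterpret_cast<FnT>(cache);
}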

View File

@@ -141,6 +141,78 @@ namespace triton
TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow ,"runtime fp overflow");
}
namespace hip
{
class base: public std::exception{};
#define TRITON_CREATE_HIP_EXCEPTION(name, msg) class name: public base { public:const char * what() const throw(){ return "HIP: Error- " msg; } }
TRITON_CREATE_HIP_EXCEPTION(invalid_value ,"invalid value");
TRITON_CREATE_HIP_EXCEPTION(out_of_memory ,"out of memory");
TRITON_CREATE_HIP_EXCEPTION(not_initialized ,"not initialized");
TRITON_CREATE_HIP_EXCEPTION(deinitialized ,"deinitialized");
TRITON_CREATE_HIP_EXCEPTION(profiler_disabled ,"profiler disabled");
TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized ,"profiler not initialized");
TRITON_CREATE_HIP_EXCEPTION(profiler_already_started ,"profiler already started");
TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped ,"profiler already stopped");
TRITON_CREATE_HIP_EXCEPTION(no_device ,"no device");
TRITON_CREATE_HIP_EXCEPTION(invalid_device ,"invalid device");
TRITON_CREATE_HIP_EXCEPTION(invalid_image ,"invalid image");
TRITON_CREATE_HIP_EXCEPTION(invalid_context ,"invalid context");
TRITON_CREATE_HIP_EXCEPTION(context_already_current ,"context already current");
TRITON_CREATE_HIP_EXCEPTION(map_failed ,"map failed");
TRITON_CREATE_HIP_EXCEPTION(unmap_failed ,"unmap failed");
TRITON_CREATE_HIP_EXCEPTION(array_is_mapped ,"array is mapped");
TRITON_CREATE_HIP_EXCEPTION(already_mapped ,"already mapped");
TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu ,"no binary for gpu");
TRITON_CREATE_HIP_EXCEPTION(already_acquired ,"already acquired");
TRITON_CREATE_HIP_EXCEPTION(not_mapped ,"not mapped");
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array ,"not mapped as array");
TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer ,"not mapped as pointer");
TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable ,"ecc uncorrectable");
TRITON_CREATE_HIP_EXCEPTION(unsupported_limit ,"unsupported limit");
TRITON_CREATE_HIP_EXCEPTION(context_already_in_use ,"context already in use");
TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported ,"peer access unsupported");
TRITON_CREATE_HIP_EXCEPTION(invalid_ptx ,"invalid ptx");
TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context ,"invalid graphics context");
TRITON_CREATE_HIP_EXCEPTION(invalid_source ,"invalid source");
TRITON_CREATE_HIP_EXCEPTION(file_not_found ,"file not found");
TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found ,"shared object symbol not found");
TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed ,"shared object init failed");
TRITON_CREATE_HIP_EXCEPTION(operating_system ,"operating system");
TRITON_CREATE_HIP_EXCEPTION(invalid_handle ,"invalid handle");
TRITON_CREATE_HIP_EXCEPTION(not_found ,"not found");
TRITON_CREATE_HIP_EXCEPTION(not_ready ,"not ready");
TRITON_CREATE_HIP_EXCEPTION(illegal_address ,"illegal address");
TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources ,"launch out of resources");
TRITON_CREATE_HIP_EXCEPTION(launch_timeout ,"launch timeout");
TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing ,"launch incompatible texturing");
TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled ,"peer access already enabled");
TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled ,"peer access not enabled");
TRITON_CREATE_HIP_EXCEPTION(primary_context_active ,"primary context active");
TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed ,"context is destroyed");
TRITON_CREATE_HIP_EXCEPTION(assert_error ,"assert");
TRITON_CREATE_HIP_EXCEPTION(too_many_peers ,"too many peers");
TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered ,"host memory already registered");
TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered ,"host memory not registered");
TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error ,"hardware stack error");
TRITON_CREATE_HIP_EXCEPTION(illegal_instruction ,"illegal instruction");
TRITON_CREATE_HIP_EXCEPTION(misaligned_address ,"misaligned address");
TRITON_CREATE_HIP_EXCEPTION(invalid_address_space ,"invalid address space");
TRITON_CREATE_HIP_EXCEPTION(invalid_pc ,"invalid pc");
TRITON_CREATE_HIP_EXCEPTION(launch_failed ,"launch failed");
TRITON_CREATE_HIP_EXCEPTION(not_permitted ,"not permitted");
TRITON_CREATE_HIP_EXCEPTION(not_supported ,"not supported");
TRITON_CREATE_HIP_EXCEPTION(invalid_symbol ,"invalid symbol");
TRITON_CREATE_HIP_EXCEPTION(unknown ,"unknown");
#undef TRITON_CREATE_HIP_EXCEPTION
}
}
}
}
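
Presumably the new check(hipError_t) overload declared in dispatch.h maps HIP error codes onto these exception types, mirroring the existing CUDA version; a hedged sketch (the actual switch lives in a .cc file this excerpt does not show):

void check(hipError_t err) {
  switch (err) {
    case hipSuccess:             break;
    case hipErrorInvalidValue:   throw exception::hip::invalid_value();
    case hipErrorOutOfMemory:    throw exception::hip::out_of_memory();
    case hipErrorNotInitialized: throw exception::hip::not_initialized();
    // ... one case per error code listed above ...
    default:                     throw exception::hip::unknown();
  }
}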

View File

@@ -1,146 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_HANDLE_H_
#define _TRITON_DRIVER_HANDLE_H_
#include <memory>
#include <map>
#include <iostream>
#include <functional>
#include <type_traits>
#include "triton/driver/dispatch.h"
#include "llvm/ExecutionEngine/JITSymbol.h"
#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
#include "llvm/ExecutionEngine/Orc/Core.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "triton/tools/thread_pool.h"
namespace llvm
{
class ExecutionEngine;
class Function;
}
namespace triton
{
namespace driver
{
enum backend_t {
CUDA,
Host
};
// Host handles
struct host_platform_t{
};
struct host_device_t{
};
struct host_context_t{
};
struct host_stream_t{
std::shared_ptr<ThreadPool> pool;
std::shared_ptr<std::vector<std::future<void>>> futures;
std::vector<std::shared_ptr<char*>> args;
};
struct host_module_t{
std::string error;
llvm::ExecutionEngine* engine;
std::map<std::string, llvm::Function*> functions;
void(*fn)(char**, int32_t, int32_t, int32_t);
llvm::orc::ExecutionSession* ES;
llvm::orc::RTDyldObjectLinkingLayer* ObjectLayer;
llvm::orc::IRCompileLayer* CompileLayer;
llvm::DataLayout* DL;
llvm::orc::MangleAndInterner* Mangle;
llvm::orc::ThreadSafeContext* Ctx;
llvm::orc::JITDylib *MainJD;
};
struct host_function_t{
llvm::Function* fn;
};
struct host_buffer_t{
char* data;
};
// Extra CUDA handles
struct cu_event_t{
operator bool() const { return first && second; }
CUevent first;
CUevent second;
};
struct CUPlatform{
CUPlatform() : status_(dispatch::cuInit(0)) { }
operator bool() const { return status_; }
private:
CUresult status_;
};
template<class T, class CUType>
class handle_interface{
public:
//Accessors
operator CUType() const { return *(((T*)this)->cu().h_); }
//Comparison
bool operator==(handle_interface const & y) { return (CUType)(*this) == (CUType)(y); }
bool operator!=(handle_interface const & y) { return (CUType)(*this) != (CUType)(y); }
bool operator<(handle_interface const & y) { return (CUType)(*this) < (CUType)(y); }
};
template<class T>
class handle{
public:
template<class, class> friend class handle_interface;
public:
//Constructors
handle(T h, bool take_ownership = true);
handle();
~handle();
T& operator*() { return *h_; }
T const & operator*() const { return *h_; }
T* operator->() const { return h_.get(); }
protected:
std::shared_ptr<T> h_;
bool has_ownership_;
};
template<class CUType, class HostType>
class polymorphic_resource {
public:
polymorphic_resource(CUType cu, bool take_ownership): cu_(cu, take_ownership), backend_(CUDA){}
polymorphic_resource(HostType hst, bool take_ownership): hst_(hst, take_ownership), backend_(Host){}
virtual ~polymorphic_resource() { }
handle<CUType> cu() { return cu_; }
handle<HostType> hst() { return hst_; }
const handle<CUType>& cu() const { return cu_; }
const handle<HostType>& hst() const { return hst_; }
backend_t backend() { return backend_; }
protected:
handle<CUType> cu_;
handle<HostType> hst_;
backend_t backend_;
};
}
}
#endif

View File

@@ -1,53 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_KERNEL_H_
#define _TRITON_DRIVER_KERNEL_H_
#include "triton/driver/module.h"
#include "triton/driver/handle.h"
#include <memory>
namespace llvm
{
class GenericValue;
}
namespace triton
{
namespace driver
{
class cu_buffer;
// Base
class kernel: public polymorphic_resource<CUfunction, host_function_t> {
public:
kernel(driver::module* program, CUfunction fn, bool has_ownership);
kernel(driver::module* program, host_function_t fn, bool has_ownership);
driver::module* module();
static kernel* create(driver::module* program, const char* name);
private:
driver::module* program_;
};
// Host
class host_kernel: public kernel {
public:
//Constructors
host_kernel(driver::module* program, const char* name);
};
// CUDA
class cu_kernel: public kernel {
public:
//Constructors
cu_kernel(driver::module* program, const char * name);
};
}
}
#endif

View File

@@ -0,0 +1,18 @@
#include <string>
#include "triton/driver/dispatch.h"
namespace llvm{
class Module;
}
namespace triton{
namespace driver{
void init_llvm();
std::string llir_to_ptx(llvm::Module* module, int cc, int version);
CUmodule ptx_to_cumodule(const std::string& ptx, int cc);
std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);
hipModule_t amdgpu_to_hipmodule(const std::string& path);
}
}
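
A hedged usage sketch for these new entry points; the compute capability, PTX version, and GCN architecture string are illustrative assumptions:

// NVIDIA path: LLVM-IR -> PTX -> CUmodule
triton::driver::init_llvm();
std::string ptx = triton::driver::llir_to_ptx(mod.get(), /*cc=*/80, /*version=*/70);
CUmodule cumod = triton::driver::ptx_to_cumodule(ptx, /*cc=*/80);

// AMD path: LLVM-IR -> AMDGPU binary on disk -> hipModule_t
std::string hsaco_path = triton::driver::llir_to_amdgpu(mod.get(), "gfx906");
hipModule_t hipmod = triton::driver::amdgpu_to_hipmodule(hsaco_path);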

View File

@@ -1,84 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_MODULE_H_
#define _TRITON_DRIVER_MODULE_H_
#include <map>
#include "triton/driver/handle.h"
#include "triton/driver/context.h"
#include "triton/driver/buffer.h"
namespace llvm
{
class Module;
template<class T>
class SmallVectorImpl;
}
namespace triton
{
namespace driver
{
class cu_context;
class cu_device;
// Base
class module: public polymorphic_resource<CUmodule, host_module_t> {
protected:
void init_llvm();
enum file_type_t{
Object,
Assembly
};
public:
module(CUmodule mod, bool has_ownership);
module(host_module_t mod, bool has_ownership);
static module* create(driver::device* device, std::unique_ptr<llvm::Module> src);
void compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
const std::string &proc, std::string layout,
llvm::SmallVectorImpl<char> &buffer,
const std::string &features,
file_type_t file_type);
virtual std::unique_ptr<buffer> symbol(const char * name) const = 0;
int spilled() const { return spilled_; }
protected:
int spilled_;
};
// CPU
class host_module: public module{
public:
host_module(std::unique_ptr<llvm::Module> module);
std::unique_ptr<buffer> symbol(const char * name) const;
};
// CUDA
class cu_module: public module {
std::string compile_llvm_module(llvm::Module* module, driver::device* device);
void init_from_ptx(const std::string& ptx, cu_device *device);
public:
cu_module(driver::device* device, std::unique_ptr<llvm::Module> module);
cu_module(driver::device* device, const std::string& source);
std::unique_ptr<buffer> symbol(const char * name) const;
std::string llir() const { return llir_; }
const std::string& ptx() const { return ptx_; }
const std::string& cubin() const { return cubin_; }
private:
std::string ptx_;
std::string cubin_;
std::string llir_;
};
}
}
#endif

View File

@@ -1,58 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_PLATFORM_H_
#define _TRITON_DRIVER_PLATFORM_H_
#include <vector>
#include <string>
#include "triton/driver/handle.h"
namespace triton
{
namespace driver
{
class device;
class platform
{
public:
// Constructor
platform(const std::string& name): name_(name){ }
// Accessors
std::string name() const { return name_; }
// Virtual methods
virtual std::string version() const = 0;
virtual void devices(std::vector<driver::device *> &devices) const = 0;
private:
std::string name_;
};
// CUDA
class cu_platform: public platform
{
public:
cu_platform(): platform("CUDA") { }
std::string version() const;
void devices(std::vector<driver::device*> &devices) const;
private:
handle<CUPlatform> cu_;
};
// Host
class host_platform: public platform
{
public:
host_platform(): platform("CPU") { }
std::string version() const;
void devices(std::vector<driver::device*> &devices) const;
};
}
}
#endif

View File

@@ -1,68 +0,0 @@
#pragma once
#ifndef _TRITON_DRIVER_STREAM_H_
#define _TRITON_DRIVER_STREAM_H_
#include <map>
#include "triton/driver/context.h"
#include "triton/driver/device.h"
#include "triton/driver/handle.h"
#include "triton/driver/buffer.h"
namespace triton
{
namespace driver
{
class kernel;
class event;
class Range;
class cu_buffer;
// Base
class stream: public polymorphic_resource<CUstream, host_stream_t> {
public:
stream(CUstream, bool has_ownership);
stream(host_stream_t, bool has_ownership);
// factory
static driver::stream* create(backend_t backend);
// methods
virtual void synchronize() = 0;
virtual void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem = 0) = 0;
virtual void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr) = 0;
virtual void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr) = 0;
// template helpers
template<class T> void write(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T> const & x)
{ write(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
template<class T> void read(driver::buffer* buf, bool blocking, std::size_t offset, std::vector<T>& x)
{ read(buf, blocking, offset, x.size()*sizeof(T), x.data()); }
};
// Host
class host_stream: public stream {
public:
host_stream();
void synchronize();
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
};
// CUDA
class cu_stream: public stream {
public:
cu_stream(CUstream str, bool take_ownership);
cu_stream();
void synchronize();
void enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem);
void write(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void const* ptr);
void read(driver::buffer* buf, bool blocking, std::size_t offset, std::size_t size, void* ptr);
};
}
}
#endif

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -1,131 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_D3D10_H
#define __OPENCL_CL_D3D10_H
#include <d3d10.h>
#include "cl.h"
#include "cl_platform.h"
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d10_sharing */
#define cl_khr_d3d10_sharing 1
typedef cl_uint cl_d3d10_device_source_khr;
typedef cl_uint cl_d3d10_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D10_DEVICE_KHR -1002
#define CL_INVALID_D3D10_RESOURCE_KHR -1003
#define CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR -1004
#define CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR -1005
/* cl_d3d10_device_source_nv */
#define CL_D3D10_DEVICE_KHR 0x4010
#define CL_D3D10_DXGI_ADAPTER_KHR 0x4011
/* cl_d3d10_device_set_nv */
#define CL_PREFERRED_DEVICES_FOR_D3D10_KHR 0x4012
#define CL_ALL_DEVICES_FOR_D3D10_KHR 0x4013
/* cl_context_info */
#define CL_CONTEXT_D3D10_DEVICE_KHR 0x4014
#define CL_CONTEXT_D3D10_PREFER_SHARED_RESOURCES_KHR 0x402C
/* cl_mem_info */
#define CL_MEM_D3D10_RESOURCE_KHR 0x4015
/* cl_image_info */
#define CL_IMAGE_D3D10_SUBRESOURCE_KHR 0x4016
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D10_OBJECTS_KHR 0x4017
#define CL_COMMAND_RELEASE_D3D10_OBJECTS_KHR 0x4018
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D10KHR_fn)(
cl_platform_id platform,
cl_d3d10_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d10_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D10Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D10Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D10ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_0;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D10_H */

View File

@@ -1,131 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_D3D11_H
#define __OPENCL_CL_D3D11_H
#include <d3d11.h>
#include "cl.h"
#include "cl_platform.h"
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************
* cl_khr_d3d11_sharing */
#define cl_khr_d3d11_sharing 1
typedef cl_uint cl_d3d11_device_source_khr;
typedef cl_uint cl_d3d11_device_set_khr;
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_D3D11_DEVICE_KHR -1006
#define CL_INVALID_D3D11_RESOURCE_KHR -1007
#define CL_D3D11_RESOURCE_ALREADY_ACQUIRED_KHR -1008
#define CL_D3D11_RESOURCE_NOT_ACQUIRED_KHR -1009
/* cl_d3d11_device_source */
#define CL_D3D11_DEVICE_KHR 0x4019
#define CL_D3D11_DXGI_ADAPTER_KHR 0x401A
/* cl_d3d11_device_set */
#define CL_PREFERRED_DEVICES_FOR_D3D11_KHR 0x401B
#define CL_ALL_DEVICES_FOR_D3D11_KHR 0x401C
/* cl_context_info */
#define CL_CONTEXT_D3D11_DEVICE_KHR 0x401D
#define CL_CONTEXT_D3D11_PREFER_SHARED_RESOURCES_KHR 0x402D
/* cl_mem_info */
#define CL_MEM_D3D11_RESOURCE_KHR 0x401E
/* cl_image_info */
#define CL_IMAGE_D3D11_SUBRESOURCE_KHR 0x401F
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_D3D11_OBJECTS_KHR 0x4020
#define CL_COMMAND_RELEASE_D3D11_OBJECTS_KHR 0x4021
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromD3D11KHR_fn)(
cl_platform_id platform,
cl_d3d11_device_source_khr d3d_device_source,
void * d3d_object,
cl_d3d11_device_set_khr d3d_device_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11BufferKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Buffer * resource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture2DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture2D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromD3D11Texture3DKHR_fn)(
cl_context context,
cl_mem_flags flags,
ID3D11Texture3D * resource,
UINT subresource,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseD3D11ObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_D3D11_H */

View File

@@ -1,132 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_H
#include "cl.h"
#include "cl_platform.h"
#ifdef __cplusplus
extern "C" {
#endif
/******************************************************************************/
/* cl_khr_dx9_media_sharing */
#define cl_khr_dx9_media_sharing 1
typedef cl_uint cl_dx9_media_adapter_type_khr;
typedef cl_uint cl_dx9_media_adapter_set_khr;
#if defined(_WIN32)
#include <d3d9.h>
typedef struct _cl_dx9_surface_info_khr
{
IDirect3DSurface9 *resource;
HANDLE shared_handle;
} cl_dx9_surface_info_khr;
#endif
/******************************************************************************/
/* Error Codes */
#define CL_INVALID_DX9_MEDIA_ADAPTER_KHR -1010
#define CL_INVALID_DX9_MEDIA_SURFACE_KHR -1011
#define CL_DX9_MEDIA_SURFACE_ALREADY_ACQUIRED_KHR -1012
#define CL_DX9_MEDIA_SURFACE_NOT_ACQUIRED_KHR -1013
/* cl_media_adapter_type_khr */
#define CL_ADAPTER_D3D9_KHR 0x2020
#define CL_ADAPTER_D3D9EX_KHR 0x2021
#define CL_ADAPTER_DXVA_KHR 0x2022
/* cl_media_adapter_set_khr */
#define CL_PREFERRED_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2023
#define CL_ALL_DEVICES_FOR_DX9_MEDIA_ADAPTER_KHR 0x2024
/* cl_context_info */
#define CL_CONTEXT_ADAPTER_D3D9_KHR 0x2025
#define CL_CONTEXT_ADAPTER_D3D9EX_KHR 0x2026
#define CL_CONTEXT_ADAPTER_DXVA_KHR 0x2027
/* cl_mem_info */
#define CL_MEM_DX9_MEDIA_ADAPTER_TYPE_KHR 0x2028
#define CL_MEM_DX9_MEDIA_SURFACE_INFO_KHR 0x2029
/* cl_image_info */
#define CL_IMAGE_DX9_MEDIA_PLANE_KHR 0x202A
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_MEDIA_SURFACES_KHR 0x202B
#define CL_COMMAND_RELEASE_DX9_MEDIA_SURFACES_KHR 0x202C
/******************************************************************************/
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetDeviceIDsFromDX9MediaAdapterKHR_fn)(
cl_platform_id platform,
cl_uint num_media_adapters,
cl_dx9_media_adapter_type_khr * media_adapter_type,
void * media_adapters,
cl_dx9_media_adapter_set_khr media_adapter_set,
cl_uint num_entries,
cl_device_id * devices,
cl_uint * num_devices) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceKHR_fn)(
cl_context context,
cl_mem_flags flags,
cl_dx9_media_adapter_type_khr adapter_type,
void * surface_info,
cl_uint plane,
cl_int * errcode_ret) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9MediaSurfacesKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event) CL_API_SUFFIX__VERSION_1_2;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_H */

View File

@@ -1,182 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2016 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_dx9_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H
#include <CL/cl.h>
#include <CL/cl_platform.h>
#include <d3d9.h>
#include <dxvahd.h>
#include <wtypes.h>
#include <d3d9types.h>
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_dx9_media_sharing extension *
****************************************/
#define cl_intel_dx9_media_sharing 1
typedef cl_uint cl_dx9_device_source_intel;
typedef cl_uint cl_dx9_device_set_intel;
/* error codes */
#define CL_INVALID_DX9_DEVICE_INTEL -1010
#define CL_INVALID_DX9_RESOURCE_INTEL -1011
#define CL_DX9_RESOURCE_ALREADY_ACQUIRED_INTEL -1012
#define CL_DX9_RESOURCE_NOT_ACQUIRED_INTEL -1013
/* cl_dx9_device_source_intel */
#define CL_D3D9_DEVICE_INTEL 0x4022
#define CL_D3D9EX_DEVICE_INTEL 0x4070
#define CL_DXVA_DEVICE_INTEL 0x4071
/* cl_dx9_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_DX9_INTEL 0x4024
#define CL_ALL_DEVICES_FOR_DX9_INTEL 0x4025
/* cl_context_info */
#define CL_CONTEXT_D3D9_DEVICE_INTEL 0x4026
#define CL_CONTEXT_D3D9EX_DEVICE_INTEL 0x4072
#define CL_CONTEXT_DXVA_DEVICE_INTEL 0x4073
/* cl_mem_info */
#define CL_MEM_DX9_RESOURCE_INTEL 0x4027
#define CL_MEM_DX9_SHARED_HANDLE_INTEL 0x4074
/* cl_image_info */
#define CL_IMAGE_DX9_PLANE_INTEL 0x4075
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_DX9_OBJECTS_INTEL 0x402A
#define CL_COMMAND_RELEASE_DX9_OBJECTS_INTEL 0x402B
/******************************************************************************/
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromDX9INTEL(
cl_platform_id /* platform */,
cl_dx9_device_source_intel /* dx9_device_source */,
void* /* dx9_object */,
cl_dx9_device_set_intel /* dx9_device_set */,
cl_uint /* num_entries */,
cl_device_id* /* devices */,
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL* clGetDeviceIDsFromDX9INTEL_fn)(
cl_platform_id /* platform */,
cl_dx9_device_source_intel /* dx9_device_source */,
void* /* dx9_object */,
cl_dx9_device_set_intel /* dx9_device_set */,
cl_uint /* num_entries */,
cl_device_id* /* devices */,
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromDX9MediaSurfaceINTEL(
cl_context /* context */,
cl_mem_flags /* flags */,
IDirect3DSurface9* /* resource */,
HANDLE /* sharedHandle */,
UINT /* plane */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromDX9MediaSurfaceINTEL_fn)(
cl_context /* context */,
cl_mem_flags /* flags */,
IDirect3DSurface9* /* resource */,
HANDLE /* sharedHandle */,
UINT /* plane */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireDX9ObjectsINTEL(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireDX9ObjectsINTEL_fn)(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseDX9ObjectsINTEL(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseDX9ObjectsINTEL_fn)(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_DX9_MEDIA_SHARING_INTEL_H */

View File

@@ -1,136 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
#ifndef __OPENCL_CL_EGL_H
#define __OPENCL_CL_EGL_H
#ifdef __APPLE__
#else
#include "cl.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* Command type for events created with clEnqueueAcquireEGLObjectsKHR */
#define CL_COMMAND_EGL_FENCE_SYNC_OBJECT_KHR 0x202F
#define CL_COMMAND_ACQUIRE_EGL_OBJECTS_KHR 0x202D
#define CL_COMMAND_RELEASE_EGL_OBJECTS_KHR 0x202E
/* Error type for clCreateFromEGLImageKHR */
#define CL_INVALID_EGL_OBJECT_KHR -1093
#define CL_EGL_RESOURCE_NOT_ACQUIRED_KHR -1092
/* CLeglImageKHR is an opaque handle to an EGLImage */
typedef void* CLeglImageKHR;
/* CLeglDisplayKHR is an opaque handle to an EGLDisplay */
typedef void* CLeglDisplayKHR;
/* CLeglSyncKHR is an opaque handle to an EGLSync object */
typedef void* CLeglSyncKHR;
/* properties passed to clCreateFromEGLImageKHR */
typedef intptr_t cl_egl_image_properties_khr;
#define cl_khr_egl_image 1
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromEGLImageKHR(cl_context /* context */,
CLeglDisplayKHR /* egldisplay */,
CLeglImageKHR /* eglimage */,
cl_mem_flags /* flags */,
const cl_egl_image_properties_khr * /* properties */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_mem (CL_API_CALL *clCreateFromEGLImageKHR_fn)(
cl_context context,
CLeglDisplayKHR egldisplay,
CLeglImageKHR eglimage,
cl_mem_flags flags,
const cl_egl_image_properties_khr * properties,
cl_int * errcode_ret);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireEGLObjectsKHR(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseEGLObjectsKHR(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseEGLObjectsKHR_fn)(
cl_command_queue command_queue,
cl_uint num_objects,
const cl_mem * mem_objects,
cl_uint num_events_in_wait_list,
const cl_event * event_wait_list,
cl_event * event);
#define cl_khr_egl_event 1
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromEGLSyncKHR(cl_context /* context */,
CLeglSyncKHR /* sync */,
CLeglDisplayKHR /* display */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_event (CL_API_CALL *clCreateEventFromEGLSyncKHR_fn)(
cl_context context,
CLeglSyncKHR sync,
CLeglDisplayKHR display,
cl_int * errcode_ret);
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_EGL_H */

View File

@@ -1,670 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11928 $ on $Date: 2010-07-13 09:04:56 -0700 (Tue, 13 Jul 2010) $ */
/* cl_ext.h contains OpenCL extensions which don't have external */
/* (OpenGL, D3D) dependencies. */
#ifndef __CL_EXT_H
#define __CL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <AvailabilityMacros.h>
#else
#include "cl.h"
#endif
/* cl_khr_fp64 extension - no extension #define since it has no functions */
#define CL_DEVICE_DOUBLE_FP_CONFIG 0x1032
/* cl_khr_fp16 extension - no extension #define since it has no functions */
#define CL_DEVICE_HALF_FP_CONFIG 0x1033
/* Memory object destruction
*
 * Apple extension used to manage externally allocated buffers that back cl_mem objects created with CL_MEM_USE_HOST_PTR
*
* Registers a user callback function that will be called when the memory object is deleted and its resources
* freed. Each call to clSetMemObjectCallbackFn registers the specified user callback function on a callback
* stack associated with memobj. The registered user callback functions are called in the reverse order in
* which they were registered. The user callback functions are called and then the memory object is deleted
* and its resources freed. This provides a mechanism for the application (and libraries) using memobj to be
* notified when the memory referenced by host_ptr, specified when the memory object is created and used as
* the storage bits for the memory object, can be reused or freed.
*
 * The application may not call CL APIs with the cl_mem object passed to pfn_notify.
*
* Please check for the "cl_APPLE_SetMemObjectDestructor" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*/
#define cl_APPLE_SetMemObjectDestructor 1
cl_int CL_API_ENTRY clSetMemObjectDestructorAPPLE( cl_mem /* memobj */,
void (* /*pfn_notify*/)( cl_mem /* memobj */, void* /*user_data*/),
void * /*user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
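/*
 * Usage sketch (illustrative addition, not part of the original header):
 * register a destructor so the host allocation backing a CL_MEM_USE_HOST_PTR
 * buffer can be reclaimed once the cl_mem is deleted. The helper name is an
 * assumption; free() comes from <stdlib.h>.
 */
static void example_host_ptr_released(cl_mem memobj, void *user_data)
{
    free(user_data); /* user_data carries the host_ptr given to clCreateBuffer */
}
/* After creating buf with CL_MEM_USE_HOST_PTR and host_ptr:
 *   clSetMemObjectDestructorAPPLE(buf, example_host_ptr_released, host_ptr);
 */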
/* Context Logging Functions
*
* The next three convenience functions are intended to be used as the pfn_notify parameter to clCreateContext().
* Please check for the "cl_APPLE_ContextLoggingFunctions" extension using clGetDeviceInfo(CL_DEVICE_EXTENSIONS)
* before using.
*
 * clLogMessagesToSystemLog forwards all log messages to the Apple System Logger
*/
#define cl_APPLE_ContextLoggingFunctions 1
extern void CL_API_ENTRY clLogMessagesToSystemLogAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStdout sends all log messages to the file descriptor stdout */
extern void CL_API_ENTRY clLogMessagesToStdoutAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
/* clLogMessagesToStderr sends all log messages to the file descriptor stderr */
extern void CL_API_ENTRY clLogMessagesToStderrAPPLE( const char * /* errstr */,
const void * /* private_info */,
size_t /* cb */,
void * /* user_data */ ) CL_EXT_SUFFIX__VERSION_1_0;
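/*
 * Usage sketch (illustrative addition, not part of the original header):
 * these helpers match the pfn_notify signature of clCreateContext, so they
 * can be passed directly -- assuming, as on Apple targets, that the calling
 * conventions line up. The helper name is an assumption.
 */
static cl_context example_logging_context(const cl_context_properties *props,
                                          cl_device_id dev, cl_int *err)
{
    /* All context error messages go to stderr; user_data is unused here. */
    return clCreateContext(props, 1, &dev, clLogMessagesToStderrAPPLE,
                           NULL, err);
}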
/************************
* cl_khr_icd extension *
************************/
#define cl_khr_icd 1
/* cl_platform_info */
#define CL_PLATFORM_ICD_SUFFIX_KHR 0x0920
/* Additional Error Codes */
#define CL_PLATFORM_NOT_FOUND_KHR -1001
extern CL_API_ENTRY cl_int CL_API_CALL
clIcdGetPlatformIDsKHR(cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
typedef CL_API_ENTRY cl_int (CL_API_CALL *clIcdGetPlatformIDsKHR_fn)(
cl_uint /* num_entries */,
cl_platform_id * /* platforms */,
cl_uint * /* num_platforms */);
/* Extension: cl_khr_image2D_buffer
*
* This extension allows a 2D image to be created from a cl_mem buffer without a copy.
* The type associated with a 2D image created from a buffer in an OpenCL program is image2d_t.
* Both the sampler and sampler-less read_image built-in functions are supported for 2D images
* and 2D images created from a buffer. Similarly, the write_image built-ins are also supported
* for 2D images created from a buffer.
*
 * When a 2D image is created from a buffer, the client must specify the width,
 * height, image format (i.e. channel order and channel data type) and, optionally, the row pitch.
*
* The pitch specified must be a multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT pixels.
* The base address of the buffer must be aligned to CL_DEVICE_IMAGE_BASE_ADDRESS_ALIGNMENT pixels.
*/
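/*
 * Usage sketch (illustrative addition, not part of the original header),
 * assuming the OpenCL 1.2 cl_image_desc whose 'buffer' field carries the
 * source cl_mem. The helper name and format choice are assumptions; the
 * pitch and base address must satisfy the alignment limits quoted above.
 */
static cl_mem example_image2d_from_buffer(cl_context ctx, cl_mem buf,
                                          size_t w, size_t h, size_t pitch,
                                          cl_int *err)
{
    cl_image_format fmt = { CL_RGBA, CL_UNORM_INT8 };
    cl_image_desc desc = { 0 };
    desc.image_type = CL_MEM_OBJECT_IMAGE2D;
    desc.image_width = w;
    desc.image_height = h;
    desc.image_row_pitch = pitch; /* multiple of CL_DEVICE_IMAGE_PITCH_ALIGNMENT */
    desc.buffer = buf;            /* reuse the buffer's storage; no copy is made */
    return clCreateImage(ctx, CL_MEM_READ_WRITE, &fmt, &desc, NULL, err);
}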
/**************************************
 * cl_khr_initialize_memory extension *
 **************************************/
#define CL_CONTEXT_MEMORY_INITIALIZE_KHR 0x2030
/**************************************
* cl_khr_terminate_context extension *
**************************************/
#define CL_DEVICE_TERMINATE_CAPABILITY_KHR 0x2031
#define CL_CONTEXT_TERMINATE_KHR 0x2032
#define cl_khr_terminate_context 1
extern CL_API_ENTRY cl_int CL_API_CALL clTerminateContextKHR(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clTerminateContextKHR_fn)(cl_context /* context */) CL_EXT_SUFFIX__VERSION_1_2;
/*
* Extension: cl_khr_spir
*
* This extension adds support to create an OpenCL program object from a
* Standard Portable Intermediate Representation (SPIR) instance
*/
#define CL_DEVICE_SPIR_VERSIONS 0x40E0
#define CL_PROGRAM_BINARY_TYPE_INTERMEDIATE 0x40E1
/*****************************************
* cl_khr_create_command_queue extension *
*****************************************/
#define cl_khr_create_command_queue 1
typedef cl_bitfield cl_queue_properties_khr;
extern CL_API_ENTRY cl_command_queue CL_API_CALL
clCreateCommandQueueWithPropertiesKHR( cl_context /* context */,
cl_device_id /* device */,
const cl_queue_properties_khr* /* properties */,
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_command_queue
(CL_API_CALL *clCreateCommandQueueWithPropertiesKHR_fn)( cl_context /* context */,
cl_device_id /* device */,
const cl_queue_properties_khr* /* properties */,
cl_int* /* errcode_ret */ ) CL_EXT_SUFFIX__VERSION_1_2;
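/*
 * Usage sketch (illustrative addition, not part of the original header):
 * the property list is zero-terminated, mirroring the core OpenCL 2.0
 * entry point. The helper name is an assumption.
 */
static cl_command_queue example_profiling_queue(cl_context ctx,
                                                cl_device_id dev, cl_int *err)
{
    cl_queue_properties_khr props[] = {
        CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0
    };
    return clCreateCommandQueueWithPropertiesKHR(ctx, dev, props, err);
}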
/******************************************
* cl_nv_device_attribute_query extension *
******************************************/
/* cl_nv_device_attribute_query extension - no extension #define since it has no functions */
#define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
#define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
#define CL_DEVICE_REGISTERS_PER_BLOCK_NV 0x4002
#define CL_DEVICE_WARP_SIZE_NV 0x4003
#define CL_DEVICE_GPU_OVERLAP_NV 0x4004
#define CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV 0x4005
#define CL_DEVICE_INTEGRATED_MEMORY_NV 0x4006
/*********************************
* cl_amd_device_memory_flags *
*********************************/
#define cl_amd_device_memory_flags 1
#define CL_MEM_USE_PERSISTENT_MEM_AMD (1 << 6) // Alloc from GPU's CPU visible heap
/* cl_device_info */
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
/*********************************
* cl_amd_device_attribute_query *
*********************************/
#define CL_DEVICE_PROFILING_TIMER_OFFSET_AMD 0x4036
#define CL_DEVICE_TOPOLOGY_AMD 0x4037
#define CL_DEVICE_BOARD_NAME_AMD 0x4038
#define CL_DEVICE_GLOBAL_FREE_MEMORY_AMD 0x4039
#define CL_DEVICE_SIMD_PER_COMPUTE_UNIT_AMD 0x4040
#define CL_DEVICE_SIMD_WIDTH_AMD 0x4041
#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#define CL_DEVICE_GLOBAL_MEM_CHANNELS_AMD 0x4044
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANKS_AMD 0x4045
#define CL_DEVICE_GLOBAL_MEM_CHANNEL_BANK_WIDTH_AMD 0x4046
#define CL_DEVICE_LOCAL_MEM_SIZE_PER_COMPUTE_UNIT_AMD 0x4047
#define CL_DEVICE_LOCAL_MEM_BANKS_AMD 0x4048
typedef union
{
struct { cl_uint type; cl_uint data[5]; } raw;
struct { cl_uint type; cl_char unused[17]; cl_char bus; cl_char device; cl_char function; } pcie;
} cl_device_topology_amd;
#define CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD 1
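/*
 * Usage sketch (illustrative addition, not part of the original header):
 * read the PCIe location of an AMD device through the union declared above.
 * The helper name is an assumption.
 */
static cl_int example_amd_pcie_bus(cl_device_id dev, cl_char *bus_out)
{
    cl_device_topology_amd topo;
    cl_int err = clGetDeviceInfo(dev, CL_DEVICE_TOPOLOGY_AMD, sizeof(topo),
                                 &topo, NULL);
    if (err == CL_SUCCESS && topo.raw.type == CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD)
        *bus_out = topo.pcie.bus; /* device/function live alongside bus */
    return err;
}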
/**************************
* cl_amd_offline_devices *
**************************/
#define CL_CONTEXT_OFFLINE_DEVICES_AMD 0x403F
/*********************************
* cl_arm_printf extension
*********************************/
#define CL_PRINTF_CALLBACK_ARM 0x40B0
#define CL_PRINTF_BUFFERSIZE_ARM 0x40B1
#ifdef CL_VERSION_1_1
/***********************************
* cl_ext_device_fission extension *
***********************************/
#define cl_ext_device_fission 1
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clReleaseDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainDeviceEXT( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
(CL_API_CALL *clRetainDeviceEXT_fn)( cl_device_id /*device*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef cl_ulong cl_device_partition_property_ext;
extern CL_API_ENTRY cl_int CL_API_CALL
clCreateSubDevicesEXT( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
typedef CL_API_ENTRY cl_int
( CL_API_CALL * clCreateSubDevicesEXT_fn)( cl_device_id /*in_device*/,
const cl_device_partition_property_ext * /* properties */,
cl_uint /*num_entries*/,
cl_device_id * /*out_devices*/,
cl_uint * /*num_devices*/ ) CL_EXT_SUFFIX__VERSION_1_1;
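/*
 * Usage sketch (illustrative addition, not part of the original header):
 * partition a device into sub-devices of 4 compute units each. The helper
 * name is an assumption; the property tokens and the
 * CL_PROPERTIES_LIST_END_EXT terminator are defined just below.
 */
static cl_int example_fission(cl_device_id dev, cl_device_id *out, cl_uint n)
{
    cl_device_partition_property_ext props[] = {
        CL_DEVICE_PARTITION_EQUALLY_EXT, 4, CL_PROPERTIES_LIST_END_EXT
    };
    cl_uint num = 0;
    return clCreateSubDevicesEXT(dev, props, n, out, &num);
}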
/* cl_device_partition_property_ext */
#define CL_DEVICE_PARTITION_EQUALLY_EXT 0x4050
#define CL_DEVICE_PARTITION_BY_COUNTS_EXT 0x4051
#define CL_DEVICE_PARTITION_BY_NAMES_EXT 0x4052
#define CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN_EXT 0x4053
/* clDeviceGetInfo selectors */
#define CL_DEVICE_PARENT_DEVICE_EXT 0x4054
#define CL_DEVICE_PARTITION_TYPES_EXT 0x4055
#define CL_DEVICE_AFFINITY_DOMAINS_EXT 0x4056
#define CL_DEVICE_REFERENCE_COUNT_EXT 0x4057
#define CL_DEVICE_PARTITION_STYLE_EXT 0x4058
/* error codes */
#define CL_DEVICE_PARTITION_FAILED_EXT -1057
#define CL_INVALID_PARTITION_COUNT_EXT -1058
#define CL_INVALID_PARTITION_NAME_EXT -1059
/* CL_AFFINITY_DOMAINs */
#define CL_AFFINITY_DOMAIN_L1_CACHE_EXT 0x1
#define CL_AFFINITY_DOMAIN_L2_CACHE_EXT 0x2
#define CL_AFFINITY_DOMAIN_L3_CACHE_EXT 0x3
#define CL_AFFINITY_DOMAIN_L4_CACHE_EXT 0x4
#define CL_AFFINITY_DOMAIN_NUMA_EXT 0x10
#define CL_AFFINITY_DOMAIN_NEXT_FISSIONABLE_EXT 0x100
/* cl_device_partition_property_ext list terminators */
#define CL_PROPERTIES_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_COUNTS_LIST_END_EXT ((cl_device_partition_property_ext) 0)
#define CL_PARTITION_BY_NAMES_LIST_END_EXT ((cl_device_partition_property_ext) 0 - 1)
/* cl_ext_atomic_counters_32 and cl_ext_atomic_counters_64 extensions
* no extension #define since they have no functions
*/
#define CL_DEVICE_MAX_ATOMIC_COUNTERS_EXT 0x4032
/*********************************
* cl_qcom_ext_host_ptr extension
*********************************/
#define CL_MEM_EXT_HOST_PTR_QCOM (1 << 29)
#define CL_DEVICE_EXT_MEM_PADDING_IN_BYTES_QCOM 0x40A0
#define CL_DEVICE_PAGE_SIZE_QCOM 0x40A1
#define CL_IMAGE_ROW_ALIGNMENT_QCOM 0x40A2
#define CL_IMAGE_SLICE_ALIGNMENT_QCOM 0x40A3
#define CL_MEM_HOST_UNCACHED_QCOM 0x40A4
#define CL_MEM_HOST_WRITEBACK_QCOM 0x40A5
#define CL_MEM_HOST_WRITETHROUGH_QCOM 0x40A6
#define CL_MEM_HOST_WRITE_COMBINING_QCOM 0x40A7
typedef cl_uint cl_image_pitch_info_qcom;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceImageInfoQCOM(cl_device_id device,
size_t image_width,
size_t image_height,
const cl_image_format *image_format,
cl_image_pitch_info_qcom param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
typedef struct _cl_mem_ext_host_ptr
{
/* Type of external memory allocation. */
/* Legal values will be defined in layered extensions. */
cl_uint allocation_type;
/* Host cache policy for this external memory allocation. */
cl_uint host_cache_policy;
} cl_mem_ext_host_ptr;
/*********************************
* cl_qcom_ion_host_ptr extension
*********************************/
#define CL_MEM_ION_HOST_PTR_QCOM 0x40A8
typedef struct _cl_mem_ion_host_ptr
{
/* Type of external memory allocation. */
/* Must be CL_MEM_ION_HOST_PTR_QCOM for ION allocations. */
cl_mem_ext_host_ptr ext_host_ptr;
/* ION file descriptor */
int ion_filedesc;
/* Host pointer to the ION allocated memory */
void* ion_hostptr;
} cl_mem_ion_host_ptr;
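/*
 * Usage sketch (illustrative addition, not part of the original header):
 * wrap an ION allocation in a cl_mem. CL_MEM_EXT_HOST_PTR_QCOM tells the
 * runtime that host_ptr points at a cl_mem_ion_host_ptr descriptor rather
 * than at raw storage. The helper name and cache policy are assumptions.
 */
static cl_mem example_ion_buffer(cl_context ctx, int ion_fd, void *ion_va,
                                 size_t size, cl_int *err)
{
    cl_mem_ion_host_ptr ion = { { CL_MEM_ION_HOST_PTR_QCOM,
                                  CL_MEM_HOST_UNCACHED_QCOM },
                                ion_fd, ion_va };
    return clCreateBuffer(ctx, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_HOST_PTR_QCOM,
                          size, &ion, err);
}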
#endif /* CL_VERSION_1_1 */
#if defined(CL_VERSION_1_2)
/******************************************
* cl_img_yuv_image extension *
******************************************/
/* Image formats used in clCreateImage */
#define CL_NV21_IMG 0x40D0
#define CL_YV12_IMG 0x40D1
/******************************************
* cl_img_cached_allocations extension *
******************************************/
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_UNCACHED_CPU_MEMORY_IMG (1 << 26)
#define CL_MEM_USE_CACHED_CPU_MEMORY_IMG (1 << 27)
/******************************************
* cl_img_use_gralloc_ptr extension *
******************************************/
/* Flag values used by clCreateBuffer */
#define CL_MEM_USE_GRALLOC_PTR_IMG (1 << 28)
/* To be used by clGetEventInfo: */
#define CL_COMMAND_ACQUIRE_GRALLOC_OBJECTS_IMG 0x40D2
#define CL_COMMAND_RELEASE_GRALLOC_OBJECTS_IMG 0x40D3
/* Error code from clEnqueueReleaseGrallocObjectsIMG */
#define CL_GRALLOC_RESOURCE_NOT_ACQUIRED_IMG 0x40D4
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGrallocObjectsIMG(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGrallocObjectsIMG(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
#endif /* CL_VERSION_1_2 */
#ifdef CL_VERSION_2_0
/*********************************
* cl_khr_subgroups extension
*********************************/
#define cl_khr_subgroups 1
/* cl_kernel_sub_group_info is declared in CL.h. */
/* cl_kernel_sub_group_info */
#define CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR 0x2033
#define CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR 0x2034
extern CL_API_ENTRY cl_int CL_API_CALL
clGetKernelSubGroupInfoKHR(cl_kernel /* in_kernel */,
cl_device_id /*in_device*/,
cl_kernel_sub_group_info /* param_name */,
size_t /*input_value_size*/,
const void * /*input_value*/,
size_t /*param_value_size*/,
void* /*param_value*/,
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
typedef CL_API_ENTRY cl_int
( CL_API_CALL * clGetKernelSubGroupInfoKHR_fn)(cl_kernel /* in_kernel */,
cl_device_id /*in_device*/,
cl_kernel_sub_group_info /* param_name */,
size_t /*input_value_size*/,
const void * /*input_value*/,
size_t /*param_value_size*/,
void* /*param_value*/,
size_t* /*param_value_size_ret*/ ) CL_EXT_SUFFIX__VERSION_2_0_DEPRECATED;
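/*
 * Usage sketch (illustrative addition, not part of the original header):
 * query the maximum sub-group size a kernel would see for a candidate 1D
 * local work size. The helper name is an assumption.
 */
static size_t example_max_subgroup_size(cl_kernel k, cl_device_id dev,
                                        size_t local_size)
{
    size_t max_sg = 0;
    clGetKernelSubGroupInfoKHR(k, dev,
                               CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR,
                               sizeof(local_size), &local_size,
                               sizeof(max_sg), &max_sg, NULL);
    return max_sg;
}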
#endif /* CL_VERSION_2_0 */
#ifdef CL_VERSION_2_1
/*********************************
* cl_khr_priority_hints extension
*********************************/
#define cl_khr_priority_hints 1
typedef cl_uint cl_queue_priority_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_PRIORITY_KHR 0x1096
/* cl_queue_priority_khr */
#define CL_QUEUE_PRIORITY_HIGH_KHR (1<<0)
#define CL_QUEUE_PRIORITY_MED_KHR (1<<1)
#define CL_QUEUE_PRIORITY_LOW_KHR (1<<2)
#endif /* CL_VERSION_2_1 */
#ifdef CL_VERSION_2_1
/*********************************
* cl_khr_throttle_hints extension
*********************************/
#define cl_khr_throttle_hints 1
typedef cl_uint cl_queue_throttle_khr;
/* cl_command_queue_properties */
#define CL_QUEUE_THROTTLE_KHR 0x1097
/* cl_queue_throttle_khr */
#define CL_QUEUE_THROTTLE_HIGH_KHR (1<<0)
#define CL_QUEUE_THROTTLE_MED_KHR (1<<1)
#define CL_QUEUE_THROTTLE_LOW_KHR (1<<2)
#endif /* CL_VERSION_2_1 */
#ifdef CL_VERSION_2_2
/*********************************
* cl_khr_subgroup_named_barrier
*********************************/
#define cl_khr_subgroup_named_barrier 1
/* cl_device_info */
#define CL_DEVICE_MAX_NAMED_BARRIER_COUNT_KHR 0x2035
#endif /* CL_VERSION_2_2 */
/**********************************
* cl_arm_import_memory extension *
**********************************/
#ifdef CL_VERSION_1_0
typedef intptr_t cl_import_properties_arm;
/* Default and valid property names for cl_arm_import_memory */
#define CL_IMPORT_TYPE_ARM 0x40B2
/* Host process memory type default value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_HOST_ARM 0x40B3
/* DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_DMA_BUF_ARM 0x40B4
/* Secure DMA BUF memory type value for CL_IMPORT_TYPE_ARM property */
#define CL_IMPORT_TYPE_SECURE_ARM 0x40B5
/* This extension adds a new function that allows for direct memory import into
* OpenCL via the clImportMemoryARM function.
*
* Memory imported through this interface will be mapped into the device's page
 * tables directly, providing zero-copy access. It will never fall back to
 * copy operations or aliased buffers.
*
* Types of memory supported for import are specified as additional extension
* strings.
*
* This extension produces cl_mem allocations which are compatible with all other
* users of cl_mem in the standard API.
*
* This extension maps pages with the same properties as the normal buffer creation
* function clCreateBuffer.
*/
extern CL_API_ENTRY cl_mem CL_API_CALL
clImportMemoryARM( cl_context context,
cl_mem_flags flags,
const cl_import_properties_arm *properties,
void *memory,
size_t size,
cl_int *errcode_ret) CL_EXT_SUFFIX__VERSION_1_0;
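/*
 * Usage sketch (illustrative addition, not part of the original header):
 * import a dma_buf into OpenCL. The helper name is an assumption, and for
 * CL_IMPORT_TYPE_DMA_BUF_ARM the 'memory' argument is assumed to point at
 * the file descriptor, per the extension specification.
 */
static cl_mem example_import_dma_buf(cl_context ctx, int dma_buf_fd,
                                     size_t size, cl_int *err)
{
    const cl_import_properties_arm props[] = {
        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM, 0
    };
    return clImportMemoryARM(ctx, CL_MEM_READ_WRITE, props, &dma_buf_fd,
                             size, err);
}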
#endif /* CL_VERSION_1_0 */
/******************************************
* cl_arm_shared_virtual_memory extension *
******************************************/
#ifdef CL_VERSION_1_2
/* Used by clGetDeviceInfo */
#define CL_DEVICE_SVM_CAPABILITIES_ARM 0x40B6
/* Used by clGetMemObjectInfo */
#define CL_MEM_USES_SVM_POINTER_ARM 0x40B7
/* Used by clSetKernelExecInfoARM: */
#define CL_KERNEL_EXEC_INFO_SVM_PTRS_ARM 0x40B8
#define CL_KERNEL_EXEC_INFO_SVM_FINE_GRAIN_SYSTEM_ARM 0x40B9
/* To be used by clGetEventInfo: */
#define CL_COMMAND_SVM_FREE_ARM 0x40BA
#define CL_COMMAND_SVM_MEMCPY_ARM 0x40BB
#define CL_COMMAND_SVM_MEMFILL_ARM 0x40BC
#define CL_COMMAND_SVM_MAP_ARM 0x40BD
#define CL_COMMAND_SVM_UNMAP_ARM 0x40BE
/* Flag values returned by clGetDeviceInfo with CL_DEVICE_SVM_CAPABILITIES_ARM as the param_name. */
#define CL_DEVICE_SVM_COARSE_GRAIN_BUFFER_ARM (1 << 0)
#define CL_DEVICE_SVM_FINE_GRAIN_BUFFER_ARM (1 << 1)
#define CL_DEVICE_SVM_FINE_GRAIN_SYSTEM_ARM (1 << 2)
#define CL_DEVICE_SVM_ATOMICS_ARM (1 << 3)
/* Flag values used by clSVMAllocARM: */
#define CL_MEM_SVM_FINE_GRAIN_BUFFER_ARM (1 << 10)
#define CL_MEM_SVM_ATOMICS_ARM (1 << 11)
typedef cl_bitfield cl_svm_mem_flags_arm;
typedef cl_uint cl_kernel_exec_info_arm;
typedef cl_bitfield cl_device_svm_capabilities_arm;
extern CL_API_ENTRY void * CL_API_CALL
clSVMAllocARM(cl_context /* context */,
cl_svm_mem_flags_arm /* flags */,
size_t /* size */,
cl_uint /* alignment */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY void CL_API_CALL
clSVMFreeARM(cl_context /* context */,
void * /* svm_pointer */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMFreeARM(cl_command_queue /* command_queue */,
cl_uint /* num_svm_pointers */,
void *[] /* svm_pointers[] */,
void (CL_CALLBACK * /*pfn_free_func*/)(cl_command_queue /* queue */,
cl_uint /* num_svm_pointers */,
void *[] /* svm_pointers[] */,
void * /* user_data */),
void * /* user_data */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemcpyARM(cl_command_queue /* command_queue */,
cl_bool /* blocking_copy */,
void * /* dst_ptr */,
const void * /* src_ptr */,
size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMemFillARM(cl_command_queue /* command_queue */,
void * /* svm_ptr */,
const void * /* pattern */,
size_t /* pattern_size */,
size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMMapARM(cl_command_queue /* command_queue */,
cl_bool /* blocking_map */,
cl_map_flags /* flags */,
void * /* svm_ptr */,
size_t /* size */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueSVMUnmapARM(cl_command_queue /* command_queue */,
void * /* svm_ptr */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelArgSVMPointerARM(cl_kernel /* kernel */,
cl_uint /* arg_index */,
const void * /* arg_value */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clSetKernelExecInfoARM(cl_kernel /* kernel */,
cl_kernel_exec_info_arm /* param_name */,
size_t /* param_value_size */,
const void * /* param_value */) CL_EXT_SUFFIX__VERSION_1_2;
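/*
 * Usage sketch (illustrative addition, not part of the original header):
 * allocate coarse-grain SVM, hand it to a kernel argument, and free it
 * again. The helper name is an assumption.
 */
static cl_int example_svm_arm(cl_context ctx, cl_kernel k, size_t size)
{
    void *p = clSVMAllocARM(ctx, CL_MEM_READ_WRITE, size, 0);
    if (!p) return CL_OUT_OF_RESOURCES;
    cl_int err = clSetKernelArgSVMPointerARM(k, 0, p);
    /* ... enqueue the kernel, then map/unmap for host access ... */
    clSVMFreeARM(ctx, p);
    return err;
}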
#endif /* CL_VERSION_1_2 */
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_H */

View File

@@ -1,429 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2017 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2017 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_ext_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __CL_EXT_INTEL_H
#define __CL_EXT_INTEL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_platform.h>
#else
#include "cl.h"
#include "cl_platform.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
/***************************************
* cl_intel_thread_local_exec extension *
****************************************/
#define cl_intel_thread_local_exec 1
#define CL_QUEUE_THREAD_LOCAL_EXEC_ENABLE_INTEL (((cl_bitfield)1) << 31)
/***********************************************
* cl_intel_device_partition_by_names extension *
************************************************/
#define cl_intel_device_partition_by_names 1
#define CL_DEVICE_PARTITION_BY_NAMES_INTEL 0x4052
#define CL_PARTITION_BY_NAMES_LIST_END_INTEL -1
/************************************************
* cl_intel_accelerator extension *
* cl_intel_motion_estimation extension *
* cl_intel_advanced_motion_estimation extension *
*************************************************/
#define cl_intel_accelerator 1
#define cl_intel_motion_estimation 1
#define cl_intel_advanced_motion_estimation 1
typedef struct _cl_accelerator_intel* cl_accelerator_intel;
typedef cl_uint cl_accelerator_type_intel;
typedef cl_uint cl_accelerator_info_intel;
typedef struct _cl_motion_estimation_desc_intel {
cl_uint mb_block_type;
cl_uint subpixel_mode;
cl_uint sad_adjust_mode;
cl_uint search_path_type;
} cl_motion_estimation_desc_intel;
/* error codes */
#define CL_INVALID_ACCELERATOR_INTEL -1094
#define CL_INVALID_ACCELERATOR_TYPE_INTEL -1095
#define CL_INVALID_ACCELERATOR_DESCRIPTOR_INTEL -1096
#define CL_ACCELERATOR_TYPE_NOT_SUPPORTED_INTEL -1097
/* cl_accelerator_type_intel */
#define CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL 0x0
/* cl_accelerator_info_intel */
#define CL_ACCELERATOR_DESCRIPTOR_INTEL 0x4090
#define CL_ACCELERATOR_REFERENCE_COUNT_INTEL 0x4091
#define CL_ACCELERATOR_CONTEXT_INTEL 0x4092
#define CL_ACCELERATOR_TYPE_INTEL 0x4093
/* cl_motion_detect_desc_intel flags */
#define CL_ME_MB_TYPE_16x16_INTEL 0x0
#define CL_ME_MB_TYPE_8x8_INTEL 0x1
#define CL_ME_MB_TYPE_4x4_INTEL 0x2
#define CL_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_ME_SUBPIXEL_MODE_QPEL_INTEL 0x2
#define CL_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_2_2_INTEL 0x0
#define CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL 0x1
#define CL_ME_SEARCH_PATH_RADIUS_16_12_INTEL 0x5
#define CL_ME_SKIP_BLOCK_TYPE_16x16_INTEL 0x0
#define CL_ME_CHROMA_INTRA_PREDICT_ENABLED_INTEL 0x1
#define CL_ME_LUMA_INTRA_PREDICT_ENABLED_INTEL 0x2
#define CL_ME_SKIP_BLOCK_TYPE_8x8_INTEL 0x4
#define CL_ME_FORWARD_INPUT_MODE_INTEL 0x1
#define CL_ME_BACKWARD_INPUT_MODE_INTEL 0x2
#define CL_ME_BIDIRECTION_INPUT_MODE_INTEL 0x3
#define CL_ME_BIDIR_WEIGHT_QUARTER_INTEL 16
#define CL_ME_BIDIR_WEIGHT_THIRD_INTEL 21
#define CL_ME_BIDIR_WEIGHT_HALF_INTEL 32
#define CL_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 43
#define CL_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 48
#define CL_ME_COST_PENALTY_NONE_INTEL 0x0
#define CL_ME_COST_PENALTY_LOW_INTEL 0x1
#define CL_ME_COST_PENALTY_NORMAL_INTEL 0x2
#define CL_ME_COST_PENALTY_HIGH_INTEL 0x3
#define CL_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
/* cl_device_info */
#define CL_DEVICE_ME_VERSION_INTEL 0x407E
#define CL_ME_VERSION_LEGACY_INTEL 0x0
#define CL_ME_VERSION_ADVANCED_VER_1_INTEL 0x1
#define CL_ME_VERSION_ADVANCED_VER_2_INTEL 0x2
extern CL_API_ENTRY cl_accelerator_intel CL_API_CALL
clCreateAcceleratorINTEL(
cl_context /* context */,
cl_accelerator_type_intel /* accelerator_type */,
size_t /* descriptor_size */,
const void* /* descriptor */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_accelerator_intel (CL_API_CALL *clCreateAcceleratorINTEL_fn)(
cl_context /* context */,
cl_accelerator_type_intel /* accelerator_type */,
size_t /* descriptor_size */,
const void* /* descriptor */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetAcceleratorInfoINTEL(
cl_accelerator_intel /* accelerator */,
cl_accelerator_info_intel /* param_name */,
size_t /* param_value_size */,
void* /* param_value */,
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetAcceleratorInfoINTEL_fn)(
cl_accelerator_intel /* accelerator */,
cl_accelerator_info_intel /* param_name */,
size_t /* param_value_size */,
void* /* param_value */,
size_t* /* param_value_size_ret */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clRetainAcceleratorINTEL(
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clRetainAcceleratorINTEL_fn)(
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clReleaseAcceleratorINTEL(
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clReleaseAcceleratorINTEL_fn)(
cl_accelerator_intel /* accelerator */) CL_EXT_SUFFIX__VERSION_1_2;
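/*
 * Usage sketch (illustrative addition, not part of the original header):
 * create a motion-estimation accelerator from the descriptor type declared
 * above; release it later with clReleaseAcceleratorINTEL. The helper name
 * and the descriptor values are assumptions.
 */
static cl_accelerator_intel example_me_accelerator(cl_context ctx, cl_int *err)
{
    cl_motion_estimation_desc_intel desc = {
        CL_ME_MB_TYPE_16x16_INTEL,
        CL_ME_SUBPIXEL_MODE_INTEGER_INTEL,
        CL_ME_SAD_ADJUST_MODE_NONE_INTEL,
        CL_ME_SEARCH_PATH_RADIUS_4_4_INTEL
    };
    return clCreateAcceleratorINTEL(ctx,
                                    CL_ACCELERATOR_TYPE_MOTION_ESTIMATION_INTEL,
                                    sizeof(desc), &desc, err);
}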
/******************************************
* cl_intel_simultaneous_sharing extension *
*******************************************/
#define cl_intel_simultaneous_sharing 1
#define CL_DEVICE_SIMULTANEOUS_INTEROPS_INTEL 0x4104
#define CL_DEVICE_NUM_SIMULTANEOUS_INTEROPS_INTEL 0x4105
/***********************************
* cl_intel_egl_image_yuv extension *
************************************/
#define cl_intel_egl_image_yuv 1
#define CL_EGL_YUV_PLANE_INTEL 0x4107
/********************************
* cl_intel_packed_yuv extension *
*********************************/
#define cl_intel_packed_yuv 1
#define CL_YUYV_INTEL 0x4076
#define CL_UYVY_INTEL 0x4077
#define CL_YVYU_INTEL 0x4078
#define CL_VYUY_INTEL 0x4079
/********************************************
* cl_intel_required_subgroup_size extension *
*********************************************/
#define cl_intel_required_subgroup_size 1
#define CL_DEVICE_SUB_GROUP_SIZES_INTEL 0x4108
#define CL_KERNEL_SPILL_MEM_SIZE_INTEL 0x4109
#define CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL 0x410A
/****************************************
* cl_intel_driver_diagnostics extension *
*****************************************/
#define cl_intel_driver_diagnostics 1
typedef cl_uint cl_diagnostics_verbose_level;
#define CL_CONTEXT_SHOW_DIAGNOSTICS_INTEL 0x4106
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_ALL_INTEL ( 0xff )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_GOOD_INTEL ( 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL ( 1 << 1 )
#define CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL ( 1 << 2 )
/********************************
* cl_intel_planar_yuv extension *
*********************************/
#define CL_NV12_INTEL 0x410E
#define CL_MEM_NO_ACCESS_INTEL ( 1 << 24 )
#define CL_MEM_ACCESS_FLAGS_UNRESTRICTED_INTEL ( 1 << 25 )
#define CL_DEVICE_PLANAR_YUV_MAX_WIDTH_INTEL 0x417E
#define CL_DEVICE_PLANAR_YUV_MAX_HEIGHT_INTEL 0x417F
/*******************************************************
* cl_intel_device_side_avc_motion_estimation extension *
********************************************************/
#define CL_DEVICE_AVC_ME_VERSION_INTEL 0x410B
#define CL_DEVICE_AVC_ME_SUPPORTS_TEXTURE_SAMPLER_USE_INTEL 0x410C
#define CL_DEVICE_AVC_ME_SUPPORTS_PREEMPTION_INTEL 0x410D
#define CL_AVC_ME_VERSION_0_INTEL 0x0 // No support.
#define CL_AVC_ME_VERSION_1_INTEL 0x1 // First supported version.
#define CL_AVC_ME_MAJOR_16x16_INTEL 0x0
#define CL_AVC_ME_MAJOR_16x8_INTEL 0x1
#define CL_AVC_ME_MAJOR_8x16_INTEL 0x2
#define CL_AVC_ME_MAJOR_8x8_INTEL 0x3
#define CL_AVC_ME_MINOR_8x8_INTEL 0x0
#define CL_AVC_ME_MINOR_8x4_INTEL 0x1
#define CL_AVC_ME_MINOR_4x8_INTEL 0x2
#define CL_AVC_ME_MINOR_4x4_INTEL 0x3
#define CL_AVC_ME_MAJOR_FORWARD_INTEL 0x0
#define CL_AVC_ME_MAJOR_BACKWARD_INTEL 0x1
#define CL_AVC_ME_MAJOR_BIDIRECTIONAL_INTEL 0x2
#define CL_AVC_ME_PARTITION_MASK_ALL_INTEL 0x0
#define CL_AVC_ME_PARTITION_MASK_16x16_INTEL 0x7E
#define CL_AVC_ME_PARTITION_MASK_16x8_INTEL 0x7D
#define CL_AVC_ME_PARTITION_MASK_8x16_INTEL 0x7B
#define CL_AVC_ME_PARTITION_MASK_8x8_INTEL 0x77
#define CL_AVC_ME_PARTITION_MASK_8x4_INTEL 0x6F
#define CL_AVC_ME_PARTITION_MASK_4x8_INTEL 0x5F
#define CL_AVC_ME_PARTITION_MASK_4x4_INTEL 0x3F
#define CL_AVC_ME_SEARCH_WINDOW_EXHAUSTIVE_INTEL 0x0
#define CL_AVC_ME_SEARCH_WINDOW_SMALL_INTEL 0x1
#define CL_AVC_ME_SEARCH_WINDOW_TINY_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_EXTRA_TINY_INTEL 0x3
#define CL_AVC_ME_SEARCH_WINDOW_DIAMOND_INTEL 0x4
#define CL_AVC_ME_SEARCH_WINDOW_LARGE_DIAMOND_INTEL 0x5
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED0_INTEL 0x6
#define CL_AVC_ME_SEARCH_WINDOW_RESERVED1_INTEL 0x7
#define CL_AVC_ME_SEARCH_WINDOW_CUSTOM_INTEL 0x8
#define CL_AVC_ME_SEARCH_WINDOW_16x12_RADIUS_INTEL 0x9
#define CL_AVC_ME_SEARCH_WINDOW_4x4_RADIUS_INTEL 0x2
#define CL_AVC_ME_SEARCH_WINDOW_2x2_RADIUS_INTEL 0xa
#define CL_AVC_ME_SAD_ADJUST_MODE_NONE_INTEL 0x0
#define CL_AVC_ME_SAD_ADJUST_MODE_HAAR_INTEL 0x2
#define CL_AVC_ME_SUBPIXEL_MODE_INTEGER_INTEL 0x0
#define CL_AVC_ME_SUBPIXEL_MODE_HPEL_INTEL 0x1
#define CL_AVC_ME_SUBPIXEL_MODE_QPEL_INTEL 0x3
#define CL_AVC_ME_COST_PRECISION_QPEL_INTEL 0x0
#define CL_AVC_ME_COST_PRECISION_HPEL_INTEL 0x1
#define CL_AVC_ME_COST_PRECISION_PEL_INTEL 0x2
#define CL_AVC_ME_COST_PRECISION_DPEL_INTEL 0x3
#define CL_AVC_ME_BIDIR_WEIGHT_QUARTER_INTEL 0x10
#define CL_AVC_ME_BIDIR_WEIGHT_THIRD_INTEL 0x15
#define CL_AVC_ME_BIDIR_WEIGHT_HALF_INTEL 0x20
#define CL_AVC_ME_BIDIR_WEIGHT_TWO_THIRD_INTEL 0x2B
#define CL_AVC_ME_BIDIR_WEIGHT_THREE_QUARTER_INTEL 0x30
#define CL_AVC_ME_BORDER_REACHED_LEFT_INTEL 0x0
#define CL_AVC_ME_BORDER_REACHED_RIGHT_INTEL 0x2
#define CL_AVC_ME_BORDER_REACHED_TOP_INTEL 0x4
#define CL_AVC_ME_BORDER_REACHED_BOTTOM_INTEL 0x8
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_16x16_INTEL 0x0
#define CL_AVC_ME_SKIP_BLOCK_PARTITION_8x8_INTEL 0x4000
#define CL_AVC_ME_SKIP_BLOCK_16x16_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_16x16_DUAL_ENABLE_INTEL ( 0x3 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_FORWARD_ENABLE_INTEL ( 0x55 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_BACKWARD_ENABLE_INTEL ( 0xAA << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_DUAL_ENABLE_INTEL ( 0xFF << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_FORWARD_ENABLE_INTEL ( 0x1 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_0_BACKWARD_ENABLE_INTEL ( 0x2 << 24 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_FORWARD_ENABLE_INTEL ( 0x1 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_1_BACKWARD_ENABLE_INTEL ( 0x2 << 26 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_FORWARD_ENABLE_INTEL ( 0x1 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_2_BACKWARD_ENABLE_INTEL ( 0x2 << 28 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_FORWARD_ENABLE_INTEL ( 0x1 << 30 )
#define CL_AVC_ME_SKIP_BLOCK_8x8_3_BACKWARD_ENABLE_INTEL ( 0x2 << 30 )
#define CL_AVC_ME_BLOCK_BASED_SKIP_4x4_INTEL 0x00
#define CL_AVC_ME_BLOCK_BASED_SKIP_8x8_INTEL 0x80
#define CL_AVC_ME_INTRA_16x16_INTEL 0x0
#define CL_AVC_ME_INTRA_8x8_INTEL 0x1
#define CL_AVC_ME_INTRA_4x4_INTEL 0x2
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_16x16_INTEL 0x6
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_8x8_INTEL 0x5
#define CL_AVC_ME_INTRA_LUMA_PARTITION_MASK_4x4_INTEL 0x3
#define CL_AVC_ME_INTRA_NEIGHBOR_LEFT_MASK_ENABLE_INTEL 0x60
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_MASK_ENABLE_INTEL 0x10
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_RIGHT_MASK_ENABLE_INTEL 0x8
#define CL_AVC_ME_INTRA_NEIGHBOR_UPPER_LEFT_MASK_ENABLE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_INTEL 0x0
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DC_INTEL 0x2
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_LEFT_INTEL 0x3
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_DIAGONAL_DOWN_RIGHT_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_PLANE_INTEL 0x4
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_RIGHT_INTEL 0x5
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_DOWN_INTEL 0x6
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_VERTICAL_LEFT_INTEL 0x7
#define CL_AVC_ME_LUMA_PREDICTOR_MODE_HORIZONTAL_UP_INTEL 0x8
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_DC_INTEL 0x0
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_HORIZONTAL_INTEL 0x1
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_VERTICAL_INTEL 0x2
#define CL_AVC_ME_CHROMA_PREDICTOR_MODE_PLANE_INTEL 0x3
#define CL_AVC_ME_FRAME_FORWARD_INTEL 0x1
#define CL_AVC_ME_FRAME_BACKWARD_INTEL 0x2
#define CL_AVC_ME_FRAME_DUAL_INTEL 0x3
#define CL_AVC_ME_SLICE_TYPE_PRED_INTEL 0x0
#define CL_AVC_ME_SLICE_TYPE_BPRED_INTEL 0x1
#define CL_AVC_ME_SLICE_TYPE_INTRA_INTEL 0x2
#define CL_AVC_ME_INTERLACED_SCAN_TOP_FIELD_INTEL 0x0
#define CL_AVC_ME_INTERLACED_SCAN_BOTTOM_FIELD_INTEL 0x1
#ifdef __cplusplus
}
#endif
#endif /* __CL_EXT_INTEL_H */

View File

@@ -1,167 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
#ifndef __OPENCL_CL_GL_H
#define __OPENCL_CL_GL_H
#ifdef __APPLE__
#include <OpenCL/cl.h>
#else
#include "cl.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
typedef cl_uint cl_gl_object_type;
typedef cl_uint cl_gl_texture_info;
typedef cl_uint cl_gl_platform_info;
typedef struct __GLsync *cl_GLsync;
/* cl_gl_object_type = 0x2000 - 0x200F enum values are currently taken */
#define CL_GL_OBJECT_BUFFER 0x2000
#define CL_GL_OBJECT_TEXTURE2D 0x2001
#define CL_GL_OBJECT_TEXTURE3D 0x2002
#define CL_GL_OBJECT_RENDERBUFFER 0x2003
#define CL_GL_OBJECT_TEXTURE2D_ARRAY 0x200E
#define CL_GL_OBJECT_TEXTURE1D 0x200F
#define CL_GL_OBJECT_TEXTURE1D_ARRAY 0x2010
#define CL_GL_OBJECT_TEXTURE_BUFFER 0x2011
/* cl_gl_texture_info */
#define CL_GL_TEXTURE_TARGET 0x2004
#define CL_GL_MIPMAP_LEVEL 0x2005
#define CL_GL_NUM_SAMPLES 0x2012
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLBuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* bufobj */,
int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLTexture(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromGLRenderbuffer(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLuint /* renderbuffer */,
cl_int * /* errcode_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLObjectInfo(cl_mem /* memobj */,
cl_gl_object_type * /* gl_object_type */,
cl_GLuint * /* gl_object_name */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLTextureInfo(cl_mem /* memobj */,
cl_gl_texture_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseGLObjects(cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem * /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event * /* event_wait_list */,
cl_event * /* event */) CL_API_SUFFIX__VERSION_1_0;
/* Deprecated OpenCL 1.1 APIs */
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture2D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
extern CL_API_ENTRY CL_EXT_PREFIX__VERSION_1_1_DEPRECATED cl_mem CL_API_CALL
clCreateFromGLTexture3D(cl_context /* context */,
cl_mem_flags /* flags */,
cl_GLenum /* target */,
cl_GLint /* miplevel */,
cl_GLuint /* texture */,
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1_DEPRECATED;
/* cl_khr_gl_sharing extension */
#define cl_khr_gl_sharing 1
typedef cl_uint cl_gl_context_info;
/* Additional Error Codes */
#define CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR -1000
/* cl_gl_context_info */
#define CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR 0x2006
#define CL_DEVICES_FOR_GL_CONTEXT_KHR 0x2007
/* Additional cl_context_properties */
#define CL_GL_CONTEXT_KHR 0x2008
#define CL_EGL_DISPLAY_KHR 0x2009
#define CL_GLX_DISPLAY_KHR 0x200A
#define CL_WGL_HDC_KHR 0x200B
#define CL_CGL_SHAREGROUP_KHR 0x200C
extern CL_API_ENTRY cl_int CL_API_CALL
clGetGLContextInfoKHR(const cl_context_properties * /* properties */,
cl_gl_context_info /* param_name */,
size_t /* param_value_size */,
void * /* param_value */,
size_t * /* param_value_size_ret */) CL_API_SUFFIX__VERSION_1_0;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clGetGLContextInfoKHR_fn)(
const cl_context_properties * properties,
cl_gl_context_info param_name,
size_t param_value_size,
void * param_value,
size_t * param_value_size_ret);
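/*
 * Usage sketch (illustrative addition, not part of the original header):
 * find the device currently driving a GL context before creating a shared
 * CL context with the same properties. The helper name is an assumption.
 */
static cl_device_id example_gl_device(const cl_context_properties *props)
{
    cl_device_id dev = NULL;
    clGetGLContextInfoKHR(props, CL_CURRENT_DEVICE_FOR_GL_CONTEXT_KHR,
                          sizeof(dev), &dev, NULL);
    return dev;
}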
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_H */

View File

@@ -1,74 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
/* cl_gl_ext.h contains vendor (non-KHR) OpenCL extensions which have */
/* OpenGL dependencies. */
#ifndef __OPENCL_CL_GL_EXT_H
#define __OPENCL_CL_GL_EXT_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl_gl.h>
#else
#include "cl_gl.h"
#endif
/*
 * For each extension, follow this template:
 *   cl_VEN_extname extension
 *   #define cl_VEN_extname 1
 *   ... define new types, if any
 *   ... define new tokens, if any
 *   ... define new APIs, if any
 *
 * If you need GL types here, mirror them with a cl_GLtype rather than
 * including a GL header. This allows us to avoid having to decide whether
 * to include GL headers or GLES here.
 */
/*
* cl_khr_gl_event extension
* See section 9.9 in the OpenCL 1.1 spec for more information
*/
#define CL_COMMAND_GL_FENCE_SYNC_OBJECT_KHR 0x200D
extern CL_API_ENTRY cl_event CL_API_CALL
clCreateEventFromGLsyncKHR(cl_context /* context */,
cl_GLsync /* sync */,
cl_int * /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_1;
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_GL_EXT_H */

File diff suppressed because it is too large Load Diff

View File

@@ -1,172 +0,0 @@
/**********************************************************************************
* Copyright (c) 2008-2016 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
**********************************************************************************/
/*****************************************************************************\
Copyright (c) 2013-2016 Intel Corporation All Rights Reserved.
THESE MATERIALS ARE PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THESE
MATERIALS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
File Name: cl_va_api_media_sharing_intel.h
Abstract:
Notes:
\*****************************************************************************/
#ifndef __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#define __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H
#include "cl.h"
#include "cl_platform.h"
#include <va/va.h>
#ifdef __cplusplus
extern "C" {
#endif
/******************************************
* cl_intel_va_api_media_sharing extension *
*******************************************/
#define cl_intel_va_api_media_sharing 1
/* error codes */
#define CL_INVALID_VA_API_MEDIA_ADAPTER_INTEL -1098
#define CL_INVALID_VA_API_MEDIA_SURFACE_INTEL -1099
#define CL_VA_API_MEDIA_SURFACE_ALREADY_ACQUIRED_INTEL -1100
#define CL_VA_API_MEDIA_SURFACE_NOT_ACQUIRED_INTEL -1101
/* cl_va_api_device_source_intel */
#define CL_VA_API_DISPLAY_INTEL 0x4094
/* cl_va_api_device_set_intel */
#define CL_PREFERRED_DEVICES_FOR_VA_API_INTEL 0x4095
#define CL_ALL_DEVICES_FOR_VA_API_INTEL 0x4096
/* cl_context_info */
#define CL_CONTEXT_VA_API_DISPLAY_INTEL 0x4097
/* cl_mem_info */
#define CL_MEM_VA_API_MEDIA_SURFACE_INTEL 0x4098
/* cl_image_info */
#define CL_IMAGE_VA_API_PLANE_INTEL 0x4099
/* cl_command_type */
#define CL_COMMAND_ACQUIRE_VA_API_MEDIA_SURFACES_INTEL 0x409A
#define CL_COMMAND_RELEASE_VA_API_MEDIA_SURFACES_INTEL 0x409B
typedef cl_uint cl_va_api_device_source_intel;
typedef cl_uint cl_va_api_device_set_intel;
extern CL_API_ENTRY cl_int CL_API_CALL
clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
cl_platform_id /* platform */,
cl_va_api_device_source_intel /* media_adapter_type */,
void* /* media_adapter */,
cl_va_api_device_set_intel /* media_adapter_set */,
cl_uint /* num_entries */,
cl_device_id* /* devices */,
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL * clGetDeviceIDsFromVA_APIMediaAdapterINTEL_fn)(
cl_platform_id /* platform */,
cl_va_api_device_source_intel /* media_adapter_type */,
void* /* media_adapter */,
cl_va_api_device_set_intel /* media_adapter_set */,
cl_uint /* num_entries */,
cl_device_id* /* devices */,
cl_uint* /* num_devices */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_mem CL_API_CALL
clCreateFromVA_APIMediaSurfaceINTEL(
cl_context /* context */,
cl_mem_flags /* flags */,
VASurfaceID* /* surface */,
cl_uint /* plane */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_mem (CL_API_CALL * clCreateFromVA_APIMediaSurfaceINTEL_fn)(
cl_context /* context */,
cl_mem_flags /* flags */,
VASurfaceID* /* surface */,
cl_uint /* plane */,
cl_int* /* errcode_ret */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueAcquireVA_APIMediaSurfacesINTEL(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueAcquireVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
extern CL_API_ENTRY cl_int CL_API_CALL
clEnqueueReleaseVA_APIMediaSurfacesINTEL(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
typedef CL_API_ENTRY cl_int (CL_API_CALL *clEnqueueReleaseVA_APIMediaSurfacesINTEL_fn)(
cl_command_queue /* command_queue */,
cl_uint /* num_objects */,
const cl_mem* /* mem_objects */,
cl_uint /* num_events_in_wait_list */,
const cl_event* /* event_wait_list */,
cl_event* /* event */) CL_EXT_SUFFIX__VERSION_1_2;
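/*
 * Usage sketch (illustrative addition, not part of the original header):
 * enumerate the devices that can share a VA-API display, wrap plane 0 of a
 * surface as a cl_mem, and bracket its use with acquire/release. The helper
 * name is an assumption.
 */
static cl_int example_va_sharing(cl_platform_id plat, void *va_display,
                                 cl_context ctx, cl_command_queue q,
                                 VASurfaceID *surface)
{
    cl_device_id dev; cl_uint n = 0; cl_int err;
    err = clGetDeviceIDsFromVA_APIMediaAdapterINTEL(
        plat, CL_VA_API_DISPLAY_INTEL, va_display,
        CL_PREFERRED_DEVICES_FOR_VA_API_INTEL, 1, &dev, &n);
    if (err != CL_SUCCESS || n == 0) return err;
    cl_mem plane0 = clCreateFromVA_APIMediaSurfaceINTEL(ctx, CL_MEM_READ_ONLY,
                                                        surface, 0, &err);
    if (err != CL_SUCCESS) return err;
    err = clEnqueueAcquireVA_APIMediaSurfacesINTEL(q, 1, &plane0, 0, NULL, NULL);
    /* ... run kernels that read plane0 ... */
    if (err == CL_SUCCESS)
        err = clEnqueueReleaseVA_APIMediaSurfacesINTEL(q, 1, &plane0, 0, NULL, NULL);
    clReleaseMemObject(plane0);
    return err;
}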
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_CL_VA_API_MEDIA_SHARING_INTEL_H */

View File

@@ -1,59 +0,0 @@
/*******************************************************************************
* Copyright (c) 2008-2015 The Khronos Group Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and/or associated documentation files (the
* "Materials"), to deal in the Materials without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Materials, and to
* permit persons to whom the Materials are furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Materials.
*
* MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
* KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
* SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
* https://www.khronos.org/registry/
*
* THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
******************************************************************************/
/* $Revision: 11708 $ on $Date: 2010-06-13 23:36:24 -0700 (Sun, 13 Jun 2010) $ */
#ifndef __OPENCL_H
#define __OPENCL_H
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_gl.h>
#include <OpenCL/cl_gl_ext.h>
#include <OpenCL/cl_ext.h>
#else
#include "cl.h"
#include "cl_gl.h"
#include "cl_gl_ext.h"
#include "cl_ext.h"
#endif
#ifdef __cplusplus
}
#endif
#endif /* __OPENCL_H */

include/triton/external/hip.h vendored Normal file
View File

@@ -0,0 +1,288 @@
/*
* @brief hipError_t
* @enum
* @ingroup Enumerations
*/
// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
// the NVCC and HCC paths. Also update the hipCUDAErrorTohipError function in the NVCC path.
// Ignoring error-code return values from hip APIs is discouraged. On C++17,
// we can make that yield a warning.
#include <cstddef>
typedef enum hipError_t {
hipSuccess = 0, ///< Successful completion.
hipErrorInvalidValue = 1, ///< One or more of the parameters passed to the API call is NULL
///< or not in an acceptable range.
hipErrorOutOfMemory = 2,
// Deprecated
hipErrorMemoryAllocation = 2, ///< Memory allocation error.
hipErrorNotInitialized = 3,
// Deprecated
hipErrorInitializationError = 3,
hipErrorDeinitialized = 4,
hipErrorProfilerDisabled = 5,
hipErrorProfilerNotInitialized = 6,
hipErrorProfilerAlreadyStarted = 7,
hipErrorProfilerAlreadyStopped = 8,
hipErrorInvalidConfiguration = 9,
hipErrorInvalidPitchValue = 12,
hipErrorInvalidSymbol = 13,
hipErrorInvalidDevicePointer = 17, ///< Invalid Device Pointer
hipErrorInvalidMemcpyDirection = 21, ///< Invalid memory copy direction
hipErrorInsufficientDriver = 35,
hipErrorMissingConfiguration = 52,
hipErrorPriorLaunchFailure = 53,
hipErrorInvalidDeviceFunction = 98,
hipErrorNoDevice = 100, ///< Call to hipGetDeviceCount returned 0 devices
hipErrorInvalidDevice = 101, ///< DeviceID must be in range 0...#compute-devices.
hipErrorInvalidImage = 200,
hipErrorInvalidContext = 201, ///< Produced when input context is invalid.
hipErrorContextAlreadyCurrent = 202,
hipErrorMapFailed = 205,
// Deprecated
hipErrorMapBufferObjectFailed = 205, ///< Produced when the IPC memory attach failed from ROCr.
hipErrorUnmapFailed = 206,
hipErrorArrayIsMapped = 207,
hipErrorAlreadyMapped = 208,
hipErrorNoBinaryForGpu = 209,
hipErrorAlreadyAcquired = 210,
hipErrorNotMapped = 211,
hipErrorNotMappedAsArray = 212,
hipErrorNotMappedAsPointer = 213,
hipErrorECCNotCorrectable = 214,
hipErrorUnsupportedLimit = 215,
hipErrorContextAlreadyInUse = 216,
hipErrorPeerAccessUnsupported = 217,
hipErrorInvalidKernelFile = 218, ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
hipErrorInvalidGraphicsContext = 219,
hipErrorInvalidSource = 300,
hipErrorFileNotFound = 301,
hipErrorSharedObjectSymbolNotFound = 302,
hipErrorSharedObjectInitFailed = 303,
hipErrorOperatingSystem = 304,
hipErrorInvalidHandle = 400,
// Deprecated
hipErrorInvalidResourceHandle = 400, ///< Resource handle (hipEvent_t or hipStream_t) invalid.
hipErrorNotFound = 500,
hipErrorNotReady = 600, ///< Indicates that asynchronous operations enqueued earlier are not
///< ready. This is not actually an error, but is used to distinguish
///< from hipSuccess (which indicates completion). APIs that return
///< this error include hipEventQuery and hipStreamQuery.
hipErrorIllegalAddress = 700,
hipErrorLaunchOutOfResources = 701, ///< Out of resources error.
hipErrorLaunchTimeOut = 702,
hipErrorPeerAccessAlreadyEnabled =
704, ///< Peer access was already enabled from the current device.
hipErrorPeerAccessNotEnabled =
705, ///< Peer access was never enabled from the current device.
hipErrorSetOnActiveProcess = 708,
hipErrorAssert = 710, ///< Produced when the kernel calls assert.
hipErrorHostMemoryAlreadyRegistered =
712, ///< Produced when trying to lock an already page-locked memory region.
hipErrorHostMemoryNotRegistered =
713, ///< Produced when trying to unlock memory that is not page-locked.
hipErrorLaunchFailure =
719, ///< An exception occurred on the device while executing a kernel.
hipErrorCooperativeLaunchTooLarge =
720, ///< This error indicates that the number of blocks launched per grid for a kernel
///< that was launched via cooperative launch APIs exceeds the maximum number of
///< allowed blocks for the current device
hipErrorNotSupported = 801, ///< Produced when the hip API is not supported/implemented
hipErrorUnknown = 999, ///< Unknown error.
// HSA Runtime Error Codes start here.
hipErrorRuntimeMemory = 1052, ///< HSA runtime memory call returned error. Typically not seen
///< in production systems.
hipErrorRuntimeOther = 1053, ///< HSA runtime call other than memory returned error. Typically
///< not seen in production systems.
hipErrorTbd ///< Marker that more error codes are needed.
} hipError_t;
typedef struct ihipCtx_t* hipCtx_t;
// Note many APIs also use integer deviceIds as an alternative to the device pointer:
typedef int hipDevice_t;
typedef enum hipDeviceP2PAttr {
hipDevP2PAttrPerformanceRank = 0,
hipDevP2PAttrAccessSupported,
hipDevP2PAttrNativeAtomicSupported,
hipDevP2PAttrHipArrayAccessSupported
} hipDeviceP2PAttr;
typedef struct ihipStream_t* hipStream_t;
#define hipIpcMemLazyEnablePeerAccess 0
#define HIP_IPC_HANDLE_SIZE 64
typedef struct hipIpcMemHandle_st {
char reserved[HIP_IPC_HANDLE_SIZE];
} hipIpcMemHandle_t;
typedef struct hipIpcEventHandle_st {
char reserved[HIP_IPC_HANDLE_SIZE];
} hipIpcEventHandle_t;
typedef struct ihipModule_t* hipModule_t;
typedef struct ihipModuleSymbol_t* hipFunction_t;
typedef struct hipFuncAttributes {
int binaryVersion;
int cacheModeCA;
size_t constSizeBytes;
size_t localSizeBytes;
int maxDynamicSharedSizeBytes;
int maxThreadsPerBlock;
int numRegs;
int preferredShmemCarveout;
int ptxVersion;
size_t sharedSizeBytes;
} hipFuncAttributes;
typedef struct ihipEvent_t* hipEvent_t;
/*
* @brief hipDeviceAttribute_t
* @enum
* @ingroup Enumerations
*/
typedef enum hipDeviceAttribute_t {
hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block.
hipDeviceAttributeMaxBlockDimX, ///< Maximum x-dimension of a block.
hipDeviceAttributeMaxBlockDimY, ///< Maximum y-dimension of a block.
hipDeviceAttributeMaxBlockDimZ, ///< Maximum z-dimension of a block.
hipDeviceAttributeMaxGridDimX, ///< Maximum x-dimension of a grid.
hipDeviceAttributeMaxGridDimY, ///< Maximum y-dimension of a grid.
hipDeviceAttributeMaxGridDimZ, ///< Maximum z-dimension of a grid.
hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in
///< bytes.
hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes.
hipDeviceAttributeWarpSize, ///< Warp size in threads.
hipDeviceAttributeMaxRegistersPerBlock, ///< Maximum number of 32-bit registers available to a
///< thread block. This number is shared by all thread
///< blocks simultaneously resident on a
///< multiprocessor.
hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz.
hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz.
hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits.
hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device.
hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in.
hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
///< cache.
hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per
///< multiprocessor.
hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number.
hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number.
hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels
///< concurrently.
hipDeviceAttributePciBusId, ///< PCI Bus ID.
hipDeviceAttributePciDeviceId, ///< PCI Device ID.
hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory Per
///< Multiprocessor.
hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices.
hipDeviceAttributeIntegrated, ///< iGPU
hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch
hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices
hipDeviceAttributeMaxTexture1DWidth, ///< Maximum number of elements in 1D images
hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D images in image elements
hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension height of 2D images in image elements
hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D images in image elements
hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimensions height of 3D images in image elements
hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimensions depth of 3D images in image elements
hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies
hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures
hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory
hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device
hipDeviceAttributeCanMapHostMemory, ///< Device can map host memory into device address space
hipDeviceAttributeEccEnabled, ///< Device has ECC support enabled
hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple
///devices with unmatched functions
hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple
///devices with unmatched grid dimensions
hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple
///devices with unmatched block dimensions
hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple
///devices with unmatched shared memories
hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device
hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system
hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
/// the device without migration
hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory
/// concurrently with the CPU
hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory
/// without calling hipHostRegister on it
hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
/// the host's page tables
hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
///< hipStreamWaitValue64() , '0' otherwise.
} hipDeviceAttribute_t;
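// Illustrative query (an editor's sketch, not part of the vendored header):
// reading one of the attributes above through the driver-style entry point
// declared later in this patch, assuming `device` is a valid hipDevice_t:
//   int warp_size = 0;
//   hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, device);
//   // warp_size is 64 on most AMD GPUs and 32 on NVIDIA hardware.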
typedef void* hipDeviceptr_t;
/*
* @brief hipJitOption
* @enum
* @ingroup Enumerations
*/
typedef enum hipJitOption {
hipJitOptionMaxRegisters = 0,
hipJitOptionThreadsPerBlock,
hipJitOptionWallTime,
hipJitOptionInfoLogBuffer,
hipJitOptionInfoLogBufferSizeBytes,
hipJitOptionErrorLogBuffer,
hipJitOptionErrorLogBufferSizeBytes,
hipJitOptionOptimizationLevel,
hipJitOptionTargetFromContext,
hipJitOptionTarget,
hipJitOptionFallbackStrategy,
hipJitOptionGenerateDebugInfo,
hipJitOptionLogVerbose,
hipJitOptionGenerateLineInfo,
hipJitOptionCacheMode,
hipJitOptionSm3xOpt,
hipJitOptionFastCompile,
hipJitOptionNumOptions
} hipJitOption;
/**
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
*/
typedef enum hipFuncAttribute {
hipFuncAttributeMaxDynamicSharedMemorySize = 8,
hipFuncAttributePreferredSharedMemoryCarveout = 9,
hipFuncAttributeMax
} hipFuncAttribute;
/**
* @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
*/
typedef enum hipFuncCache_t {
hipFuncCachePreferNone, ///< no preference for shared memory or L1 (default)
hipFuncCachePreferShared, ///< prefer larger shared memory and smaller L1 cache
hipFuncCachePreferL1, ///< prefer larger L1 cache and smaller shared memory
hipFuncCachePreferEqual, ///< prefer equal size L1 cache and shared memory
} hipFuncCache_t;
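// Illustrative use (sketch only, not part of the header): a host program could
// request the larger-shared-memory carveout for one kernel with
//   hipFuncSetCacheConfig(fn, hipFuncCachePreferShared);
// where `fn` is a hipFunction_t obtained from hipModuleGetFunction; per the
// warning above, the hint may be ignored on AMD and some NVIDIA devices.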
#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
#define HIP_LAUNCH_PARAM_END ((void*)0x03)
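
These three sentinels mirror CUDA's CU_LAUNCH_PARAM_* values: they tag and terminate the `extra` argument of hipModuleLaunchKernel (declared in the dispatch table later in this patch) when kernel arguments are passed as one packed buffer rather than a pointer-per-argument array. A minimal usage sketch, assuming `fn`, `stream`, the launch dimensions, and an `args` struct packed to the kernel's ABI already exist:

// Sketch only: launch with a packed argument buffer via `extra`.
size_t args_size = sizeof(args);
void* config[] = {HIP_LAUNCH_PARAM_BUFFER_POINTER, &args,
                  HIP_LAUNCH_PARAM_BUFFER_SIZE,    &args_size,
                  HIP_LAUNCH_PARAM_END};
hipModuleLaunchKernel(fn, grid_x, grid_y, grid_z, block_x, block_y, block_z,
                      /*sharedMemBytes=*/0, stream,
                      /*kernelParams=*/nullptr, /*extra=*/config);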

View File

@@ -13,45 +13,40 @@
#include "triton/codegen/transform/peephole.h" #include "triton/codegen/transform/peephole.h"
#include "triton/codegen/transform/pipeline.h" #include "triton/codegen/transform/pipeline.h"
#include "triton/codegen/transform/prefetch.h" #include "triton/codegen/transform/prefetch.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/module.h"
#include "triton/ir/function.h" #include "triton/ir/function.h"
#include "triton/ir/module.h" #include "triton/ir/module.h"
#include "triton/ir/print.h" #include "triton/ir/print.h"
#include "llvm/IR/Module.h" #include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
namespace triton { namespace triton {
namespace codegen { namespace codegen {
// TODO: // TODO:
// There should be a proper pass manager there! // There should be a proper pass manager there!
void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps, int num_stages, bool force_nc_cache, std::unique_ptr<llvm::Module> add_passes_to_emit_bin(ir::module &ir, llvm::LLVMContext& ctx, codegen::target* target,
driver::module *&mod, driver::kernel *&ker, size_t &shared_mem) { int cc, int num_warps, int num_stages, bool force_nc_cache, int& shared_static) {
// generate llvm code // generate llvm code
llvm::LLVMContext ctx;
std::string name = ir.get_function_list()[0]->get_name(); std::string name = ir.get_function_list()[0]->get_name();
std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx)); std::unique_ptr<llvm::Module> llvm(new llvm::Module(name, ctx));
// optimizations // optimizations
std::unique_ptr<codegen::target> target = dev->make_target(); bool cts_use_async = target->as_nvidia() && target->as_nvidia()->sm() >= 80;
bool cts_use_async = target->as_nvidia()->sm() >= 80;
// create passes // create passes
codegen::analysis::align align; codegen::analysis::align align;
codegen::analysis::axes axes; codegen::analysis::axes axes;
codegen::transform::cts cts(cts_use_async); codegen::transform::cts cts(cts_use_async);
codegen::transform::pipeline pipeline(cts_use_async, num_stages); codegen::transform::pipeline pipeline(cts_use_async, num_stages);
codegen::transform::disassociate disassociate; codegen::transform::disassociate disassociate;
codegen::analysis::layouts layouts(&axes, &align, num_warps, target.get()); codegen::analysis::layouts layouts(&axes, &align, num_warps, target);
codegen::analysis::liveness liveness(&layouts); codegen::analysis::liveness liveness(&layouts);
codegen::analysis::swizzle swizzle(&layouts, target.get()); codegen::analysis::swizzle swizzle(&layouts, target);
codegen::analysis::allocation allocation(&liveness); codegen::analysis::allocation allocation(&liveness);
codegen::transform::dce dce; codegen::transform::dce dce;
codegen::transform::peephole peephole(target.get(), &layouts); codegen::transform::peephole peephole(target, &layouts);
// codegen::transform::reassociate reassociate;
codegen::transform::coalesce coalesce(&align, &layouts); codegen::transform::coalesce coalesce(&align, &layouts);
codegen::transform::prefetch prefetch_s(target.get()); codegen::transform::prefetch prefetch_s(target);
codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target.get()); codegen::transform::membar barriers(&liveness, &layouts, &allocation, &prefetch_s, target);
codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target.get(), num_warps, force_nc_cache); codegen::generator isel(&axes, &layouts, &align, &allocation, &swizzle, target, num_warps, force_nc_cache);
// run passes // run passes
dce.run(ir); dce.run(ir);
peephole.run(ir); peephole.run(ir);
@@ -72,15 +67,12 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
layouts.run(ir); layouts.run(ir);
coalesce.run(ir); coalesce.run(ir);
dce.run(ir); dce.run(ir);
// exit(1);
align.run(ir); align.run(ir);
dce.run(ir); dce.run(ir);
if (target->is_gpu()) if (target->is_gpu())
cts.run(ir); cts.run(ir);
dce.run(ir); dce.run(ir);
align.run(ir); align.run(ir);
// ir::print(ir, std::cout);
axes.run(ir); axes.run(ir);
layouts.run(ir); layouts.run(ir);
peephole.run(ir); peephole.run(ir);
@@ -93,11 +85,9 @@ void add_passes_to_emit_bin(ir::module &ir, driver::device *dev, int num_warps,
allocation.run(ir); allocation.run(ir);
prefetch_s.run(ir); prefetch_s.run(ir);
barriers.run(ir); barriers.run(ir);
// ir.print(std::cout);
isel.visit(ir, *llvm); isel.visit(ir, *llvm);
mod = driver::module::create(dev, std::move(llvm)); shared_static = allocation.allocated_size();
ker = driver::kernel::create(&*mod, name.c_str()); return llvm;
shared_mem = allocation.allocated_size();
} }
} // namespace codegen } // namespace codegen
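
The refactor above removes codegen's dependency on the driver layer: add_passes_to_emit_bin now receives the LLVM context, target, and compute capability from the caller and returns the generated llvm::Module (reporting static shared-memory usage through shared_static) instead of materializing driver::module/driver::kernel objects itself. A sketch of the resulting call pattern; make_target_for and llir_to_binary are hypothetical stand-ins for the caller's backend-specific steps, not APIs introduced by this patch:

llvm::LLVMContext ctx;
int shared_static = 0;
int cc = 80;  // compute capability of the target device (illustrative value)
std::unique_ptr<codegen::target> target = make_target_for(cc);  // stand-in helper
std::unique_ptr<llvm::Module> mod = codegen::add_passes_to_emit_bin(
    ir, ctx, target.get(), cc, /*num_warps=*/4, /*num_stages=*/2,
    /*force_nc_cache=*/false, shared_static);
std::string binary = llir_to_binary(*mod);  // stand-in: PTX on NVIDIA, an AMDGPU code object on ROCm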

View File

@@ -1,231 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <vector>
#include <stdexcept>
#include "triton/driver/dispatch.h"
#include "triton/driver/backend.h"
#include "triton/driver/buffer.h"
#include "triton/driver/context.h"
#include "triton/driver/stream.h"
#include "triton/driver/kernel.h"
namespace triton
{
namespace driver
{
/*-----------------------------------*/
//----------- Platforms ------------*/
/*-----------------------------------*/
void backend::platforms::init() {
if(!cache_.empty())
return;
//if CUDA is here
if(dispatch::cuinit()){
cache_.push_back(new cu_platform());
}
//if host should be added
bool host_visible = true;
if(host_visible){
cache_.push_back(new host_platform());
}
// //if OpenCL is here
// if(dispatch::clinit()){
// cl_uint num_platforms;
// dispatch::clGetPlatformIDs(0, nullptr, &num_platforms);
// std::vector<cl_platform_id> ids(num_platforms);
// dispatch::clGetPlatformIDs(num_platforms, ids.data(), nullptr);
// for(cl_platform_id id: ids)
// cache_.push_back(new cl_platform(id));
// }
if(cache_.empty())
throw std::runtime_error("Triton: No backend available. Make sure CUDA is available in your library path");
}
void backend::platforms::get(std::vector<platform *> &results) {
std::copy(cache_.begin(), cache_.end(), std::back_inserter(results));
}
std::vector<driver::platform*> backend::platforms::cache_;
/*-----------------------------------*/
//----------- Devices --------------*/
/*-----------------------------------*/
void backend::devices::init(std::vector<platform*> const & platforms) {
if(!cache_.empty())
return;
for(driver::platform* pf: platforms)
pf->devices(cache_);
if(cache_.empty())
throw std::runtime_error("Triton: No device available. Make sure that your platform is configured properly");
}
void backend::devices::get(std::vector<device*> &devs) {
std::copy(cache_.begin(), cache_.end(), std::back_inserter(devs));
}
std::vector<driver::device*> backend::devices::cache_;
/*-----------------------------------*/
//---------- Modules ----------------*/
/*-----------------------------------*/
void backend::modules::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
std::map<std::tuple<driver::stream*, std::string>, driver::module*> backend::modules::cache_;
/*-----------------------------------*/
//----------- Kernels --------------*/
/*-----------------------------------*/
void backend::kernels::release(){
for(auto & x: cache_)
delete x.second;
cache_.clear();
}
driver::kernel* backend::kernels::get(driver::module *mod, std::string const & name){
std::tuple<driver::module*, std::string> key(mod, name);
if(cache_.find(key)==cache_.end()){
return &*cache_.insert({key, driver::kernel::create(mod, name.c_str())}).first->second;
}
return cache_.at(key);
}
std::map<std::tuple<driver::module*, std::string>, driver::kernel*> backend::kernels::cache_;
/*-----------------------------------*/
//------------ Queues --------------*/
/*-----------------------------------*/
void backend::streams::init(std::list<driver::context*> const & contexts){
for(driver::context* ctx : contexts)
if(cache_.find(ctx)==cache_.end())
cache_.insert(std::make_pair(ctx, std::vector<driver::stream*>{driver::stream::create(ctx->backend())}));
}
void backend::streams::release(){
for(auto & x: cache_)
for(auto & y: x.second)
delete y;
cache_.clear();
}
driver::stream* backend::streams::get_default()
{ return get(contexts::get_default(), 0); }
driver::stream* backend::streams::get(driver::context* context, unsigned int id){
init(std::list<driver::context*>(1,context));
for(auto & x : cache_)
if(x.first==context)
return x.second[id];
throw;
}
void backend::streams::get(driver::context* context, std::vector<driver::stream*> & queues){
init(std::list<driver::context*>(1,context));
queues = cache_.at(context);
}
std::map<driver::context*, std::vector<driver::stream*>> backend::streams::cache_;
/*-----------------------------------*/
//------------ Contexts ------------*/
/*-----------------------------------*/
void backend::contexts::init(std::vector<driver::device*> const & devices){
for(driver::device* dvc: devices)
cache_.push_back(driver::context::create(dvc));
}
void backend::contexts::release(){
for(auto & x: cache_)
delete x;
cache_.clear();
}
driver::context* backend::contexts::get_default(){
backend::init();
auto it = cache_.begin();
std::advance(it, default_device);
return *it;
}
void backend::contexts::get(std::list<driver::context*> & contexts){
backend::init();
contexts = cache_;
}
std::list<driver::context*> backend::contexts::cache_;
/*-----------------------------------*/
//------------ General -------------*/
/*-----------------------------------*/
void backend::synchronize(driver::context* context){
for(driver::stream * queue: streams::cache_.at(context))
queue->synchronize();
}
void backend::release(){
backend::kernels::release();
// backend::programs::release();
backend::streams::release();
backend::contexts::release();
}
void backend::init(){
if(!contexts::cache_.empty())
return;
// initialize platforms
backend::platforms::init();
// initialize devices
backend::devices::init(platforms::cache_);
// initialize contexts
backend::contexts::init(devices::cache_);
// initialize streams
streams::init(contexts::cache_);
}
unsigned int backend::default_device = 0;
}
}

View File

@@ -1,90 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "triton/driver/stream.h"
#include "triton/driver/buffer.h"
#include "triton/driver/context.h"
#include "triton/driver/dispatch.h"
namespace triton
{
namespace driver
{
//
buffer::buffer(size_t size, CUdeviceptr cu, bool take_ownership)
: polymorphic_resource(cu, take_ownership), size_(size) { }
buffer::buffer(size_t size, host_buffer_t hst, bool take_ownership)
: polymorphic_resource(hst, take_ownership), size_(size) { }
size_t buffer::size() {
return size_;
}
uintptr_t buffer::addr_as_uintptr_t() {
switch(backend_){
case CUDA: return *cu_;
case Host: return (uintptr_t)hst_->data;
default: return 0;
}
}
buffer* buffer::create(driver::context* ctx, size_t size) {
switch(ctx->backend()){
case CUDA: return new cu_buffer(size);
case Host: return new host_buffer(size);
default: throw std::runtime_error("unknown backend");
}
}
//
host_buffer::host_buffer(size_t size)
: buffer(size, host_buffer_t(), true){
hst_->data = new char[size];
}
//
cu_buffer::cu_buffer(size_t size)
: buffer(size, CUdeviceptr(), true) {
dispatch::cuMemAlloc(&*cu_, size);
}
cu_buffer::cu_buffer(size_t size, CUdeviceptr cu, bool take_ownership)
: buffer(size, cu, take_ownership){
}
void cu_buffer::set_zero(driver::stream* queue, size_t size){
dispatch::cuMemsetD8Async(*cu_, 0, size, *queue->cu());
}
}
}

View File

@@ -1,118 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include "triton/driver/context.h"
#include "triton/driver/module.h"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
namespace triton
{
namespace driver
{
/* ------------------------ */
// BASE //
/* ------------------------ */
context::context(driver::device *dev, CUcontext cu, bool take_ownership):
polymorphic_resource(cu, take_ownership),
dev_(dev), cache_path_(get_cache_path()) {
}
context::context(driver::device *dev, host_context_t hst, bool take_ownership):
polymorphic_resource(hst, take_ownership),
dev_(dev), cache_path_(get_cache_path()){
}
context* context::create(driver::device *dev){
switch(dev->backend()){
case CUDA: return new cu_context(dev);
case Host: return new host_context(dev);
default: throw std::runtime_error("unknown backend");
}
}
driver::device* context::device() const {
return dev_;
}
std::string context::get_cache_path(){
//user-specified cache path
std::string result = tools::getenv("TRITON_CACHE_PATH");
if(!result.empty()){
if(tools::mkpath(result)==0)
return result;
}
//create in home
result = tools::getenv("HOME");
if(!result.empty())
{
result = result + "/.triton/cache/";
if(tools::mkpath(result)==0)
return result;
}
//couldn't find a directory
return "";
}
std::string const & context::cache_path() const{
return cache_path_;
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_context::host_context(driver::device* dev): context(dev, host_context_t(), true){
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
// import CUdevice
CUdevice cu_context::get_device_of(CUcontext context){
dispatch::cuCtxPushCurrent_v2(context);
CUdevice res;
dispatch::cuCtxGetDevice(&res);
dispatch::cuCtxPopCurrent_v2(NULL);
return res;
}
// wrapper for cuda context
cu_context::cu_context(CUcontext context, bool take_ownership): driver::context(new driver::cu_device(get_device_of(context), false),
context, take_ownership) {
}
cu_context::cu_context(driver::device* device): context(device, CUcontext(), true){
dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, *((driver::cu_device*)dev_)->cu());
// dispatch::cuCtxPopCurrent_v2(NULL);
}
}
}

View File

@@ -1,192 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <map>
#include <algorithm>
#include <sstream>
#include <cstring>
#include <memory>
#include "triton/driver/device.h"
#include "triton/driver/context.h"
#include "triton/driver/error.h"
#include "triton/codegen/target.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Host //
/* ------------------------ */
std::unique_ptr<codegen::target> host_device::make_target() const {
return std::unique_ptr<codegen::cpu_target>(new codegen::cpu_target());
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
// information query
template<CUdevice_attribute attr>
int cu_device::cuGetInfo() const{
int res;
dispatch::cuDeviceGetAttribute(&res, attr, *cu_);
return res;
}
// convert to nvml
nvmlDevice_t cu_device::nvml_device() const{
std::map<std::string, nvmlDevice_t> map;
std::string key = pci_bus_id();
if(map.find(key)==map.end()){
nvmlDevice_t device;
dispatch::nvmlDeviceGetHandleByPciBusId_v2(key.c_str(), &device);
return map.insert(std::make_pair(key, device)).first->second;
}
return map.at(key);
}
// number of address bits
size_t cu_device::address_bits() const{
return sizeof(size_t)*8;
}
// name
std::string cu_device::name() const {
char tmp[128];
dispatch::cuDeviceGetName(tmp, 128, *cu_);
return std::string(tmp);
}
// PCI bus ID
std::string cu_device::pci_bus_id() const{
char tmp[128];
dispatch::cuDeviceGetPCIBusId(tmp, 128, *cu_);
return std::string(tmp);
}
// force the device to be interpreted as a particular cc
void cu_device::interpret_as(int cc){
interpreted_as_ = std::make_shared<int>(cc);
}
// compute capability
int cu_device::compute_capability() const {
if(interpreted_as_)
return *interpreted_as_;
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>();
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>();
return major*10 + minor;
}
// maximum number of threads per block
size_t cu_device::max_threads_per_block() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK>();
}
// maximum amount of shared memory per block
size_t cu_device::max_shared_memory() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>();
}
// warp size
size_t cu_device::warp_size() const {
return cuGetInfo<CU_DEVICE_ATTRIBUTE_WARP_SIZE>();
}
// maximum block dimensions
std::vector<size_t> cu_device::max_block_dim() const {
std::vector<size_t> result(3);
result[0] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X>();
result[1] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y>();
result[2] = cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z>();
return result;
}
// current SM clock
size_t cu_device::current_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
// max SM clock
size_t cu_device::max_sm_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_SM, &result);
return result;
}
// current memory clock
size_t cu_device::current_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
// max memory clock
size_t cu_device::max_mem_clock() const{
unsigned int result;
dispatch::nvmlDeviceGetMaxClockInfo(nvml_device(), NVML_CLOCK_MEM, &result);
return result;
}
// max memory clock
void cu_device::set_max_clock() {
dispatch::nvmlDeviceSetApplicationsClocks(nvml_device(), max_mem_clock(), max_sm_clock());
}
void cu_device::enable_peer_access(CUdeviceptr peer_mem_ptr) const{
CUcontext context;
dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_mem_ptr);
try {
dispatch::cuCtxEnablePeerAccess(context, 0);
} catch (exception::cuda::peer_access_already_enabled) {}
}
// print infos
std::string cu_device::infos() const{
std::ostringstream oss;
std::vector<size_t> max_wi_sizes = max_block_dim();
oss << "Platform: CUDA" << std::endl;
oss << "Name: " << name() << std::endl;
oss << "Maximum total work-group size: " << max_threads_per_block() << std::endl;
oss << "Maximum individual work-group sizes: " << max_wi_sizes[0] << ", " << max_wi_sizes[1] << ", " << max_wi_sizes[2] << std::endl;
oss << "Local memory size: " << max_shared_memory() << std::endl;
return oss.str();
}
// target
std::unique_ptr<codegen::target> cu_device::make_target() const {
return std::unique_ptr<codegen::nvidia_cu_target>(new codegen::nvidia_cu_target(compute_capability()));
}
}
}

View File

@@ -21,7 +21,6 @@
 */
 #include "triton/driver/dispatch.h"
-#include "triton/driver/context.h"
 #include "triton/tools/sys/getenv.hpp"
 namespace triton
@@ -31,65 +30,65 @@ namespace driver
 //Helpers for function definition
 #define DEFINE0(init, hlib, ret, fname) ret dispatch::fname()\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname); }\
+void* dispatch::fname ## _;
 #define DEFINE1(init, hlib, ret, fname, t1) ret dispatch::fname(t1 a)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a); }\
+void* dispatch::fname ## _;
 #define DEFINE2(init, hlib, ret, fname, t1, t2) ret dispatch::fname(t1 a, t2 b)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b); }\
+void* dispatch::fname ## _;
 #define DEFINE3(init, hlib, ret, fname, t1, t2, t3) ret dispatch::fname(t1 a, t2 b, t3 c)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c); }\
+void* dispatch::fname ## _;
 #define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d); }\
+void* dispatch::fname ## _;
 #define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e); }\
+void* dispatch::fname ## _;
 #define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f); }\
+void* dispatch::fname ## _;
 #define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g); }\
+void* dispatch::fname ## _;
 #define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h); }\
+void* dispatch::fname ## _;
 #define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i); }\
+void* dispatch::fname ## _;
 #define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j); }\
+void* dispatch::fname ## _;
 #define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k); }\
+void* dispatch::fname ## _;
 #define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m); }\
+void* dispatch::fname ## _;
 #define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18, t19) ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i, t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q, t18 r, t19 s)\
-{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }
+{return f_impl<dispatch::init>(hlib, fname, fname ## _, #fname, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s); }\
+void* dispatch::fname ## _;
-//Specialized helpers for CUDA
-#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
-#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
-#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
-#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
-#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
-#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
-#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
-#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
-#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
-#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
+/* ------------------- *
+ * CUDA
+ * ------------------- */
 bool dispatch::cuinit(){
 if(cuda_==nullptr){
@@ -115,6 +114,74 @@ bool dispatch::cuinit(){
 return true;
 }
+#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
+#define CUDA_DEFINE2(ret, fname, t1, t2) DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
+#define CUDA_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
+#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
+#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
+#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+// context management
+CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
+CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
+CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
+CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
+CUDA_DEFINE1(CUresult, cuInit, unsigned int)
+CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
+// device management
+CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
+CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
+CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
+CUDA_DEFINE1(CUresult, cuDeviceGetCount, int*)
+// link management
+CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
+CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
+CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
+CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
+// module management
+CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
+CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
+CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
+CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
+CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
+CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
+// stream management
+CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
+CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
+CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
+CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
+CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
+// function management
+CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
+CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
+CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
+// memory management
+CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
+CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
+CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
+CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
+CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
+CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
+CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
+// event management
+CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
+CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
+CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
+CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
+/* ------------------- *
+ * NVML
+ * ------------------- */
 bool dispatch::nvmlinit(){
 if(nvml_==nullptr)
 nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
@@ -126,59 +193,93 @@ bool dispatch::nvmlinit(){
 return res;
 }
-//CUDA
-CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
-CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
-CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
+#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
+#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
+#define NVML_DEFINE2(ret, fname, t1, t2) DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
+#define NVML_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
-CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
-CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
-CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
-CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t, CUstream)
-CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
-CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
-CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr*, size_t*, CUmodule, const char*)
-CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void*, size_t, const char*, unsigned int, CUjit_option*, void**);
-CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option*, void**, CUlinkState*);
-CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
-CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void**, size_t*);
-CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t, CUstream)
-CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
-CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, CUstream, void **, void **)
-CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
-CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
-CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *, unsigned int, CUjit_option *, void **)
-CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute, CUdevice)
-CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
-CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t )
-CUDA_DEFINE1(CUresult, cuInit, unsigned int)
-CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
-CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
-CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule, const char *)
-CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
-CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
-CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext*)
-CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
-CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t)
-CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr)
-CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*)
-CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*)
-CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext)
-CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream)
-CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext)
-CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*)
-CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int*, CUfunction_attribute, CUfunction)
-CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute, int)
-CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
-CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
 NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *, nvmlDevice_t*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t, nvmlClockType_t, unsigned int*)
 NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t, unsigned int, unsigned int)
+/* ------------------- *
+ * HIP
+ * ------------------- */
+bool dispatch::hipinit(){
+if(hip_==nullptr)
+hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
+if(hip_ == nullptr)
+return false;
+// hipInit takes a single flags argument (currently required to be 0); call it
+// through a correctly-typed pointer and report success as a boolean
+// (hipSuccess == 0, so returning the raw error code would invert the result).
+hipError_t (*fptr)(unsigned int);
+hipInit_ = dlsym(hip_, "hipInit");
+*reinterpret_cast<void **>(&fptr) = hipInit_;
+hipError_t res = (*fptr)(0);
+check(res);
+return res == hipSuccess;
+}
+#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
+#define HIP_DEFINE2(ret, fname, t1, t2) DEFINE2(hipinit, hip_, ret, fname, t1, t2)
+#define HIP_DEFINE3(ret, fname, t1, t2, t3) DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
+#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4) DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
+#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5) DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
+#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6) DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
+#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7) DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
+#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8) DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
+#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9) DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
+#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10) DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
+#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11)
+// context management
+HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
+HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
+HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t*)
+HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
+HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t*)
+HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
+HIP_DEFINE1(hipError_t, hipInit, unsigned int)
+HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
+// device management
+HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
+HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
+HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
+HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t, hipDevice_t)
+HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
+// module management
+HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t*, size_t*, hipModule_t, const char*)
+HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
+HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
+HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
+HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *, unsigned int, hipJitOption *, void **)
+HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t, const char *)
+// stream management
+HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
+HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
+HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
+HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, hipStream_t, void **, void **)
+// function management
+HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes*, void*)
+HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
+// memory management
+HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
+HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
+HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t, hipStream_t)
+HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *, size_t, hipStream_t)
+HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t )
+HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t*, size_t)
+HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void*, CUpointer_attribute, hipDeviceptr_t)
+HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t, hipStream_t)
+// event management
+HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
+HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
+HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
+HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
+/* ------------------- *
+ * COMMON
+ * ------------------- */
 // Release
 void dispatch::release(){
@@ -190,61 +291,9 @@ void dispatch::release(){
 void* dispatch::cuda_;
 void* dispatch::nvml_;
//CUDA
void* dispatch::cuCtxGetCurrent_;
void* dispatch::cuCtxSetCurrent_;
void* dispatch::cuCtxDestroy_v2_;
void* dispatch::cuEventCreate_;
void* dispatch::cuDeviceGet_;
void* dispatch::cuMemcpyDtoH_v2_;
void* dispatch::cuStreamCreate_;
void* dispatch::cuEventElapsedTime_;
void* dispatch::cuMemFree_v2_;
void* dispatch::cuMemcpyDtoHAsync_v2_;
void* dispatch::cuDriverGetVersion_;
void* dispatch::cuDeviceGetName_;
void* dispatch::cuDeviceGetPCIBusId_;
void* dispatch::cuModuleGetGlobal_v2_;
void* dispatch::cuLinkAddData_v2_;
void* dispatch::cuLinkCreate_v2_;
void* dispatch::cuLinkDestroy_;
void* dispatch::cuModuleLoadData_;
void* dispatch::cuLinkComplete_;
void* dispatch::cuMemcpyHtoDAsync_v2_;
void* dispatch::cuModuleLoad_;
void* dispatch::cuLaunchKernel_;
void* dispatch::cuModuleUnload_;
void* dispatch::cuModuleLoadDataEx_;
void* dispatch::cuDeviceGetAttribute_;
void* dispatch::cuDeviceGetCount_;
void* dispatch::cuMemcpyHtoD_v2_;
void* dispatch::cuInit_;
void* dispatch::cuEventRecord_;
void* dispatch::cuCtxCreate_v2_;
void* dispatch::cuModuleGetFunction_;
void* dispatch::cuStreamSynchronize_;
void* dispatch::cuStreamDestroy_v2_;
void* dispatch::cuStreamGetCtx_;
void* dispatch::cuEventDestroy_v2_;
void* dispatch::cuMemAlloc_v2_;
void* dispatch::cuPointerGetAttribute_;
void* dispatch::cuCtxGetDevice_;
void* dispatch::cuMemsetD8Async_;
void* dispatch::cuCtxPushCurrent_v2_;
void* dispatch::cuCtxPopCurrent_v2_;
void* dispatch::cuFuncGetAttribute_;
void* dispatch::cuFuncSetAttribute_;
void* dispatch::cuFuncSetCacheConfig_;
void* dispatch::cuCtxEnablePeerAccess_;
void* dispatch::nvmlInit_v2_;
void* dispatch::nvmlDeviceGetHandleByPciBusId_v2_;
void* dispatch::nvmlDeviceGetClockInfo_;
void* dispatch::nvmlDeviceGetMaxClockInfo_;
void* dispatch::nvmlDeviceSetApplicationsClocks_;
void* dispatch::hip_;
}
}


@@ -94,6 +94,73 @@ void check(CUresult err)
}
}
void check(hipError_t error) {
using namespace exception::hip;
switch(error)
{
case hipSuccess : break;
case hipErrorInvalidValue : throw invalid_value();
case hipErrorMemoryAllocation : throw out_of_memory();
case hipErrorNotInitialized : throw not_initialized();
case hipErrorDeinitialized : throw deinitialized();
case hipErrorProfilerDisabled : throw profiler_disabled();
case hipErrorProfilerNotInitialized : throw profiler_not_initialized();
case hipErrorProfilerAlreadyStarted : throw profiler_already_started();
case hipErrorProfilerAlreadyStopped : throw profiler_already_stopped();
case hipErrorNoDevice : throw no_device();
case hipErrorInvalidSymbol : throw invalid_symbol();
case hipErrorInvalidDevice : throw invalid_device();
case hipErrorInvalidImage : throw invalid_image();
case hipErrorInvalidContext : throw invalid_context();
case hipErrorContextAlreadyCurrent : throw context_already_current();
case hipErrorMapFailed : throw map_failed();
case hipErrorUnmapFailed : throw unmap_failed();
case hipErrorArrayIsMapped : throw array_is_mapped();
case hipErrorAlreadyMapped : throw already_mapped();
case hipErrorNoBinaryForGpu : throw no_binary_for_gpu();
case hipErrorAlreadyAcquired : throw already_acquired();
case hipErrorNotMapped : throw not_mapped();
case hipErrorNotMappedAsArray : throw not_mapped_as_array();
case hipErrorNotMappedAsPointer : throw not_mapped_as_pointer();
case hipErrorECCNotCorrectable : throw ecc_uncorrectable();
case hipErrorUnsupportedLimit : throw unsupported_limit();
case hipErrorContextAlreadyInUse : throw context_already_in_use();
case hipErrorPeerAccessUnsupported : throw peer_access_unsupported();
case hipErrorInvalidKernelFile : throw invalid_ptx();
case hipErrorInvalidGraphicsContext : throw invalid_graphics_context();
case hipErrorInvalidSource : throw invalid_source();
case hipErrorFileNotFound : throw file_not_found();
case hipErrorSharedObjectSymbolNotFound : throw shared_object_symbol_not_found();
case hipErrorSharedObjectInitFailed : throw shared_object_init_failed();
case hipErrorOperatingSystem : throw operating_system();
case hipErrorInvalidResourceHandle : throw invalid_handle();
case hipErrorNotFound : throw not_found();
case hipErrorNotReady : throw not_ready();
case hipErrorIllegalAddress : throw illegal_address();
case hipErrorLaunchOutOfResources : throw launch_out_of_resources();
case hipErrorLaunchTimeOut : throw launch_timeout();
// case hipErrorLaunchIncompatibleTexturing : throw launch_incompatible_texturing();
case hipErrorPeerAccessAlreadyEnabled : throw peer_access_already_enabled();
case hipErrorPeerAccessNotEnabled : throw peer_access_not_enabled();
// case hipErrorPrimaryContextActive : throw primary_context_active();
// case hipErrorContextIsDestroyed : throw context_is_destroyed();
case hipErrorAssert : throw assert_error();
// case hipErrorTooManyPeers : throw too_many_peers();
case hipErrorHostMemoryAlreadyRegistered : throw host_memory_already_registered();
case hipErrorHostMemoryNotRegistered : throw host_memory_not_registered();
// case hipErrorHardwareStackError : throw hardware_stack_error();
// case hipErrorIllegalInstruction : throw illegal_instruction();
// case hipErrorMisalignedAddress : throw misaligned_address();
// case hipErrorInvalidAddressSpace : throw invalid_address_space();
// case hipErrorInvalidPc : throw invalid_pc();
case hipErrorLaunchFailure : throw launch_failed();
// case hipErrorNotPermitted : throw not_permitted();
case hipErrorNotSupported : throw not_supported();
case hipErrorUnknown : throw unknown();
default : throw unknown();
}
}
}
}
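Note: check(hipError_t) is the single funnel that turns HIP status codes into the typed exceptions in exception::hip, mirroring the existing check(CUresult) above it. A usage sketch (hedged; hipInit goes through the dispatch wrappers declared earlier):
try {
  triton::driver::check(triton::driver::dispatch::hipInit(0));
} catch (const triton::driver::exception::hip::no_device&) {
  // hipErrorNoDevice: no AMD GPU visible; a caller may fall back to CUDA/host
}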


@@ -1,91 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include "triton/driver/handle.h"
#include "triton/driver/error.h"
namespace triton
{
namespace driver
{
//Host
inline void _delete(host_platform_t) { }
inline void _delete(host_device_t) { }
inline void _delete(host_context_t) { }
inline void _delete(host_module_t) { }
inline void _delete(host_stream_t) { }
inline void _delete(host_buffer_t x) { if(x.data) delete[] x.data; }
inline void _delete(host_function_t) { }
//CUDA
inline void _delete(CUcontext x) { dispatch::cuCtxDestroy(x); }
inline void _delete(CUdeviceptr x) { dispatch::cuMemFree(x); }
inline void _delete(CUstream x) { dispatch::cuStreamDestroy(x); }
inline void _delete(CUdevice) { }
inline void _delete(CUevent x) { dispatch::cuEventDestroy(x); }
inline void _delete(CUfunction) { }
inline void _delete(CUmodule x) { dispatch::cuModuleUnload(x); }
inline void _delete(cu_event_t x) { _delete(x.first); _delete(x.second); }
inline void _delete(CUPlatform){}
//Constructor
template<class T>
handle<T>::handle(T cu, bool take_ownership): h_(new T(cu)), has_ownership_(take_ownership)
{ }
template<class T>
handle<T>::handle(): has_ownership_(false){ }
template<class T>
handle<T>::~handle(){
try{
if(has_ownership_ && h_ && h_.unique())
_delete(*h_);
}catch(const exception::cuda::base&){
// order of destruction for global variables
// is not guaranteed
}
}
template class handle<CUdeviceptr>;
template class handle<CUstream>;
template class handle<CUcontext>;
template class handle<CUdevice>;
template class handle<cu_event_t>;
template class handle<CUfunction>;
template class handle<CUmodule>;
template class handle<CUPlatform>;
template class handle<host_platform_t>;
template class handle<host_device_t>;
template class handle<host_context_t>;
template class handle<host_module_t>;
template class handle<host_stream_t>;
template class handle<host_buffer_t>;
template class handle<host_function_t>;
}
}


@@ -1,94 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string.h>
#include "triton/driver/kernel.h"
#include "triton/driver/buffer.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
kernel::kernel(driver::module *program, CUfunction fn, bool has_ownership):
polymorphic_resource(fn, has_ownership), program_(program){
}
kernel::kernel(driver::module *program, host_function_t fn, bool has_ownership):
polymorphic_resource(fn, has_ownership), program_(program){
}
kernel* kernel::create(driver::module* program, const char* name) {
switch(program->backend()){
case CUDA: return new cu_kernel(program, name);
case Host: return new host_kernel(program, name);
default: throw std::runtime_error("unknown backend");
}
}
driver::module* kernel::module() {
return program_;
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_kernel::host_kernel(driver::module* program, const char *name): kernel(program, host_function_t(), true) {
hst_->fn = program->hst()->functions.at(name);
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
cu_kernel::cu_kernel(driver::module *program, const char * name) : kernel(program, CUfunction(), true) {
dispatch::cuModuleGetFunction(&*cu_, *program->cu(), name);
dispatch::cuFuncSetCacheConfig(*cu_, CU_FUNC_CACHE_PREFER_SHARED);
// properties
int shared_total, shared_optin, shared_static;
int n_spills, n_reg;
CUdevice dev;
dispatch::cuCtxGetDevice(&dev);
dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, *cu_);
dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, *cu_);
dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, *cu_);
// std::cout << n_reg << std::endl;
if (shared_optin > 49152){
// std::cout << "dynamic shared memory " << shared_optin << " " << shared_static << std::endl;
dispatch::cuFuncSetAttribute(*cu_, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
}
}
}
}

lib/driver/llvm.cc Normal file

@@ -0,0 +1,324 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <fstream>
#include <unistd.h>
#include <memory>
#include <regex>
#include "triton/driver/llvm.h"
#include "triton/driver/dispatch.h"
#include "triton/driver/error.h"
#include "triton/tools/sha1.hpp"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
#include "triton/tools/sys/exec.hpp"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Transforms/Utils/Cloning.h"
// begin AMD stuff
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/Program.h"
#include "llvm/Support/ToolOutputFile.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
// end AMD stuff
namespace triton{
namespace driver{
void init_llvm() {
static bool init = false;
if(!init){
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
LLVMInitializeAMDGPUTargetInfo();
LLVMInitializeAMDGPUTarget();
LLVMInitializeAMDGPUTargetMC();
LLVMInitializeAMDGPUAsmPrinter();
init = true;
}
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
size_t start_replace = str.find(begin);
size_t end_replace = str.find(end, start_replace);
if(start_replace == std::string::npos)
return false;
str.replace(start_replace, end_replace + 1 - start_replace, target);
return true;
}
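Note: find_and_replace() rewrites everything from `begin` up to and including the next `end`. A worked example matching the PTX header rewriting done below:
// s becomes ".version 7.2\n.target sm_70\n"
std::string s = ".version 6.4\n.target sm_70\n";
find_and_replace(s, ".version", "\n", ".version 7.2\n");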
int vptx(int version){
if(version >= 11030) return 73;
if(version >= 11020) return 72;
if(version >= 11010) return 71;
if(version >= 11000) return 70;
if(version >= 10020) return 65;
if(version >= 10010) return 64;
if(version >= 10000) return 63;
throw std::runtime_error("Triton requires CUDA 10+");
}
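Note: vptx() maps a CUDA driver version to the newest PTX ISA that driver can ingest. For example, a driver reporting 11020 (CUDA 11.2) yields:
int ptx = vptx(11020);     // 72, i.e. PTX ISA 7.2
int ptx_major = ptx / 10;  // 7 -> ".version 7.2" in the emitted PTX
int ptx_minor = ptx % 10;  // 2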
std::string llir_to_ptx(llvm::Module* module, int cc, int version){
// LLVM version in use may not officially support target hardware
int max_nvvm_cc = 75;
int max_nvvm_ptx = 64;
// options
auto options = llvm::cl::getRegisteredOptions();
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
assert(short_ptr);
short_ptr->setValue(true);
// compute capability
std::string sm = "sm_" + std::to_string(cc);
// max PTX version
int ptx = vptx(version);
int ptx_major = ptx / 10;
int ptx_minor = ptx % 10;
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "nvptx64-nvidia-cuda";
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
std::string layout = "";
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
init_llvm();
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// emit
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
pass.run(*module);
// post-process
std::string result(buffer.begin(), buffer.end());
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
return result;
}
CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
// JIT compile source-code
try{
// use ptxas if present in PATH. Otherwise, use JIT from the driver
std::string ptxas = "ptxas";
std::string version;
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
// Use PTXAS via system call
if(use_system_ptxas){
// compile ptx with ptxas
char _fsrc[] = "/tmp/triton_k_XXXXXX";
char _flog[] = "/tmp/triton_l_XXXXXX";
mkstemp(_fsrc);
mkstemp(_flog);
std::string fsrc = _fsrc;
std::string flog = _flog;
std::ofstream ofs(fsrc);
ofs << ptx;
ofs.close();
std::string cmd;
int err;
cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
err = system(cmd.c_str());
CUmodule ret;
dispatch::cuModuleLoad(&ret, (fsrc + ".o").c_str());
unlink(_fsrc);
unlink(_flog);
return ret;
}
// Use PTXAS included in driver
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
CU_JIT_LOG_VERBOSE};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
CUmodule ret;
dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
return ret;
}
catch(exception::cuda::invalid_ptx const &){
std::cout << ptx << std::endl;
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
throw;
}
}
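Note: the two functions above form the whole NVIDIA path. A caller sketch, assuming an llvm::Module* `mod`, compute capability 70 and driver version 11020 (the kernel name is hypothetical):
std::string ptx = llir_to_ptx(mod, /*cc=*/70, /*version=*/11020);
CUmodule cumod = ptx_to_cumodule(ptx, /*cc=*/70);
CUfunction fn;
dispatch::cuModuleGetFunction(&fn, cumod, "my_kernel");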
/* ------------------------ */
// HIP //
/* ------------------------ */
std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) {
init_llvm();
// proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
// features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "amdgcn-amd-amdhsa";
std::string layout = "";
std::string features;
std::string proc = "gfx908";
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None,
llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// create dump files
std::string module_name = module->getModuleIdentifier();
std::error_code ec;
// Save GCN ISA binary.
std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o");
std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
if (ec)
{
std::cout << isabin_path << " was not created. error code: " << ec << std::endl;
}
// emit
machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile);
pass.run(*module);
// Save GCN ISA.
std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn");
std::string result(buffer.begin(), buffer.end());
std::ofstream amdgcn(amdgcn_path);
amdgcn << result;
amdgcn.close();
// generate HSACO file
std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco");
std::string error_message;
int lld_result =
llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
{"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path},
llvm::None, {}, 0, 0, &error_message);
if (lld_result)
{
std::cout << "ld.lld execute fail: " << std::endl;
std::cout << error_message << std::endl;
std::cout << lld_result << std::endl;
}
return hsaco_path;
}
hipModule_t amdgpu_to_hipmodule(const std::string& path) {
// Read HSACO.
std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
std::vector<unsigned char> hsaco(hsaco_file_size);
hsaco_file.seekg(0, std::ios::beg);
hsaco_file.read(reinterpret_cast<char*>(&hsaco[0]), hsaco_file_size);
hsaco_file.close();
hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer,
hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer,
hipJitOptionLogVerbose};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize,
(void*)_err, (void*)(uintptr_t)logbufsize,
(void*)_log, (void*)1};
hipModule_t ret;
dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
return ret;
}
}
}
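Note: the ROCm path is LLVM-IR -> GCN object -> ld.lld -> HSACO -> hipModule_t; the `_proc` argument of llir_to_amdgpu is currently ignored in favor of the hard-coded "gfx908". A caller sketch, assuming an llvm::Module* `mod` (the kernel name is hypothetical):
std::string hsaco_path = llir_to_amdgpu(mod, "gfx908");
hipModule_t hipmod = amdgpu_to_hipmodule(hsaco_path);
hipFunction_t fn;
dispatch::hipModuleGetFunction(&fn, hipmod, "my_kernel");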


@@ -1,375 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <fstream>
#include <unistd.h>
#include <memory>
#include <regex>
#include "triton/driver/module.h"
#include "triton/driver/context.h"
#include "triton/driver/error.h"
#include "triton/tools/sha1.hpp"
#include "triton/tools/sys/getenv.hpp"
#include "triton/tools/sys/mkdir.hpp"
#include "triton/tools/sys/exec.hpp"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/Transforms/Utils/Cloning.h"
std::string exec(const char* cmd) {
std::array<char, 128> buffer;
std::string result;
std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd, "r"), pclose);
if (!pipe) {
throw std::runtime_error("popen() failed!");
}
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
result += buffer.data();
}
return result;
}
void LLVMInitializeNVPTXTargetInfo();
void LLVMInitializeNVPTXTarget();
void LLVMInitializeNVPTXTargetMC();
void LLVMInitializeNVPTXAsmPrinter();
void LLVMInitializeNVPTXAsmParser();
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
void module::init_llvm() {
static bool init = false;
if(!init){
LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();
init = true;
}
}
module::module(CUmodule mod, bool has_ownership)
: polymorphic_resource(mod, has_ownership), spilled_(0) {
}
module::module(host_module_t mod, bool has_ownership)
: polymorphic_resource(mod, has_ownership), spilled_(0) {
}
module* module::create(driver::device* device, std::unique_ptr<llvm::Module> src) {
switch(device->backend()){
case CUDA: return new cu_module(device, std::move(src));
case Host: return new host_module(std::move(src));
default: throw std::runtime_error("unknown backend");
}
}
void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std::string& triple,
const std::string &proc, std::string layout,
llvm::SmallVectorImpl<char> &buffer,
const std::string& features,
file_type_t ft) {
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_module::host_module(std::unique_ptr<llvm::Module> src): module(host_module_t(), true) {
throw std::runtime_error("CPU unsupported");
// init_llvm();
// // create kernel wrapper
// llvm::LLVMContext &ctx = src->getContext();
// llvm::Type *void_ty = llvm::Type::getVoidTy(ctx);
// llvm::Type *args_ty = llvm::Type::getInt8PtrTy(ctx)->getPointerTo();
// llvm::Type *int32_ty = llvm::Type::getInt32Ty(ctx);
// std::vector<llvm::Type*> tys = {args_ty, int32_ty, int32_ty, int32_ty};
// llvm::FunctionType *main_ty = llvm::FunctionType::get(void_ty, tys, false);
// llvm::Function* main = llvm::Function::Create(main_ty, llvm::Function::ExternalLinkage, "_main", &*src);
// llvm::Function* fn = &*src->getFunctionList().begin();
// llvm::FunctionType *fn_ty = fn->getFunctionType();
// std::vector<llvm::Value*> fn_args(fn_ty->getNumParams());
// std::vector<llvm::Value*> ptrs(fn_args.size() - 3);
// llvm::BasicBlock* entry = llvm::BasicBlock::Create(ctx, "entry", main);
// llvm::IRBuilder<> ir_builder(ctx);
// ir_builder.SetInsertPoint(entry);
// auto get_size = [](llvm::Type* ty) { return ty->isPointerTy() ? sizeof(char*) : ty->getPrimitiveSizeInBits() / 8; };
// llvm::Value* base = main->arg_begin();
// llvm::Value* args_base = ir_builder.CreateBitCast(base, base->getType()->getPointerElementType());
// size_t offset = 0;
// for(unsigned i = 0; i < ptrs.size(); i++){
// ptrs[i] = ir_builder.CreateGEP(args_base, ir_builder.getInt32(offset));
// size_t nbytes = get_size(fn_ty->getParamType(i));
// offset += nbytes;
// if(i < ptrs.size() - 1){
// size_t np1bytes = get_size(fn_ty->getParamType(i+1));
// offset = (offset + np1bytes - 1) / np1bytes * np1bytes;
// }
// }
// for(unsigned i = 0; i < ptrs.size(); i++)
// ptrs[i] = ir_builder.CreateBitCast(ptrs[i], fn_ty->getParamType(i)->getPointerTo());
// for(unsigned i = 0; i < ptrs.size(); i++)
// fn_args[i] = ir_builder.CreateLoad(ptrs[i]);
// fn_args[fn_args.size() - 3] = main->arg_begin() + 1;
// fn_args[fn_args.size() - 2] = main->arg_begin() + 2;
// fn_args[fn_args.size() - 1] = main->arg_begin() + 3;
// ir_builder.CreateCall(fn, fn_args);
// ir_builder.CreateRetVoid();
//// llvm::legacy::PassManager pm;
//// pm.add(llvm::createPrintModulePass(llvm::outs()));
//// pm.add(llvm::createVerifierPass());
//// pm.run(*src);
//// create execution engine
// for(llvm::Function& fn: src->functions())
// hst_->functions[fn.getName().str()] = &fn;
//// llvm::orc::JITTargetMachineBuilder JTMB = *llvm::orc::JITTargetMachineBuilder::detectHost();
//// auto DL = JTMB.getDefaultDataLayoutForTarget();
//// auto CIRC = std::unique_ptr<llvm::orc::ConcurrentIRCompiler>(new llvm::orc::ConcurrentIRCompiler(JTMB));
//// hst_->ES = new llvm::orc::ExecutionSession();
//// hst_->ObjectLayer = new llvm::orc::RTDyldObjectLinkingLayer(*hst_->ES, []() { return std::unique_ptr<llvm::SectionMemoryManager>(new llvm::SectionMemoryManager()); });
//// hst_->CompileLayer = new llvm::orc::IRCompileLayer(*hst_->ES, *hst_->ObjectLayer, *CIRC);
//// hst_->DL = new llvm::DataLayout(std::move(*DL));
//// hst_->Mangle = new llvm::orc::MangleAndInterner(*hst_->ES, *hst_->DL);
//// hst_->Ctx = new llvm::orc::ThreadSafeContext(std::unique_ptr<llvm::LLVMContext>(new llvm::LLVMContext()));
//// hst_->MainJD = &hst_->ES->createJITDylib("<main>");
//// hst_->MainJD->setGenerator(llvm::cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
//// hst_->DL->getGlobalPrefix())));
//// llvm::cantFail(hst_->CompileLayer->add(*hst_->MainJD, llvm::orc::ThreadSafeModule(std::move(src), *hst_->Ctx)));
//// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->ES->lookup({hst_->MainJD}, (*hst_->Mangle)("_main"))->getAddress());
// llvm::EngineBuilder builder(std::move(src));
// builder.setErrorStr(&hst_->error);
// builder.setMCJITMemoryManager(std::make_unique<llvm::SectionMemoryManager>());
// builder.setOptLevel(llvm::CodeGenOpt::Aggressive);
// builder.setEngineKind(llvm::EngineKind::JIT);
// hst_->engine = builder.create();
// hst_->fn = (void(*)(char**, int32_t, int32_t, int32_t))(hst_->engine->getFunctionAddress("_main"));
}
std::unique_ptr<buffer> host_module::symbol(const char *name) const {
throw std::runtime_error("not implemented");
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){
size_t start_replace = str.find(begin);
size_t end_replace = str.find(end, start_replace);
if(start_replace == std::string::npos)
return false;
str.replace(start_replace, end_replace + 1 - start_replace, target);
return true;
}
//static std::map<int, int> vptx = {
// {10000, 63},
// {10010, 64},
// {10020, 65},
// {11000, 70},
// {11010, 71},
// {11020, 72},
// {11030, 73},
// {11040, 73}
//};
int vptx(int version){
if(version >= 11030) return 73;
if(version >= 11020) return 72;
if(version >= 11010) return 71;
if(version >= 11000) return 70;
if(version >= 10020) return 65;
if(version >= 10010) return 64;
if(version >= 10000) return 63;
throw std::runtime_error("Triton requires CUDA 10+");
}
std::string cu_module::compile_llvm_module(llvm::Module* module, driver::device* device) {
// LLVM version in use may not officially support target hardware
int max_nvvm_cc = 75;
int max_nvvm_ptx = 64;
// options
auto options = llvm::cl::getRegisteredOptions();
auto* short_ptr = static_cast<llvm::cl::opt<bool>*>(options["nvptx-short-ptr"]);
assert(short_ptr);
short_ptr->setValue(true);
// compute capability
int cc = ((driver::cu_device*)device)->compute_capability();
std::string sm = "sm_" + std::to_string(cc);
// driver version
int version;
dispatch::cuDriverGetVersion(&version);
int ptx = vptx(version);
int ptx_major = ptx / 10;
int ptx_minor = ptx % 10;
// create
llvm::SmallVector<char, 0> buffer;
std::string triple = "nvptx64-nvidia-cuda";
std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
std::string layout = "";
std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx));
init_llvm();
// verify and store llvm
llvm::legacy::PassManager pm;
pm.add(llvm::createVerifierPass());
pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
llvm::TargetOptions opt;
opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
opt.UnsafeFPMath = false;
opt.NoInfsFPMath = false;
opt.NoNaNsFPMath = true;
llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt,
llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive);
// set data layout
if(layout.empty())
module->setDataLayout(machine->createDataLayout());
else
module->setDataLayout(layout);
// emit machine code
for (llvm::Function &f : module->functions())
f.addFnAttr(llvm::Attribute::AlwaysInline);
llvm::legacy::PassManager pass;
llvm::raw_svector_ostream stream(buffer);
// emit
machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile);
pass.run(*module);
// post-process
std::string result(buffer.begin(), buffer.end());
find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n");
find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
while(find_and_replace(result, "\t// begin inline asm", "\n", ""));
while(find_and_replace(result, "\t// end inline asm", "\n", ""));
return result;
}
void cu_module::init_from_ptx(const std::string& ptx, driver::cu_device* device) {
// JIT compile source-code
try{
// use ptxas if present in PATH. Otherwise, use JIT from the driver
std::string ptxas = "ptxas";
std::string version;
int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
// Use PTXAS via system call
if(use_system_ptxas){
// compile ptx with ptxas
char _fsrc[] = "/tmp/triton_k_XXXXXX";
char _flog[] = "/tmp/triton_l_XXXXXX";
mkstemp(_fsrc);
mkstemp(_flog);
std::string fsrc = _fsrc;
std::string flog = _flog;
std::ofstream ofs(fsrc);
ofs << ptx;
ofs.close();
std::string cmd;
int err;
std::string cc = std::to_string(device->compute_capability());
cmd = ptxas + " -v --gpu-name=sm_" + cc + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
err = system(cmd.c_str());
dispatch::cuModuleLoad(&*cu_, (fsrc + ".o").c_str());
unlink(_fsrc);
unlink(_flog);
return;
}
// Use PTXAS included in driver
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
CU_JIT_LOG_VERBOSE};
unsigned int errbufsize = 8192;
unsigned int logbufsize = 8192;
char _err[errbufsize];
char _log[logbufsize];
void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
dispatch::cuModuleLoadDataEx(&*cu_, ptx_.data(), 5, opt, optval);
}
catch(exception::cuda::invalid_ptx const &){
//#ifdef TRITON_LOG_PTX_ERROR
std::cout << ptx << std::endl;
std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
// exit(1);
//#endif
throw;
}
}
cu_module::cu_module(driver::device* device, std::unique_ptr<llvm::Module> ll_module): module(CUmodule(), true) {
llvm::raw_string_ostream oss(llir_);
oss << *ll_module;
oss.flush();
ptx_ = compile_llvm_module(ll_module.get(), device);
init_from_ptx(ptx_, (driver::cu_device*)device);
}
cu_module::cu_module(driver::device* device, std::string const & source) : module(CUmodule(), true), ptx_(source){
init_from_ptx(ptx_, (driver::cu_device*)device);
}
std::unique_ptr<buffer> cu_module::symbol(const char *name) const{
CUdeviceptr handle;
size_t size;
dispatch::cuModuleGetGlobal_v2(&handle, &size, *cu_, name);
std::unique_ptr<buffer> res(new cu_buffer(size, handle, false));
return std::move(res);
}
}
}


@@ -1,68 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <string>
#include "triton/driver/platform.h"
#include "triton/driver/device.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// CUDA //
/* ------------------------ */
std::string cu_platform::version() const{
int version;
dispatch::cuDriverGetVersion(&version);
return std::to_string(version);
}
void cu_platform::devices(std::vector<device *> &devices) const{
int N;
dispatch::cuDeviceGetCount(&N);
for(int i = 0 ; i < N ; ++i){
CUdevice dvc;
dispatch::cuDeviceGet(&dvc, i);
devices.push_back(new driver::cu_device(dvc));
}
}
/* ------------------------ */
// Host //
/* ------------------------ */
std::string host_platform::version() const {
return "1.0";
}
void host_platform::devices(std::vector<driver::device*> &devices) const {
devices.push_back(new driver::host_device());
}
}
}


@@ -1,142 +0,0 @@
/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#include <cassert>
#include <unistd.h>
#include <array>
#include "triton/driver/backend.h"
#include "triton/driver/stream.h"
#include "triton/driver/context.h"
#include "triton/driver/device.h"
#include "triton/driver/kernel.h"
#include "triton/driver/buffer.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
namespace triton
{
namespace driver
{
/* ------------------------ */
// Base //
/* ------------------------ */
stream::stream(CUstream cu, bool has_ownership)
: polymorphic_resource(cu, has_ownership) {
}
stream::stream(host_stream_t cl, bool has_ownership)
: polymorphic_resource(cl, has_ownership) {
}
driver::stream* stream::create(backend_t backend) {
switch(backend){
case CUDA: return new cu_stream();
case Host: return new host_stream();
default: throw std::runtime_error("unknown backend");
}
}
/* ------------------------ */
// Host //
/* ------------------------ */
host_stream::host_stream(): stream(host_stream_t(), true) {
hst_->pool.reset(new ThreadPool(1));
hst_->futures.reset(new std::vector<std::future<void>>());
}
void host_stream::synchronize() {
for(auto& x: *hst_->futures)
x.wait();
hst_->futures->clear();
hst_->args.clear();
}
void host_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t) {
auto hst = kernel->module()->hst();
hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
char* params = new char[args_size];
std::memcpy((void*)params, (void*)args, args_size);
for(size_t i = 0; i < grid[0]; i++)
for(size_t j = 0; j < grid[1]; j++)
for(size_t k = 0; k < grid[2]; k++)
hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
}
void host_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
std::memcpy((void*)buffer->hst()->data, ptr, size);
}
void host_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
std::memcpy(ptr, (const void*)buffer->hst()->data, size);
}
/* ------------------------ */
// CUDA //
/* ------------------------ */
cu_stream::cu_stream(CUstream str, bool take_ownership):
stream(str, take_ownership) {
}
cu_stream::cu_stream(): stream(CUstream(), true) {
dispatch::cuStreamCreate(&*cu_, 0);
}
void cu_stream::synchronize() {
dispatch::cuStreamSynchronize(*cu_);
}
void cu_stream::enqueue(driver::kernel* kernel, std::array<size_t, 3> grid, std::array<size_t, 3> block, void* args, size_t args_size, size_t shared_mem) {
void *config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, args,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END
};
dispatch::cuLaunchKernel(*kernel->cu(), grid[0], grid[1], grid[2], block[0], block[1], block[2], shared_mem, *cu_, nullptr, config);
}
void cu_stream::write(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void const* ptr) {
if(blocking)
dispatch::cuMemcpyHtoD(*buffer->cu() + offset, ptr, size);
else
dispatch::cuMemcpyHtoDAsync(*buffer->cu() + offset, ptr, size, *cu_);
}
void cu_stream::read(driver::buffer* buffer, bool blocking, std::size_t offset, std::size_t size, void* ptr) {
if(blocking)
dispatch::cuMemcpyDtoH(ptr, *buffer->cu() + offset, size);
else
dispatch::cuMemcpyDtoHAsync(ptr, *buffer->cu() + offset, size, *cu_);
}
}
}


@@ -1,7 +1,7 @@
#include "triton/codegen/pass.h" #include "triton/codegen/pass.h"
#include "triton/driver/kernel.h" #include "triton/codegen/target.h"
#include "triton/driver/module.h" #include "triton/driver/error.h"
#include "triton/driver/stream.h" #include "triton/driver/llvm.h"
#include "triton/ir/builder.h" #include "triton/ir/builder.h"
#include "triton/ir/dispatch.h" #include "triton/ir/dispatch.h"
#include "triton/ir/enums.h" #include "triton/ir/enums.h"
@@ -15,7 +15,9 @@
#include <pybind11/stl.h>
#include <regex>
#include <string>
#include <sstream>
#include "llvm/IR/Module.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Verifier.h"
namespace py = pybind11;
namespace ir = triton::ir;
@@ -24,72 +26,213 @@ namespace drv = triton::driver;
/*****************************************************************************/
/* Python bindings for triton::driver */
/*****************************************************************************/
void init_triton_driver(py::module &&m) {
// base device
py::class_<drv::device>(m, "device");
// cuda device
py::class_<drv::cu_device, drv::device>(m, "cu_device")
.def(py::init([](int dev_id, bool take_ownership) {
CUdevice handle;
drv::dispatch::cuDeviceGet(&handle, dev_id);
return new drv::cu_device(handle, take_ownership);
}))
.def("max_shared_memory", [](drv::cu_device *self) {
return self->max_shared_memory();
})
.def("enable_peer_access", [](drv::cu_device *self, unsigned long long int peer_mem_ptr) {
self->enable_peer_access(peer_mem_ptr);
});
// host device
py::class_<drv::host_device, drv::device>(m, "host_device")
.def(py::init<>());
// base stream
py::class_<drv::stream>(m, "stream");
// host stream
py::class_<drv::host_stream, drv::stream>(m, "host_stream")
.def(py::init<>());
// cuda stream
py::class_<drv::cu_stream, drv::stream>(m, "cu_stream")
// py doesn't support opaque pointer (e.g., CUstream) so
// we assume it has been converted to uint64_t
.def(py::init([](uint64_t handle, bool take_ownership) {
return std::unique_ptr<drv::cu_stream>(new drv::cu_stream((CUstream)handle, take_ownership));
}))
.def("enqueue", [](drv::cu_stream *self, drv::kernel *kernel,
size_t grid_0, size_t grid_1, size_t grid_2,
size_t block_0, size_t block_1, size_t block_2,
const std::string &args,
size_t shared_mem) {
return self->enqueue(kernel, {grid_0, grid_1, grid_2}, {block_0, block_1, block_2},
(void *)args.data(), args.size(), shared_mem);
});
py::class_<drv::module>(m, "module");
py::class_<drv::cu_module, drv::module>(m, "cu_module")
.def("ptx", &drv::cu_module::ptx)
.def("cubin", [](drv::cu_module *self) { return py::bytes(self->cubin()); })
.def("llir", &drv::cu_module::llir);
py::class_<drv::kernel>(m, "kernel");
}
// information query
template<CUdevice_attribute attr>
int cuGetInfo(CUdevice device) {
int res;
drv::dispatch::cuDeviceGetAttribute(&res, attr, device);
return res;
}
template<hipDeviceAttribute_t attr>
int hipGetInfo(hipDevice_t device) {
int res;
drv::dispatch::hipDeviceGetAttribute(&res, attr, device);
return res;
}
enum backend_t {
HOST,
CUDA,
ROCM,
};
void cu_enable_peer_access(uint64_t peer_ptr){
CUcontext context;
drv::dispatch::cuPointerGetAttribute(&context, CU_POINTER_ATTRIBUTE_CONTEXT, peer_ptr);
try {
drv::dispatch::cuCtxEnablePeerAccess(context, 0);
} catch (drv::exception::cuda::peer_access_already_enabled) {}
}
void host_enqueue(uint64_t stream, uint64_t kernel,
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
uint64_t block_0, uint64_t block_1, uint64_t block_2,
void* args_ptr, size_t args_size, int64_t shared_mem){
throw std::runtime_error("unsupported");
// auto hst = kernel->module()->hst();
// hst_->futures->reserve(hst_->futures->size() + grid[0]*grid[1]*grid[2]);
// char* params = new char[args_size];
// std::memcpy((void*)params, (void*)args, args_size);
// for(size_t i = 0; i < grid[0]; i++)
// for(size_t j = 0; j < grid[1]; j++)
// for(size_t k = 0; k < grid[2]; k++)
// hst_->futures->emplace_back(hst_->pool->enqueue(hst->fn, (char**)params, int32_t(i), int32_t(j), int32_t(k)));
}
void cu_enqueue(uint64_t stream, uint64_t kernel,
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
uint64_t block_0, uint64_t block_1, uint64_t block_2,
void* args_ptr, size_t args_size, int64_t shared_mem){
void *config[] = {
CU_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
CU_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
CU_LAUNCH_PARAM_END
};
drv::dispatch::cuLaunchKernel((CUfunction)kernel, grid_0, grid_1, grid_2,
block_0, block_1, block_2,
shared_mem, (CUstream)stream, nullptr, config);
}
void hip_enqueue(uint64_t stream, uint64_t kernel,
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
uint64_t block_0, uint64_t block_1, uint64_t block_2,
void* args_ptr, size_t args_size, int64_t shared_mem) {
void *config[] = {
HIP_LAUNCH_PARAM_BUFFER_POINTER, (void*)args_ptr,
HIP_LAUNCH_PARAM_BUFFER_SIZE, &args_size,
HIP_LAUNCH_PARAM_END
};
drv::dispatch::hipModuleLaunchKernel((hipFunction_t)kernel, grid_0, grid_1, grid_2,
block_0, block_1, block_2,
shared_mem, (hipStream_t)stream, nullptr, config);
}
void init_triton_runtime(py::module &&m) {
// wrap backend_t
py::enum_<backend_t>(m, "backend")
.value("HOST", HOST)
.value("CUDA", CUDA)
.value("ROCM", ROCM)
.export_values();
// enable peer-to-peer
m.def("enable_peer_access", [](backend_t backend, uint64_t peer_ptr) {
if (backend != CUDA)
throw std::runtime_error("P2P only supported on CUDA devices!");
cu_enable_peer_access(peer_ptr);
}
);
// query maximum shared memory
m.def("max_shared_memory", [](backend_t backend, uint64_t device) {
if (backend == HOST)
return 0;
if(backend == CUDA)
return cuGetInfo<CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN>(device);
if(backend == ROCM)
return hipGetInfo<hipDeviceAttributeMaxSharedMemoryPerBlock>(device);
return -1;
});
// enqueue
m.def("enqueue", [](backend_t backend, uint64_t stream, uint64_t kernel,
uint64_t grid_0, uint64_t grid_1, uint64_t grid_2,
uint64_t block_0, uint64_t block_1, uint64_t block_2,
const std::string &args, int64_t shared_mem){
void* args_ptr = (void*)args.data();
size_t args_size = args.size();
if(backend == HOST)
host_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
if(backend == CUDA)
cu_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
if(backend == ROCM)
hip_enqueue(stream, kernel, grid_0, grid_1, grid_2, block_0, block_1, block_2, args_ptr, args_size, shared_mem);
});
}
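Note: all three enqueue paths receive the kernel arguments as one flat byte string and hand it to the driver via the BUFFER_POINTER/BUFFER_SIZE extra-parameter mechanism, so no per-argument pointer array is needed. A sketch of how such a buffer might be packed on the C++ side — the real packing happens in Python in code_gen.py, and its argument alignment may differ:
#include <cstdint>
#include <cstring>
#include <string>
// Hypothetical: pack (device pointer, int32 scalar) for cu_enqueue/hip_enqueue.
std::string pack_args(uint64_t dev_ptr, int32_t n) {
  std::string buf(sizeof(dev_ptr) + sizeof(n), '\0');
  std::memcpy(&buf[0], &dev_ptr, sizeof(dev_ptr));    // pointer argument
  std::memcpy(&buf[sizeof(dev_ptr)], &n, sizeof(n));  // scalar argument
  return buf;  // consumed via args.data()/args.size() above
}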
/*****************************************************************************/
/* Python bindings for triton::codegen */
/*****************************************************************************/
typedef std::map<std::string, std::string> asm_map_t;
std::tuple<uint64_t, uint64_t> cu_compile_llir(const std::string& name, size_t n_shared_bytes, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map, int cc, int version){
// LLVM-IR -> PTX
std::string ptx = drv::llir_to_ptx(llvm, cc, version);
asm_map["ptx"] = ptx;
// PTX -> Binary
CUmodule mod = drv::ptx_to_cumodule(ptx, cc);
// Handle to the kernel
CUfunction fun;
drv::dispatch::cuModuleGetFunction(&fun, mod, name.c_str());
// Dynamic shared memory
int shared_optin;
drv::dispatch::cuDeviceGetAttribute(&shared_optin, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN, dev);
if(n_shared_bytes > 49152 && shared_optin > 49152){
drv::dispatch::cuFuncSetCacheConfig(fun, CU_FUNC_CACHE_PREFER_SHARED);
int shared_total, shared_static;
int n_spills, n_reg;
drv::dispatch::cuDeviceGetAttribute(&shared_total, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR, dev);
drv::dispatch::cuFuncGetAttribute(&shared_static, CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES, fun);
drv::dispatch::cuFuncGetAttribute(&n_spills, CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES, fun);
drv::dispatch::cuFuncGetAttribute(&n_reg, CU_FUNC_ATTRIBUTE_NUM_REGS, fun);
drv::dispatch::cuFuncSetAttribute(fun, CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, shared_optin - shared_static);
}
// record asm
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
}
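Note: 49152 bytes (48 KB) is the default per-block shared-memory limit, so only kernels that exceed it opt in to CU_FUNC_CACHE_PREFER_SHARED and the dynamic-shared-memory attribute — the "only use PREFER_SHARED above 49k" behavior from the commit message, which avoids the L1 penalty for small kernels. A worked example, assuming a device that reports 96 KB of opt-in shared memory:
int shared_optin  = 98304;  // CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN (assumed)
int shared_static = 4096;   // CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES of the kernel (assumed)
int max_dynamic   = shared_optin - shared_static;  // 94208 bytes may be requested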
std::tuple<uint64_t, uint64_t> hip_compile_llir(const std::string& name, llvm::Module* llvm, uint64_t dev, asm_map_t& asm_map){
// LLVM-IR -> HSA-CO
std::string path = drv::llir_to_amdgpu(llvm, "gfx908");
// HSA-CO -> hipModule
hipModule_t mod = drv::amdgpu_to_hipmodule(path);
// Handle to the kernel
hipFunction_t fun;
drv::dispatch::hipModuleGetFunction(&fun, mod, name.c_str());
// record asm
return std::make_tuple((uint64_t)mod, (uint64_t)fun);
}
void init_triton_codegen(py::module &&m) {
m.def(
"add_passes_to_emit_bin", [](ir::module &ir, drv::device *dev, int num_warps, int num_stages, bool force_nc_cache) {
drv::module *mod;
drv::kernel *ker;
size_t shared_mem;
triton::codegen::add_passes_to_emit_bin(ir, dev, num_warps, num_stages, force_nc_cache, mod, ker, shared_mem);
std::stringstream ss;
ir::print(ir, ss);
return std::make_tuple(mod, ker, shared_mem, ss.str());
"compile_ttir", [](backend_t backend, ir::module &ir, uint64_t device, int num_warps, int num_stages, bool force_nc_cache) {
std::string name = ir.get_function_list()[0]->get_name();
// record asm as we generate
asm_map_t asm_map;
std::ostringstream ttir;
ir::print(ir, ttir);
asm_map["ttir"] = ttir.str();
llvm::LLVMContext ctx;
if(backend == CUDA){
// device properties
CUdevice dev = (CUdevice)device;
size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
size_t cc = major*10 + minor;
int version;
drv::dispatch::cuDriverGetVersion(&version);
// Triton-IR -> NVPTX LLVM-IR
triton::codegen::nvidia_cu_target target(cc);
int n_shared_bytes;
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, cc, num_warps, num_stages, force_nc_cache, n_shared_bytes);
llvm::raw_string_ostream llir(asm_map["llir"]);
llir << *llvm;
llir.flush();
// LLVM-IR -> Bin
uint64_t mod, fun;
std::tie(mod, fun) = cu_compile_llir(name, n_shared_bytes, &*llvm, device, asm_map, cc, version);
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
}
if(backend == ROCM){
// Triton-IR -> AMDGPU LLVM-IR
triton::codegen::amd_cl_target target;
int n_shared_bytes;
auto llvm = triton::codegen::add_passes_to_emit_bin(ir, ctx, &target, 70, num_warps, num_stages, force_nc_cache, n_shared_bytes);
llvm::raw_string_ostream llir(asm_map["llir"]);
llir << *llvm;
llir.flush();
// LLVM-IR -> Bin
uint64_t mod, fun;
std::tie(mod, fun) = hip_compile_llir(name, &*llvm, device, asm_map);
return std::make_tuple(mod, fun, asm_map, n_shared_bytes);
}
},
py::return_value_policy::take_ownership);
}
@@ -302,7 +445,7 @@ void init_triton_ir(py::module &&m) {
void init_triton(py::module &m) {
py::module subm = m.def_submodule("triton");
init_triton_codegen(std::move(subm.def_submodule("code_gen")));
init_triton_driver(std::move(subm.def_submodule("driver")));
init_triton_runtime(std::move(subm.def_submodule("runtime")));
init_triton_ir(std::move(subm.def_submodule("ir")));
init_triton_frontend(std::move(subm.def_submodule("frontend")));
}


@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
return kernel
# generic test functions
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
SIZE = 128
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
# compare
triton.testing.assert_almost_equal(z_tri, z_ref)
# parse ptx to make sure ld/st are vectorized
ptx = pgm.asm('ptx')
ptx = pgm.asm['ptx']
assert 'ld.global.v4' in ptx
assert 'st.global.v4' in ptx
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
z_ref += z[0,:][None, :]
z_ref = z_ref.to(torch.float16)
# compare
ptx = pgm.asm('ptx')
ptx = pgm.asm['ptx']
# print(ptx)
triton.testing.assert_almost_equal(z_tri, z_ref)
# make sure ld/st are vectorized
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
# ---------------
# test while
# ---------------
# ---------------
# test noop
#----------------
def test_noop(device='cuda'):
@triton.jit
def kernel(**meta):
pass
x = triton.testing.random((1,), dtype=torch.int32, device=device)
kernel[(1, )](x)


@@ -411,9 +411,9 @@ class CodeGenerator(ast.NodeVisitor):
class Binary:
def __init__(self, module, kernel, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm):
def __init__(self, backend, module, kernel, asm, num_warps, num_stages, force_nc_cache, shared_mem):
# cache ir asm
self.ir_asm = ir_asm
self.asm = asm
self.module = module
self.kernel = kernel
self.shared_mem = shared_mem
@@ -421,29 +421,13 @@ class Binary:
self.num_stages = num_stages
self.force_nc_cache = force_nc_cache
self.sass = None
self.backend = backend
def asm(self, mode):
if mode == 'ttir':
return self.ir_asm
if mode == 'ptx':
return self.module.ptx()
if mode == 'sass':
if self.sass is None:
cubin = self.module.cubin()
# get a temporary file name
fd, path = tempfile.mkstemp(suffix='.cubin')
f = open(path, 'wb')
f.write(cubin)
f.close()
# extract SASS from cubin
self.sass = extract(path, None)
return self.sass
if mode == 'llir':
return self.module.llir()
raise ValueError('Unsupported mode ' + mode)
def __call__(self, stream, args, grid_0, grid_1=1, grid_2=1):
stream.enqueue(self.kernel, grid_0, grid_1, grid_2, self.num_warps * 32, 1, 1, args, self.shared_mem)
_triton.runtime.enqueue(self.backend, stream, self.kernel,
grid_0, grid_1, grid_2,
self.num_warps * 32, 1, 1,
args, self.shared_mem)
class CompilationError(Exception): class CompilationError(Exception):
@@ -548,10 +532,15 @@ class Kernel:
                 raise e
             raise CompilationError(self.fn.src, node, e)
         # Compile to machine code
-        mod, ker, shared_mem, ir_asm = _triton.code_gen.add_passes_to_emit_bin(generator.module, device, num_warps, num_stages, force_nc_cache)
-        if shared_mem > device.max_shared_memory():
-            raise OutOfResources(shared_mem, device.max_shared_memory(), "shared memory")
-        return Binary(mod, ker, num_warps, num_stages, force_nc_cache, shared_mem, ir_asm)
+        if torch.version.hip is None:
+            backend = _triton.runtime.backend.CUDA
+        else:
+            backend = _triton.runtime.backend.ROCM
+        mod, ker, asm, shared_mem = _triton.code_gen.compile_ttir(backend, generator.module, device, num_warps, num_stages, force_nc_cache)
+        max_shared_memory = _triton.runtime.max_shared_memory(backend, device)
+        if shared_mem > max_shared_memory:
+            raise OutOfResources(shared_mem, max_shared_memory, "shared memory")
+        return Binary(backend, mod, ker, asm, num_warps, num_stages, force_nc_cache, shared_mem)

     def __call__(self, *wargs, grid, num_warps=4, num_stages=2, force_nc_cache=False, **meta):
         # device inference
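
Backend selection now happens at run time by probing the PyTorch build rather than at Triton compile time: `torch.version.hip` is `None` on CUDA builds of PyTorch and holds the HIP version string on ROCm builds. A standalone illustration:

    import torch

    # torch.version.hip is None on CUDA builds of PyTorch and a version
    # string on ROCm builds, so it doubles as a cheap backend probe.
    if torch.version.hip is None:
        print("CUDA backend")
    else:
        print("ROCm backend, HIP", torch.version.hip)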
@@ -571,19 +560,20 @@ class Kernel:
                              " Only CUDA is supported at the moment")
         device = torch.device('cuda', torch.cuda.current_device())
-        tt_device = _triton.driver.cu_device(device.index, False)
-        if len(set(device_ids)) != 1 or device_ids[0] != device.index:
+        device_ty = device.type
+        device_idx = device.index
+        if len(set(device_ids)) != 1 or device_ids[0] != device_idx:
             # try to enable P2P communication
             for arg_idx, dst_idx in zip(tensor_idxs, device_ids):
-                if dst_idx != device.index:
+                if dst_idx != device_idx:
                     try:
-                        tt_device.enable_peer_access(wargs[arg_idx].data_ptr())
+                        _triton.runtime.enable_peer_access(self.backend, wargs[arg_idx].data_ptr())
                     except RuntimeError as e:
                         raise RuntimeError("Cannot enable P2P access from device {} to device {}: {}"
-                                           .format(device.index, dst_idx, str(e)))
+                                           .format(device_idx, dst_idx, str(e)))
         # enqueue kernel on the current device
-        torch.cuda.set_device(device.index)
+        torch.cuda.set_device(device_idx)
         # attributes
         args = [arg.data_ptr() if i in tensor_idxs else arg for i, arg in enumerate(wargs)]
         attributes = {i: Kernel.pow2_divisor(a) for i, a in enumerate(args) if isinstance(a, int)}
@@ -594,12 +584,12 @@ class Kernel:
         attr_key = frozenset(attributes.items())
         meta_key = frozenset(meta.items())
         const_key = frozenset(constants.items())
-        key = (device.type, device.index, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
+        key = (device_ty, device_idx, types_key, attr_key, num_warps, num_stages, meta_key, const_key)
         cache = self.fn.cache
         if key not in cache:
             # compile and cache configuration if necessary
             cache[key] = self._compile(
-                *wargs, device=tt_device, attributes=attributes,
+                *wargs, device=device_idx, attributes=attributes,
                 num_warps=num_warps, num_stages=num_stages, force_nc_cache=force_nc_cache,
                 constants=constants, **meta
             )
@@ -608,8 +598,7 @@ class Kernel:
         params = struct.pack(fmt, *args)
         # enqueue cached function into stream
         binary = cache[key]
-        cu_stream = torch.cuda.current_stream(device.index).cuda_stream
-        stream = _triton.driver.cu_stream(cu_stream, False)
+        stream = torch.cuda.current_stream(device_idx).cuda_stream
         grid = grid(meta) if hasattr(grid, '__call__') else grid
         binary(stream, params, *grid)
         return binary
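
Streams likewise no longer need a `_triton.driver.cu_stream` wrapper: the raw integer handle that PyTorch already exposes is passed straight to the runtime, which interprets it per backend. For illustration:

    import torch

    # .cuda_stream exposes the underlying driver stream handle as an int;
    # this is exactly what Binary.__call__ now receives.
    stream = torch.cuda.current_stream().cuda_stream
    print(hex(stream))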

View File

@@ -64,7 +64,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
     #  - each torch.tensor object is implicitly converted into a pointer to its first element.
     #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
     #  - don't forget to pass meta-parameters as keyword arguments
-    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
     # We return a handle to output but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
     # running asynchronously at this point.
     return output
@@ -85,6 +85,7 @@ print(
     f'The maximum difference between torch and triton is '
     f'{torch.max(torch.abs(output_torch - output_triton))}'
 )
+exit()

 # %%
 # Seems like we're good to go!