[RUNTIME] Major code cleanup (#711)

This PR does the following: - CUDA utilities (e.g., cuGetInfo) are no longer compiled as part of libtriton.so. - driver/llvm.cc is refactored and split between PTX codegen and Python. - By extension, this also deprecates include/external, so Triton no longer has to carry a copy of some CUDA/HIP headers. - `triton-translate` becomes a `triton.tools.aot` Python utility that re-uses functions from the triton.compile sub-module.
2022-09-26 16:38:06 -07:00
parent 8bb09f83ee
commit 1e91ed30d0
28 changed files with 509 additions and 31483 deletions
--- a/include/triton/Target/PTX/PTXTranslation.h
+++ b/include/triton/Target/PTX/PTXTranslation.h
@@ -1,34 +1,17 @@
 #ifndef TRITON_TARGET_PTXTRANSLATION_H
 #define TRITON_TARGET_PTXTRANSLATION_H

-#include "triton/driver/dispatch.h"
-
+#include <memory>
 #include <string>

-namespace mlir {
-
-class ModuleOp;
-
-} // namespace mlir
+namespace llvm {
+class Module;
+} // namespace llvm

 namespace triton {

-template <CUdevice_attribute attr> int cuGetInfo(CUdevice device) {
-  int res;
-  driver::dispatch::cuDeviceGetAttribute(&res, attr, device);
-  return res;
-}
-
-void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version,
-                                 std::string *ptxasPath);
-
 // Translate TritonGPU IR to PTX code.
-std::tuple<std::string, // ptx code
-           size_t,      // PTX cc
-           int,         // PTX version
-           std::string  // ptxas path
-           >
-translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device);
+std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version);

 } // namespace triton

--- a/include/triton/driver/dispatch.h
+++ b/include/triton/driver/dispatch.h
@@ -1,376 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_DRIVER_DISPATCH_H_
-#define _TRITON_DRIVER_DISPATCH_H_
-
-#include <dlfcn.h>
-#include <type_traits>
-
-// CUDA Backend
-#include "triton/external/CUDA/cuda.h"
-#include "triton/external/CUDA/nvml.h"
-
-//// HIP backend
-//#define __HIP_PLATFORM_AMD__
-#include "triton/external/hip.h"
-
-// Exceptions
-#include <iostream>
-#include <stdexcept>
-
-namespace llvm {
-class PassRegistry;
-class Module;
-} // namespace llvm
-
-namespace triton {
-namespace driver {
-
-class cu_context;
-
-template <class T> void check(T) {}
-void check(CUresult err);
-void check(hipError_t err);
-
-class dispatch {
-protected:
-  template <class F> struct return_type;
-
-  template <class R, class... A> struct return_type<R (*)(A...)> {
-    typedef R type;
-  };
-
-  typedef bool (*f_init_t)();
-
-  template <f_init_t initializer, typename FunPtrT, typename... Args>
-  static typename return_type<FunPtrT>::type
-  f_impl(void *&lib_h, FunPtrT, void *&cache, const char *name, Args... args) {
-    initializer();
-    if (cache == nullptr) {
-      cache = dlsym(lib_h, name);
-      if (cache == 0) {
-#ifdef __EXCEPTIONS
-        throw std::runtime_error("dlsym unable to load function");
-#else
-        std::cerr << "Triton: dlsym unable to load function `" << name << "`"
-                  << std::endl;
-        std::abort();
-#endif
-      }
-    }
-    FunPtrT fptr;
-    *reinterpret_cast<void **>(&fptr) = cache;
-    typename return_type<FunPtrT>::type res = (*fptr)(args...);
-    check(res);
-    return res;
-  }
-
-public:
-  static void release();
-  // Nvidia
-  static bool nvmlinit();
-  static bool cuinit();
-  // AMD
-  static bool hipinit();
-
-  /* ------------------- *
-   * CUDA
-   * ------------------- */
-  // context management
-  static CUresult cuInit(unsigned int Flags);
-  static CUresult cuCtxDestroy_v2(CUcontext ctx);
-  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags,
-                                 CUdevice dev);
-  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
-  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
-  static CUresult cuCtxGetDevice(CUdevice *result);
-  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext,
-                                        unsigned int flags);
-  static CUresult cuDriverGetVersion(int *driverVersion);
-  // device management
-  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
-  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
-  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
-  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
-                                       CUdevice dev);
-  static CUresult cuDeviceGetCount(int *count);
-  // link management
-  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type,
-                                   void *data, size_t size, const char *name,
-                                   unsigned int numOptions,
-                                   CUjit_option *options, void **optionValues);
-  static CUresult cuLinkCreate_v2(unsigned int numOptions,
-                                  CUjit_option *options, void **optionValues,
-                                  CUlinkState *stateOut);
-  static CUresult cuLinkComplete(CUlinkState state, void **cubinOut,
-                                 size_t *sizeOut);
-  static CUresult cuLinkDestroy(CUlinkState state);
-  // module management
-  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t *bytes,
-                                       CUmodule hmod, const char *name);
-  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
-  static CUresult cuModuleLoadData(CUmodule *module, const void *image);
-  static CUresult cuModuleUnload(CUmodule hmod);
-  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
-                                     unsigned int numOptions,
-                                     CUjit_option *options,
-                                     void **optionValues);
-  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
-                                      const char *name);
-  // stream management
-  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
-  static CUresult cuStreamSynchronize(CUstream hStream);
-  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
-  static CUresult cuStreamDestroy_v2(CUstream hStream);
-  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
-                                 unsigned int gridDimY, unsigned int gridDimZ,
-                                 unsigned int blockDimX, unsigned int blockDimY,
-                                 unsigned int blockDimZ,
-                                 unsigned int sharedMemBytes, CUstream hStream,
-                                 void **kernelParams, void **extra);
-  // function management
-  static CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
-                                     CUfunction hfunc);
-  static CUresult cuFuncSetAttribute(CUfunction hfunc,
-                                     CUfunction_attribute attrib, int value);
-  static CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
-  // memory management
-  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
-  static CUresult cuPointerGetAttribute(void *data,
-                                        CUpointer_attribute attribute,
-                                        CUdeviceptr ptr);
-  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N,
-                                  CUstream stream);
-  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice,
-                                  size_t ByteCount);
-  static CUresult cuMemFree_v2(CUdeviceptr dptr);
-  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice,
-                                       size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice,
-                                       const void *srcHost, size_t ByteCount,
-                                       CUstream hStream);
-  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost,
-                                  size_t ByteCount);
-  // event management
-  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
-  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
-                                     CUevent hEnd);
-  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
-  static CUresult cuEventDestroy_v2(CUevent hEvent);
-
-  /* ------------------- *
-   * NVML
-   * ------------------- */
-  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId,
-                                                       nvmlDevice_t *device);
-  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device,
-                                             nvmlClockType_t type,
-                                             unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device,
-                                                nvmlClockType_t type,
-                                                unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device,
-                                                      unsigned int mem_clock,
-                                                      unsigned int sm_clock);
-
-  /* ------------------- *
-   * HIP
-   * ------------------- */
-  // context management
-  static hipError_t hipInit(unsigned int Flags);
-  static hipError_t hipCtxDestroy(hipCtx_t ctx);
-  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags,
-                                 hipDevice_t dev);
-  static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
-  static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
-  static hipError_t hipCtxGetDevice(hipDevice_t *result);
-  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext,
-                                           unsigned int flags);
-  static hipError_t hipDriverGetVersion(int *driverVersion);
-  // device management
-  static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
-  static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib,
-                                          hipDevice_t dev);
-  static hipError_t hipGetDeviceCount(int *count);
-  // module management
-  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
-                                       hipModule_t hmod, const char *name);
-  static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
-  static hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
-  static hipError_t hipModuleUnload(hipModule_t hmod);
-  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image,
-                                        unsigned int numOptions,
-                                        hipJitOption *options,
-                                        void **optionValues);
-  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
-                                         const char *name);
-  // stream management
-  static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
-  static hipError_t hipStreamSynchronize(hipStream_t hStream);
-  static hipError_t hipStreamDestroy(hipStream_t hStream);
-  static hipError_t
-  hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
-                        unsigned int gridDimY, unsigned int gridDimZ,
-                        unsigned int blockDimX, unsigned int blockDimY,
-                        unsigned int blockDimZ, unsigned int sharedMemBytes,
-                        hipStream_t hStream, void **kernelParams, void **extra);
-  // function management
-  static hipError_t hipFuncGetAttributes(hipFuncAttributes *attrib,
-                                         void *hfunc);
-  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc,
-                                        hipFuncAttribute attrib, int value);
-  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc,
-                                          hipFuncCache_t config);
-  // memory management
-  static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
-  static hipError_t hipPointerGetAttribute(void *data,
-                                           CUpointer_attribute attribute,
-                                           hipDeviceptr_t ptr);
-  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x,
-                                     size_t N, hipStream_t stream);
-  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice,
-                                  size_t ByteCount);
-  static hipError_t hipFree(hipDeviceptr_t dptr);
-  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice,
-                                       size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
-                                       const void *srcHost, size_t ByteCount,
-                                       hipStream_t hStream);
-  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost,
-                                  size_t ByteCount);
-  // event management
-  static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
-  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart,
-                                        hipEvent_t hEnd);
-  static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
-  static hipError_t hipEventDestroy(hipEvent_t hEvent);
-
-private:
-  // Libraries
-  static void *cuda_;
-  static void *nvml_;
-  static void *hip_;
-
-  /* ------------------- *
-   * CUDA
-   * ------------------- */
-  // context management
-  static void *cuCtxGetCurrent_;
-  static void *cuCtxSetCurrent_;
-  static void *cuCtxDestroy_v2_;
-  static void *cuCtxCreate_v2_;
-  static void *cuCtxGetDevice_;
-  static void *cuCtxPushCurrent_v2_;
-  static void *cuCtxPopCurrent_v2_;
-  static void *cuCtxEnablePeerAccess_;
-  static void *cuDriverGetVersion_;
-  static void *cuInit_;
-  // device management
-  static void *cuDeviceGet_;
-  static void *cuDeviceGetName_;
-  static void *cuDeviceGetPCIBusId_;
-  static void *cuDeviceGetAttribute_;
-  static void *cuDeviceGetCount_;
-  // link management
-  static void *cuLinkAddData_v2_;
-  static void *cuLinkCreate_v2_;
-  static void *cuLinkDestroy_;
-  static void *cuLinkComplete_;
-  // module management
-  static void *cuModuleGetGlobal_v2_;
-  static void *cuModuleLoad_;
-  static void *cuModuleUnload_;
-  static void *cuModuleLoadDataEx_;
-  static void *cuModuleLoadData_;
-  static void *cuModuleGetFunction_;
-  // stream management
-  static void *cuStreamCreate_;
-  static void *cuStreamSynchronize_;
-  static void *cuStreamDestroy_v2_;
-  static void *cuStreamGetCtx_;
-  static void *cuLaunchKernel_;
-  // function management
-  static void *cuFuncGetAttribute_;
-  static void *cuFuncSetAttribute_;
-  static void *cuFuncSetCacheConfig_;
-  // memory management
-  static void *cuMemcpyDtoH_v2_;
-  static void *cuMemFree_v2_;
-  static void *cuMemcpyDtoHAsync_v2_;
-  static void *cuMemcpyHtoDAsync_v2_;
-  static void *cuMemcpyHtoD_v2_;
-  static void *cuMemAlloc_v2_;
-  static void *cuMemsetD8Async_;
-  static void *cuPointerGetAttribute_;
-  // event management
-  static void *cuEventCreate_;
-  static void *cuEventElapsedTime_;
-  static void *cuEventRecord_;
-  static void *cuEventDestroy_v2_;
-
-  /* ------------------- *
-   * NVML
-   * ------------------- */
-  static void *nvmlInit_v2_;
-  static void *nvmlDeviceGetHandleByPciBusId_v2_;
-  static void *nvmlDeviceGetClockInfo_;
-  static void *nvmlDeviceGetMaxClockInfo_;
-  static void *nvmlDeviceSetApplicationsClocks_;
-
-  /* ------------------- *
-   * HIP
-   * ------------------- */
-  // context management
-  static void *hipInit_;
-  static void *hipCtxDestroy_;
-  static void *hipCtxCreate_;
-  static void *hipCtxPushCurrent_;
-  static void *hipCtxPopCurrent_;
-  static void *hipCtxGetDevice_;
-  static void *hipCtxEnablePeerAccess_;
-  static void *hipDriverGetVersion_;
-  // device management
-  static void *hipGetDevice_;
-  static void *hipDeviceGetName_;
-  static void *hipDeviceGetPCIBusId_;
-  static void *hipDeviceGetAttribute_;
-  static void *hipGetDeviceCount_;
-  // module management
-  static void *hipModuleGetGlobal_;
-  static void *hipModuleLoad_;
-  static void *hipModuleLoadData_;
-  static void *hipModuleUnload_;
-  static void *hipModuleLoadDataEx_;
-  static void *hipModuleGetFunction_;
-  // stream management
-  static void *hipStreamCreate_;
-  static void *hipStreamSynchronize_;
-  static void *hipStreamDestroy_;
-  static void *hipModuleLaunchKernel_;
-  ;
-  // function management
-  static void *hipFuncGetAttributes_;
-  static void *hipFuncSetAttribute_;
-  static void *hipFuncSetCacheConfig_;
-  // memory management
-  static void *hipMalloc_;
-  static void *hipPointerGetAttribute_;
-  static void *hipMemsetD8Async_;
-  static void *hipMemcpyDtoH_;
-  static void *hipFree_;
-  static void *hipMemcpyDtoHAsync_;
-  static void *hipMemcpyHtoDAsync_;
-  static void *hipMemcpyHtoD_;
-  // event management
-  static void *hipEventCreate_;
-  static void *hipEventElapsedTime_;
-  static void *hipEventRecord_;
-  static void *hipEventDestroy_;
-};
-
-} // namespace driver
-} // namespace triton
-
-#endif
--- a/include/triton/driver/error.h
+++ b/include/triton/driver/error.h
@@ -1,254 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_DRIVER_ERROR_H_
-#define _TRITON_DRIVER_ERROR_H_
-
-#include "triton/driver/dispatch.h"
-#include <exception>
-
-namespace triton {
-
-namespace driver {
-
-namespace exception {
-
-namespace nvrtc {
-
-#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg)                               \
-  class name : public std::exception {                                         \
-  public:                                                                      \
-    const char *what() const throw() override { return "NVRTC: Error- " msg; } \
-  }
-
-TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure,
-                              "program creation failure");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_input, "invalid input");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_program, "invalid program");
-TRITON_CREATE_NVRTC_EXCEPTION(invalid_option, "invalid option");
-TRITON_CREATE_NVRTC_EXCEPTION(compilation, "compilation");
-TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure,
-                              "builtin operation failure");
-TRITON_CREATE_NVRTC_EXCEPTION(unknown_error, "unknown error");
-
-#undef TRITON_CREATE_NVRTC_EXCEPTION
-} // namespace nvrtc
-
-namespace cuda {
-class base : public std::exception {};
-
-#define TRITON_CREATE_CUDA_EXCEPTION(name, msg)                                \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override { return "CUDA: Error- " msg; }  \
-  }
-
-TRITON_CREATE_CUDA_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUDA_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_CUDA_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUDA_EXCEPTION(deinitialized, "deinitialized");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled, "profiler disabled");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized,
-                             "profiler not initialized");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started,
-                             "profiler already started");
-TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped,
-                             "profiler already stopped");
-TRITON_CREATE_CUDA_EXCEPTION(no_device, "no device");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_device, "invalid device");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_image, "invalid image");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_context, "invalid context");
-TRITON_CREATE_CUDA_EXCEPTION(context_already_current,
-                             "context already current");
-TRITON_CREATE_CUDA_EXCEPTION(map_failed, "map failed");
-TRITON_CREATE_CUDA_EXCEPTION(unmap_failed, "unmap failed");
-TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped, "array is mapped");
-TRITON_CREATE_CUDA_EXCEPTION(already_mapped, "already mapped");
-TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
-TRITON_CREATE_CUDA_EXCEPTION(already_acquired, "already acquired");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped, "not mapped");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array, "not mapped as array");
-TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
-TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
-TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit, "unsupported limit");
-TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use, "context already in use");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported,
-                             "peer access unsupported");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx, "invalid ptx");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context,
-                             "invalid graphics context");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_source, "invalid source");
-TRITON_CREATE_CUDA_EXCEPTION(file_not_found, "file not found");
-TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found,
-                             "shared object symbol not found");
-TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed,
-                             "shared object init failed");
-TRITON_CREATE_CUDA_EXCEPTION(operating_system, "operating system");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_handle, "invalid handle");
-TRITON_CREATE_CUDA_EXCEPTION(not_found, "not found");
-TRITON_CREATE_CUDA_EXCEPTION(not_ready, "not ready");
-TRITON_CREATE_CUDA_EXCEPTION(illegal_address, "illegal address");
-TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources,
-                             "launch out of resources");
-TRITON_CREATE_CUDA_EXCEPTION(launch_timeout, "launch timeout");
-TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,
-                             "launch incompatible texturing");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,
-                             "peer access already enabled");
-TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,
-                             "peer access not enabled");
-TRITON_CREATE_CUDA_EXCEPTION(primary_context_active, "primary context active");
-TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed, "context is destroyed");
-TRITON_CREATE_CUDA_EXCEPTION(assert_error, "assert");
-TRITON_CREATE_CUDA_EXCEPTION(too_many_peers, "too many peers");
-TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered,
-                             "host memory already registered");
-TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered,
-                             "hot memory not registered");
-TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error, "hardware stack error");
-TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction, "illegal instruction");
-TRITON_CREATE_CUDA_EXCEPTION(misaligned_address, "misaligned address");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space, "invalid address space");
-TRITON_CREATE_CUDA_EXCEPTION(invalid_pc, "invalid pc");
-TRITON_CREATE_CUDA_EXCEPTION(launch_failed, "launch failed");
-TRITON_CREATE_CUDA_EXCEPTION(not_permitted, "not permitted");
-TRITON_CREATE_CUDA_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUDA_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUDA_EXCEPTION
-} // namespace cuda
-
-namespace cublas {
-class base : public std::exception {};
-
-#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg)                              \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override {                                \
-      return "CUBLAS: Error- " msg;                                            \
-    }                                                                          \
-  }
-
-TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed, "alloc failed");
-TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch, "arch mismatch");
-TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error, "mapping error");
-TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
-TRITON_CREATE_CUBLAS_EXCEPTION(internal_error, "internal error");
-TRITON_CREATE_CUBLAS_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUBLAS_EXCEPTION(license_error, "license error");
-TRITON_CREATE_CUBLAS_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUBLAS_EXCEPTION
-} // namespace cublas
-
-namespace cudnn {
-#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg)                               \
-  class name : public std::exception {                                         \
-  public:                                                                      \
-    const char *what() const throw() override { return "CUDNN: Error- " msg; } \
-  }
-
-TRITON_CREATE_CUDNN_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed, "allocation failed");
-TRITON_CREATE_CUDNN_EXCEPTION(bad_param, "bad param");
-TRITON_CREATE_CUDNN_EXCEPTION(internal_error, "internal error");
-TRITON_CREATE_CUDNN_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch, "arch mismatch");
-TRITON_CREATE_CUDNN_EXCEPTION(mapping_error, "mapping error");
-TRITON_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed");
-TRITON_CREATE_CUDNN_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_CUDNN_EXCEPTION(license_error, "license error");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing,
-                              "prerequisite missing");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress, "runtime in progress");
-TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow, "runtime fp overflow");
-} // namespace cudnn
-
-namespace hip {
-class base : public std::exception {};
-
-#define TRITON_CREATE_HIP_EXCEPTION(name, msg)                                 \
-  class name : public base {                                                   \
-  public:                                                                      \
-    const char *what() const throw() override { return "HIP: Error- " msg; }   \
-  }
-
-TRITON_CREATE_HIP_EXCEPTION(invalid_value, "invalid value");
-TRITON_CREATE_HIP_EXCEPTION(out_of_memory, "out of memory");
-TRITON_CREATE_HIP_EXCEPTION(not_initialized, "not initialized");
-TRITON_CREATE_HIP_EXCEPTION(deinitialized, "deinitialized");
-TRITON_CREATE_HIP_EXCEPTION(profiler_disabled, "profiler disabled");
-TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized,
-                            "profiler not initialized");
-TRITON_CREATE_HIP_EXCEPTION(profiler_already_started,
-                            "profiler already started");
-TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped,
-                            "profiler already stopped");
-TRITON_CREATE_HIP_EXCEPTION(no_device, "no device");
-TRITON_CREATE_HIP_EXCEPTION(invalid_device, "invalid device");
-TRITON_CREATE_HIP_EXCEPTION(invalid_image, "invalid image");
-TRITON_CREATE_HIP_EXCEPTION(invalid_context, "invalid context");
-TRITON_CREATE_HIP_EXCEPTION(context_already_current, "context already current");
-TRITON_CREATE_HIP_EXCEPTION(map_failed, "map failed");
-TRITON_CREATE_HIP_EXCEPTION(unmap_failed, "unmap failed");
-TRITON_CREATE_HIP_EXCEPTION(array_is_mapped, "array is mapped");
-TRITON_CREATE_HIP_EXCEPTION(already_mapped, "already mapped");
-TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
-TRITON_CREATE_HIP_EXCEPTION(already_acquired, "already acquired");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped, "not mapped");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array, "not mapped as array");
-TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
-TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
-TRITON_CREATE_HIP_EXCEPTION(unsupported_limit, "unsupported limit");
-TRITON_CREATE_HIP_EXCEPTION(context_already_in_use, "context already in use");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported, "peer access unsupported");
-TRITON_CREATE_HIP_EXCEPTION(invalid_ptx, "invalid ptx");
-TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context,
-                            "invalid graphics context");
-TRITON_CREATE_HIP_EXCEPTION(invalid_source, "invalid source");
-TRITON_CREATE_HIP_EXCEPTION(file_not_found, "file not found");
-TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found,
-                            "shared object symbol not found");
-TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed,
-                            "shared object init failed");
-TRITON_CREATE_HIP_EXCEPTION(operating_system, "operating system");
-TRITON_CREATE_HIP_EXCEPTION(invalid_handle, "invalid handle");
-TRITON_CREATE_HIP_EXCEPTION(not_found, "not found");
-TRITON_CREATE_HIP_EXCEPTION(not_ready, "not ready");
-TRITON_CREATE_HIP_EXCEPTION(illegal_address, "illegal address");
-TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources, "launch out of resources");
-TRITON_CREATE_HIP_EXCEPTION(launch_timeout, "launch timeout");
-TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing,
-                            "launch incompatible texturing");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled,
-                            "peer access already enabled");
-TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled, "peer access not enabled");
-TRITON_CREATE_HIP_EXCEPTION(primary_context_active, "primary context active");
-TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed, "context is destroyed");
-TRITON_CREATE_HIP_EXCEPTION(assert_error, "assert");
-TRITON_CREATE_HIP_EXCEPTION(too_many_peers, "too many peers");
-TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered,
-                            "host memory already registered");
-TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered,
-                            "hot memory not registered");
-TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error, "hardware stack error");
-TRITON_CREATE_HIP_EXCEPTION(illegal_instruction, "illegal instruction");
-TRITON_CREATE_HIP_EXCEPTION(misaligned_address, "misaligned address");
-TRITON_CREATE_HIP_EXCEPTION(invalid_address_space, "invalid address space");
-TRITON_CREATE_HIP_EXCEPTION(invalid_pc, "invalid pc");
-TRITON_CREATE_HIP_EXCEPTION(launch_failed, "launch failed");
-TRITON_CREATE_HIP_EXCEPTION(not_permitted, "not permitted");
-TRITON_CREATE_HIP_EXCEPTION(not_supported, "not supported");
-TRITON_CREATE_HIP_EXCEPTION(invalid_symbol, "invalid symbol");
-TRITON_CREATE_HIP_EXCEPTION(unknown, "unknown");
-
-#undef TRITON_CREATE_CUDA_EXCEPTION
-} // namespace hip
-
-} // namespace exception
-} // namespace driver
-} // namespace triton
-
-#endif
--- a/include/triton/driver/llvm.h
+++ b/include/triton/driver/llvm.h
@@ -1,22 +0,0 @@
-#include "triton/external/CUDA/cuda.h"
-#include "triton/external/hip.h"
-#include <string>
-
-namespace llvm {
-class Module;
-}
-
-namespace triton {
-namespace driver {
-
-void init_llvm();
-std::string path_to_ptxas(int &version);
-std::string llir_to_ptx(llvm::Module *module, int cc, int version);
-std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas_path,
-                         int cc);
-CUmodule ptx_to_cumodule(const std::string &ptx, int cc);
-std::string llir_to_amdgpu(llvm::Module *module, const std::string &proc);
-hipModule_t amdgpu_to_hipmodule(const std::string &path);
-
-} // namespace driver
-} // namespace triton
--- a/include/triton/external/CUDA/cuda.h
+++ b/include/triton/external/CUDA/cuda.h
--- a/include/triton/external/CUDA/nvml.h
+++ b/include/triton/external/CUDA/nvml.h
--- a/include/triton/external/half.hpp
+++ b/include/triton/external/half.hpp
--- a/include/triton/external/hip.h
+++ b/include/triton/external/hip.h
@@ -1,293 +0,0 @@
-#ifndef __external_hip_h__
-#define __external_hip_h__
-
-/*
- * @brief hipError_t
- * @enum
- * @ingroup Enumerations
- */
-// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
-// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path.
-
-// Ignoring error-code return values from hip APIs is discouraged. On C++17,
-// we can make that yield a warning
-
-/*
- * @brief hipError_t
- * @enum
- * @ingroup Enumerations
- */
-// Developer note - when updating these, update the hipErrorName and hipErrorString functions in
-// NVCC and HCC paths Also update the hipCUDAErrorTohipError function in NVCC path.
-
-#include <cstddef>
-
-typedef enum hipError_t {
-    hipSuccess = 0,  ///< Successful completion.
-    hipErrorInvalidValue = 1,  ///< One or more of the parameters passed to the API call is NULL
-                               ///< or not in an acceptable range.
-    hipErrorOutOfMemory = 2,
-    // Deprecated
-    hipErrorMemoryAllocation = 2,  ///< Memory allocation error.
-    hipErrorNotInitialized = 3,
-    // Deprecated
-    hipErrorInitializationError = 3,
-    hipErrorDeinitialized = 4,
-    hipErrorProfilerDisabled = 5,
-    hipErrorProfilerNotInitialized = 6,
-    hipErrorProfilerAlreadyStarted = 7,
-    hipErrorProfilerAlreadyStopped = 8,
-    hipErrorInvalidConfiguration = 9,
-    hipErrorInvalidPitchValue = 12,
-    hipErrorInvalidSymbol = 13,
-    hipErrorInvalidDevicePointer = 17,  ///< Invalid Device Pointer
-    hipErrorInvalidMemcpyDirection = 21,  ///< Invalid memory copy direction
-    hipErrorInsufficientDriver = 35,
-    hipErrorMissingConfiguration = 52,
-    hipErrorPriorLaunchFailure = 53,
-    hipErrorInvalidDeviceFunction = 98,
-    hipErrorNoDevice = 100,  ///< Call to hipGetDeviceCount returned 0 devices
-    hipErrorInvalidDevice = 101,  ///< DeviceID must be in range 0...#compute-devices.
-    hipErrorInvalidImage = 200,
-    hipErrorInvalidContext = 201,  ///< Produced when input context is invalid.
-    hipErrorContextAlreadyCurrent = 202,
-    hipErrorMapFailed = 205,
-    // Deprecated
-    hipErrorMapBufferObjectFailed = 205,  ///< Produced when the IPC memory attach failed from ROCr.
-    hipErrorUnmapFailed = 206,
-    hipErrorArrayIsMapped = 207,
-    hipErrorAlreadyMapped = 208,
-    hipErrorNoBinaryForGpu = 209,
-    hipErrorAlreadyAcquired = 210,
-    hipErrorNotMapped = 211,
-    hipErrorNotMappedAsArray = 212,
-    hipErrorNotMappedAsPointer = 213,
-    hipErrorECCNotCorrectable = 214,
-    hipErrorUnsupportedLimit = 215,
-    hipErrorContextAlreadyInUse = 216,
-    hipErrorPeerAccessUnsupported = 217,
-    hipErrorInvalidKernelFile = 218,  ///< In CUDA DRV, it is CUDA_ERROR_INVALID_PTX
-    hipErrorInvalidGraphicsContext = 219,
-    hipErrorInvalidSource = 300,
-    hipErrorFileNotFound = 301,
-    hipErrorSharedObjectSymbolNotFound = 302,
-    hipErrorSharedObjectInitFailed = 303,
-    hipErrorOperatingSystem = 304,
-    hipErrorInvalidHandle = 400,
-    // Deprecated
-    hipErrorInvalidResourceHandle = 400,  ///< Resource handle (hipEvent_t or hipStream_t) invalid.
-    hipErrorNotFound = 500,
-    hipErrorNotReady = 600,  ///< Indicates that asynchronous operations enqueued earlier are not
-                             ///< ready.  This is not actually an error, but is used to distinguish
-                             ///< from hipSuccess (which indicates completion).  APIs that return
-                             ///< this error include hipEventQuery and hipStreamQuery.
-    hipErrorIllegalAddress = 700,
-    hipErrorLaunchOutOfResources = 701,  ///< Out of resources error.
-    hipErrorLaunchTimeOut = 702,
-    hipErrorPeerAccessAlreadyEnabled =
-        704,  ///< Peer access was already enabled from the current device.
-    hipErrorPeerAccessNotEnabled =
-        705,  ///< Peer access was never enabled from the current device.
-    hipErrorSetOnActiveProcess = 708,
-    hipErrorAssert = 710,  ///< Produced when the kernel calls assert.
-    hipErrorHostMemoryAlreadyRegistered =
-        712,  ///< Produced when trying to lock a page-locked memory.
-    hipErrorHostMemoryNotRegistered =
-        713,  ///< Produced when trying to unlock a non-page-locked memory.
-    hipErrorLaunchFailure =
-        719,  ///< An exception occurred on the device while executing a kernel.
-    hipErrorCooperativeLaunchTooLarge =
-        720,  ///< This error indicates that the number of blocks launched per grid for a kernel
-              ///< that was launched via cooperative launch APIs exceeds the maximum number of
-              ///< allowed blocks for the current device
-    hipErrorNotSupported = 801,  ///< Produced when the hip API is not supported/implemented
-    hipErrorUnknown = 999,  //< Unknown error.
-    // HSA Runtime Error Codes start here.
-    hipErrorRuntimeMemory = 1052,  ///< HSA runtime memory call returned error.  Typically not seen
-                                   ///< in production systems.
-    hipErrorRuntimeOther = 1053,  ///< HSA runtime call other than memory returned error.  Typically
-                                  ///< not seen in production systems.
-    hipErrorTbd  ///< Marker that more error codes are needed.
-} hipError_t;
-
-
-typedef struct ihipCtx_t* hipCtx_t;
-
-// Note many APIs also use integer deviceIds as an alternative to the device pointer:
-typedef int hipDevice_t;
-
-typedef enum hipDeviceP2PAttr {
-  hipDevP2PAttrPerformanceRank = 0,
-  hipDevP2PAttrAccessSupported,
-  hipDevP2PAttrNativeAtomicSupported,
-  hipDevP2PAttrHipArrayAccessSupported
-} hipDeviceP2PAttr;
-
-typedef struct ihipStream_t* hipStream_t;
-
-#define hipIpcMemLazyEnablePeerAccess 0
-
-#define HIP_IPC_HANDLE_SIZE 64
-
-typedef struct hipIpcMemHandle_st {
-    char reserved[HIP_IPC_HANDLE_SIZE];
-} hipIpcMemHandle_t;
-
-typedef struct hipIpcEventHandle_st {
-    char reserved[HIP_IPC_HANDLE_SIZE];
-} hipIpcEventHandle_t;
-
-typedef struct ihipModule_t* hipModule_t;
-
-typedef struct ihipModuleSymbol_t* hipFunction_t;
-
-typedef struct hipFuncAttributes {
-    int binaryVersion;
-    int cacheModeCA;
-    size_t constSizeBytes;
-    size_t localSizeBytes;
-    int maxDynamicSharedSizeBytes;
-    int maxThreadsPerBlock;
-    int numRegs;
-    int preferredShmemCarveout;
-    int ptxVersion;
-    size_t sharedSizeBytes;
-} hipFuncAttributes;
-
-typedef struct ihipEvent_t* hipEvent_t;
-
-/*
- * @brief hipDeviceAttribute_t
- * @enum
- * @ingroup Enumerations
- */
-typedef enum hipDeviceAttribute_t {
-    hipDeviceAttributeMaxThreadsPerBlock,       ///< Maximum number of threads per block.
-    hipDeviceAttributeMaxBlockDimX,             ///< Maximum x-dimension of a block.
-    hipDeviceAttributeMaxBlockDimY,             ///< Maximum y-dimension of a block.
-    hipDeviceAttributeMaxBlockDimZ,             ///< Maximum z-dimension of a block.
-    hipDeviceAttributeMaxGridDimX,              ///< Maximum x-dimension of a grid.
-    hipDeviceAttributeMaxGridDimY,              ///< Maximum y-dimension of a grid.
-    hipDeviceAttributeMaxGridDimZ,              ///< Maximum z-dimension of a grid.
-    hipDeviceAttributeMaxSharedMemoryPerBlock,  ///< Maximum shared memory available per block in
-                                                ///< bytes.
-    hipDeviceAttributeTotalConstantMemory,      ///< Constant memory size in bytes.
-    hipDeviceAttributeWarpSize,                 ///< Warp size in threads.
-    hipDeviceAttributeMaxRegistersPerBlock,  ///< Maximum number of 32-bit registers available to a
-                                             ///< thread block. This number is shared by all thread
-                                             ///< blocks simultaneously resident on a
-                                             ///< multiprocessor.
-    hipDeviceAttributeClockRate,             ///< Peak clock frequency in kilohertz.
-    hipDeviceAttributeMemoryClockRate,       ///< Peak memory clock frequency in kilohertz.
-    hipDeviceAttributeMemoryBusWidth,        ///< Global memory bus width in bits.
-    hipDeviceAttributeMultiprocessorCount,   ///< Number of multiprocessors on the device.
-    hipDeviceAttributeComputeMode,           ///< Compute mode that device is currently in.
-    hipDeviceAttributeL2CacheSize,  ///< Size of L2 cache in bytes. 0 if the device doesn't have L2
-                                    ///< cache.
-    hipDeviceAttributeMaxThreadsPerMultiProcessor,  ///< Maximum resident threads per
-                                                    ///< multiprocessor.
-    hipDeviceAttributeComputeCapabilityMajor,       ///< Major compute capability version number.
-    hipDeviceAttributeComputeCapabilityMinor,       ///< Minor compute capability version number.
-    hipDeviceAttributeConcurrentKernels,  ///< Device can possibly execute multiple kernels
-                                          ///< concurrently.
-    hipDeviceAttributePciBusId,           ///< PCI Bus ID.
-    hipDeviceAttributePciDeviceId,        ///< PCI Device ID.
-    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor,  ///< Maximum Shared Memory Per
-                                                         ///< Multiprocessor.
-    hipDeviceAttributeIsMultiGpuBoard,                   ///< Multiple GPU devices.
-    hipDeviceAttributeIntegrated,                        ///< iGPU
-    hipDeviceAttributeCooperativeLaunch,                 ///< Support cooperative launch
-    hipDeviceAttributeCooperativeMultiDeviceLaunch,      ///< Support cooperative launch on multiple devices
-    hipDeviceAttributeMaxTexture1DWidth,    ///< Maximum number of elements in 1D images
-    hipDeviceAttributeMaxTexture2DWidth,    ///< Maximum dimension width of 2D images in image elements
-    hipDeviceAttributeMaxTexture2DHeight,   ///< Maximum dimension height of 2D images in image elements
-    hipDeviceAttributeMaxTexture3DWidth,    ///< Maximum dimension width of 3D images in image elements
-    hipDeviceAttributeMaxTexture3DHeight,   ///< Maximum dimensions height of 3D images in image elements
-    hipDeviceAttributeMaxTexture3DDepth,    ///< Maximum dimensions depth of 3D images in image elements
-
-    hipDeviceAttributeHdpMemFlushCntl,      ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register
-    hipDeviceAttributeHdpRegFlushCntl,      ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register
-
-    hipDeviceAttributeMaxPitch,             ///< Maximum pitch in bytes allowed by memory copies
-    hipDeviceAttributeTextureAlignment,     ///<Alignment requirement for textures
-    hipDeviceAttributeTexturePitchAlignment, ///<Pitch alignment requirement for 2D texture references bound to pitched memory;
-    hipDeviceAttributeKernelExecTimeout,    ///<Run time limit for kernels executed on the device
-    hipDeviceAttributeCanMapHostMemory,     ///<Device can map host memory into device address space
-    hipDeviceAttributeEccEnabled,           ///<Device has ECC support enabled
-
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc,        ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched functions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim,     ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched grid dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim,    ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched block dimensions
-    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem,   ///< Supports cooperative launch on multiple
-                                                                  ///devices with unmatched shared memories
-    hipDeviceAttributeAsicRevision,         ///< Revision of the GPU in this device
-    hipDeviceAttributeManagedMemory,        ///< Device supports allocating managed memory on this system
-    hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on
-                                                      /// the device without migration
-    hipDeviceAttributeConcurrentManagedAccess,  ///< Device can coherently access managed memory
-                                                /// concurrently with the CPU
-    hipDeviceAttributePageableMemoryAccess,     ///< Device supports coherently accessing pageable memory
-                                                /// without calling hipHostRegister on it
-    hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via
-                                                              /// the host's page tables
-    hipDeviceAttributeCanUseStreamWaitValue ///< '1' if Device supports hipStreamWaitValue32() and
-                                            ///< hipStreamWaitValue64() , '0' otherwise.
-
-} hipDeviceAttribute_t;
-
-typedef void* hipDeviceptr_t;
-
-/*
- * @brief hipJitOption
- * @enum
- * @ingroup Enumerations
- */
-typedef enum hipJitOption {
-    hipJitOptionMaxRegisters = 0,
-    hipJitOptionThreadsPerBlock,
-    hipJitOptionWallTime,
-    hipJitOptionInfoLogBuffer,
-    hipJitOptionInfoLogBufferSizeBytes,
-    hipJitOptionErrorLogBuffer,
-    hipJitOptionErrorLogBufferSizeBytes,
-    hipJitOptionOptimizationLevel,
-    hipJitOptionTargetFromContext,
-    hipJitOptionTarget,
-    hipJitOptionFallbackStrategy,
-    hipJitOptionGenerateDebugInfo,
-    hipJitOptionLogVerbose,
-    hipJitOptionGenerateLineInfo,
-    hipJitOptionCacheMode,
-    hipJitOptionSm3xOpt,
-    hipJitOptionFastCompile,
-    hipJitOptionNumOptions
-} hipJitOption;
-
-/**
- * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
- */
-typedef enum hipFuncAttribute {
-    hipFuncAttributeMaxDynamicSharedMemorySize = 8,
-    hipFuncAttributePreferredSharedMemoryCarveout = 9,
-    hipFuncAttributeMax
-} hipFuncAttribute;
-
-/**
- * @warning On AMD devices and some Nvidia devices, these hints and controls are ignored.
- */
-typedef enum hipFuncCache_t {
-    hipFuncCachePreferNone,    ///< no preference for shared memory or L1 (default)
-    hipFuncCachePreferShared,  ///< prefer larger shared memory and smaller L1 cache
-    hipFuncCachePreferL1,      ///< prefer larger L1 cache and smaller shared memory
-    hipFuncCachePreferEqual,   ///< prefer equal size L1 cache and shared memory
-} hipFuncCache_t;
-
-
-#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01)
-#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02)
-#define HIP_LAUNCH_PARAM_END ((void*)0x03)
-
-#endif
--- a/include/triton/tools/bench.hpp
+++ b/include/triton/tools/bench.hpp
@@ -1,57 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_BENCH_H_
-#define _TRITON_TOOLS_BENCH_H_
-
-#include "triton/driver/device.h"
-#include "triton/driver/stream.h"
-#include <algorithm>
-#include <chrono>
-#include <functional>
-
-namespace triton {
-namespace tools {
-
-class timer {
-  typedef std::chrono::high_resolution_clock high_resolution_clock;
-  typedef std::chrono::nanoseconds nanoseconds;
-
-public:
-  explicit timer(bool run = false) {
-    if (run)
-      start();
-  }
-
-  void start() { _start = high_resolution_clock::now(); }
-
-  nanoseconds get() const {
-    return std::chrono::duration_cast<nanoseconds>(
-        high_resolution_clock::now() - _start);
-  }
-
-private:
-  high_resolution_clock::time_point _start;
-};
-
-inline double bench(std::function<void()> const &op, driver::stream *stream,
-                    size_t warmup = 10, size_t repeat = 200) {
-  timer tmr;
-  std::vector<size_t> times;
-  double total_time = 0;
-  for (size_t i = 0; i < warmup; i++)
-    op();
-  stream->synchronize();
-  tmr.start();
-  for (size_t i = 0; i < repeat; i++) {
-    op();
-  }
-  stream->synchronize();
-  return (float)tmr.get().count() / repeat;
-
-  //  return *std::min_element(times.begin(), times.end());
-}
-
-} // namespace tools
-} // namespace triton
-
-#endif
--- a/include/triton/tools/graph.h
+++ b/include/triton/tools/graph.h
@@ -1,68 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_THREAD_GRAPH_H_
-#define _TRITON_TOOLS_THREAD_GRAPH_H_
-
-#include <iostream>
-#include <map>
-#include <set>
-#include <vector>
-
-namespace triton {
-namespace tools {
-
-template <class node_t> class graph {
-  typedef std::map<node_t, std::set<node_t>> edges_t;
-
-public:
-  typedef std::map<size_t, std::vector<node_t>> cmap_t;
-  typedef std::map<node_t, size_t> nmap_t;
-
-private:
-  void connected_components_impl(node_t x, std::set<node_t> &nodes,
-                                 nmap_t *nmap, cmap_t *cmap, int id) const {
-    if (nmap)
-      (*nmap)[x] = id;
-    if (cmap)
-      (*cmap)[id].push_back(x);
-    if (nodes.find(x) != nodes.end()) {
-      nodes.erase(x);
-      for (const node_t &y : edges_.at(x))
-        connected_components_impl(y, nodes, nmap, cmap, id);
-    }
-  }
-
-public:
-  void connected_components(cmap_t *cmap, nmap_t *nmap) const {
-    if (cmap)
-      cmap->clear();
-    if (nmap)
-      nmap->clear();
-    std::set<node_t> nodes = nodes_;
-    unsigned id = 0;
-    while (!nodes.empty()) {
-      connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++);
-    }
-  }
-
-  void add_edge(node_t x, node_t y) {
-    nodes_.insert(x);
-    nodes_.insert(y);
-    edges_[x].insert(y);
-    edges_[y].insert(x);
-  }
-
-  void clear() {
-    nodes_.clear();
-    edges_.clear();
-  }
-
-private:
-  std::set<node_t> nodes_;
-  edges_t edges_;
-};
-
-} // namespace tools
-} // namespace triton
-
-#endif
--- a/include/triton/tools/sha1.hpp
+++ b/include/triton/tools/sha1.hpp
@@ -1,172 +0,0 @@
-/*
- Copyright (c) 2011, Micael Hildenborg
- All rights reserved.
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-    * Redistributions of source code must retain the above copyright
-      notice, this list of conditions and the following disclaimer.
-    * Redistributions in binary form must reproduce the above copyright
-      notice, this list of conditions and the following disclaimer in the
-      documentation and/or other materials provided with the distribution.
-    * Neither the name of Micael Hildenborg nor the
-      names of its contributors may be used to endorse or promote products
-      derived from this software without specific prior written permission.
- THIS SOFTWARE IS PROVIDED BY Micael Hildenborg ''AS IS'' AND ANY
- EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL Micael Hildenborg BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-/*
- Contributors:
- Gustav
- Several members in the gamedev.se forum.
- Gregory Petrosyan
- */
-
-#ifndef _TRITON_TOOLS_SHA1_HPP_
-#define _TRITON_TOOLS_SHA1_HPP_
-
-namespace sha1 {
-namespace // local
-{
-// Rotate an integer value to left.
-inline unsigned int rol(const unsigned int value, const unsigned int steps) {
-  return ((value << steps) | (value >> (32 - steps)));
-}
-
-// Sets the first 16 integers in the buffert to zero.
-// Used for clearing the W buffert.
-inline void clearWBuffert(unsigned int *buffert) {
-  for (int pos = 16; --pos >= 0;) {
-    buffert[pos] = 0;
-  }
-}
-
-inline void innerHash(unsigned int *result, unsigned int *w) {
-  unsigned int a = result[0];
-  unsigned int b = result[1];
-  unsigned int c = result[2];
-  unsigned int d = result[3];
-  unsigned int e = result[4];
-
-  int round = 0;
-
-#define sha1macro(func, val)                                                   \
-  {                                                                            \
-    const unsigned int t = rol(a, 5) + (func) + e + val + w[round];            \
-    e = d;                                                                     \
-    d = c;                                                                     \
-    c = rol(b, 30);                                                            \
-    b = a;                                                                     \
-    a = t;                                                                     \
-  }
-
-  while (round < 16) {
-    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
-  }
-  while (round < 20) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
-  }
-  while (round < 40) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro(b ^ c ^ d, 0x6ed9eba1)++ round;
-  }
-  while (round < 60) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)++ round;
-  }
-  while (round < 80) {
-    w[round] =
-        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-    sha1macro(b ^ c ^ d, 0xca62c1d6)++ round;
-  }
-
-#undef sha1macro
-
-  result[0] += a;
-  result[1] += b;
-  result[2] += c;
-  result[3] += d;
-  result[4] += e;
-}
-} // namespace
-
-inline void calc(const void *src, const int bytelength, unsigned char *hash) {
-  // Init the result array.
-  unsigned int result[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
-                            0xc3d2e1f0};
-
-  // Cast the void src pointer to be the byte array we can work with.
-  const unsigned char *sarray = (const unsigned char *)src;
-
-  // The reusable round buffer
-  unsigned int w[80];
-
-  // Loop through all complete 64byte blocks.
-  const int endOfFullBlocks = bytelength - 64;
-  int endCurrentBlock;
-  int currentBlock = 0;
-
-  while (currentBlock <= endOfFullBlocks) {
-    endCurrentBlock = currentBlock + 64;
-
-    // Init the round buffer with the 64 byte block data.
-    for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4) {
-      // This line will swap endian on big endian and keep endian on little
-      // endian.
-      w[roundPos++] = (unsigned int)sarray[currentBlock + 3] |
-                      (((unsigned int)sarray[currentBlock + 2]) << 8) |
-                      (((unsigned int)sarray[currentBlock + 1]) << 16) |
-                      (((unsigned int)sarray[currentBlock]) << 24);
-    }
-    innerHash(result, w);
-  }
-
-  // Handle the last and not full 64 byte block if existing.
-  endCurrentBlock = bytelength - currentBlock;
-  clearWBuffert(w);
-  int lastBlockBytes = 0;
-  for (; lastBlockBytes < endCurrentBlock; ++lastBlockBytes) {
-    w[lastBlockBytes >> 2] |=
-        (unsigned int)sarray[lastBlockBytes + currentBlock]
-        << ((3 - (lastBlockBytes & 3)) << 3);
-  }
-  w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
-  if (endCurrentBlock >= 56) {
-    innerHash(result, w);
-    clearWBuffert(w);
-  }
-  w[15] = bytelength << 3;
-  innerHash(result, w);
-
-  // Store hash in result pointer, and make sure we get in in the correct order
-  // on both endian models.
-  for (int hashByte = 20; --hashByte >= 0;) {
-    hash[hashByte] =
-        (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
-  }
-}
-
-inline void toHexString(const unsigned char *hash, char *hexstring) {
-  const char hexDigits[] = {"0123456789abcdef"};
-
-  for (int hashByte = 20; --hashByte >= 0;) {
-    hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
-    hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
-  }
-  hexstring[40] = 0;
-}
-} // namespace sha1
-
-#endif
--- a/include/triton/tools/sys/exec.hpp
+++ b/include/triton/tools/sys/exec.hpp
@@ -1,42 +0,0 @@
-#ifndef TRITON_TOOLS_SYS_EXEC_HPP
-#define TRITON_TOOLS_SYS_EXEC_HPP
-
-#include <cstdio>
-#include <iostream>
-#include <memory>
-#include <stdexcept>
-#include <string>
-
-namespace triton {
-namespace tools {
-
-#ifdef _WIN32
-#define popen _popen
-#define pclose _pclose
-#endif
-
-#ifndef WEXITSTATUS
-#define WEXITSTATUS(stat_val) ((unsigned)(stat_val)&255)
-#endif
-
-int exec(const std::string &cmd, std::string &result) {
-  char buffer[128];
-  FILE *pipe = popen(cmd.c_str(), "r");
-  if (!pipe)
-    return 0;
-  result.clear();
-  try {
-    while (fgets(buffer, sizeof buffer, pipe) != NULL)
-      result += buffer;
-  } catch (...) {
-    pclose(pipe);
-    return 0;
-  }
-  int status = pclose(pipe);
-  return WEXITSTATUS(status);
-}
-
-} // namespace tools
-} // namespace triton
-
-#endif
--- a/include/triton/tools/sys/mkdir.hpp
+++ b/include/triton/tools/sys/mkdir.hpp
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2015, PHILIPPE TILLET. All rights reserved.
- *
- * This file is part of ISAAC.
- *
- * ISAAC is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License as published by the Free Software Foundation; either
- * version 2.1 of the License, or (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public
- * License along with this library; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
- * MA 02110-1301  USA
- */
-
-#ifndef TDL_TOOLS_SYS_MKDIR_HPP
-#define TDL_TOOLS_SYS_MKDIR_HPP
-
-#include <cstdlib>
-#include <cstring>
-#include <errno.h>
-#include <string>
-#include <sys/stat.h>
-#if defined(_WIN32)
-#include <direct.h>
-#endif
-
-namespace triton {
-
-namespace tools {
-
-inline int mkdir(std::string const &path) {
-#if defined(_WIN32)
-  return _mkdir(path.c_str());
-#else
-  return ::mkdir(path.c_str(), 0777);
-#endif
-}
-
-inline int mkpath(std::string const &path) {
-  int status = 0;
-  size_t pp = 0;
-  size_t sp;
-  while ((sp = path.find('/', pp)) != std::string::npos) {
-    if (sp != pp) {
-      status = mkdir(path.substr(0, sp));
-    }
-    pp = sp + 1;
-  }
-  return (status == 0 || errno == EEXIST) ? 0 : -1;
-}
-
-inline int mtime(std::string const &path) {
-  struct stat st;
-  if (stat(path.c_str(), &st) != 0)
-    return 0;
-  return st.st_mtime;
-}
-
-} // namespace tools
-
-} // namespace triton
-
-#endif
--- a/include/triton/tools/thread_pool.h
+++ b/include/triton/tools/thread_pool.h
@@ -1,81 +0,0 @@
-#pragma once
-
-#ifndef _TRITON_TOOLS_THREAD_POOL_H_
-#define _TRITON_TOOLS_THREAD_POOL_H_
-
-#include <condition_variable>
-#include <functional>
-#include <future>
-#include <memory>
-#include <mutex>
-#include <queue>
-#include <stdexcept>
-#include <thread>
-#include <vector>
-
-class ThreadPool {
-public:
-  ThreadPool(size_t threads) : stop(false) {
-    for (size_t i = 0; i < threads; ++i)
-      workers.emplace_back([this] {
-        for (;;) {
-          std::function<void()> task;
-          {
-            std::unique_lock<std::mutex> lock(this->queue_mutex);
-            this->condition.wait(
-                lock, [this] { return this->stop || !this->tasks.empty(); });
-            if (this->stop && this->tasks.empty())
-              return;
-            task = std::move(this->tasks.front());
-            this->tasks.pop();
-          }
-          task();
-        }
-      });
-  }
-
-  template <class F, class... Args>
-  auto enqueue(F &&f, Args &&...args)
-      -> std::future<typename std::result_of<F(Args...)>::type> {
-    using return_type = typename std::result_of<F(Args...)>::type;
-
-    auto task = std::make_shared<std::packaged_task<return_type()>>(
-        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
-
-    std::future<return_type> res = task->get_future();
-    {
-      std::unique_lock<std::mutex> lock(queue_mutex);
-
-      // don't allow enqueueing after stopping the pool
-      if (stop)
-        throw std::runtime_error("enqueue on stopped ThreadPool");
-
-      tasks.emplace([task]() { (*task)(); });
-    }
-    condition.notify_one();
-    return res;
-  }
-
-  ~ThreadPool() {
-    {
-      std::unique_lock<std::mutex> lock(queue_mutex);
-      stop = true;
-    }
-    condition.notify_all();
-    for (std::thread &worker : workers)
-      worker.join();
-  }
-
-private:
-  // need to keep track of threads so we can join them
-  std::vector<std::thread> workers;
-  // the task queue
-  std::queue<std::function<void()>> tasks;
-
-  // synchronization
-  std::mutex queue_mutex;
-  std::condition_variable condition;
-  bool stop;
-};
-
-#endif