[RUNTIME] Major code cleanup (#711)

This PR does the following:
- CUDA utilities (e.g., `cuGetInfo`) are no longer compiled as part of libtriton.so.
- `driver/llvm.cc` is refactored and split between PTX codegen and Python.
- By extension, `include/external` is deprecated, so Triton no longer has to carry its own copy of some CUDA/HIP headers.
- `triton-translate` becomes a `triton.tools.aot` Python utility that reuses functions from the `triton.compile` sub-module.
2022-09-26 16:38:06 -07:00
parent 8bb09f83ee
commit 1e91ed30d0
28 changed files with 509 additions and 31483 deletions
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -1,5 +1,4 @@
 # add_subdirectory(codegen)
-add_subdirectory(driver)
 add_subdirectory(Analysis)
 add_subdirectory(Conversion)
 add_subdirectory(Dialect)
--- a/lib/Target/LLVMIR/LLVMIRTranslation.cpp
+++ b/lib/Target/LLVMIR/LLVMIRTranslation.cpp
@@ -13,7 +13,6 @@
 #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
 #include "mlir/Transforms/Passes.h"
 #include "triton/Conversion/TritonGPUToLLVM/TritonGPUToLLVM.h"
-#include "triton/driver/llvm.h"
 #include "triton/tools/sys/getenv.hpp"
 #include "llvm/IR/Constants.h"

@@ -99,7 +98,6 @@ translateLLVMToLLVMIR(llvm::LLVMContext *llvmContext, mlir::ModuleOp module) {
  }

  // Initialize LLVM targets.
-  ::triton::driver::init_llvm();
  mlir::ExecutionEngine::setupTargetTriple(llvmModule.get());

  auto optPipeline = mlir::makeOptimizingTransformer(
--- a/lib/Target/PTX/PTXTranslation.cpp
+++ b/lib/Target/PTX/PTXTranslation.cpp
@@ -11,31 +11,129 @@
 #include "mlir/Target/LLVMIR/Export.h"
 #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h"
 #include "triton/Target/LLVMIR/LLVMIRTranslation.h"
-#include "triton/driver/dispatch.h"
-#include "triton/driver/llvm.h"
+
+#include "llvm/ExecutionEngine/ExecutionEngine.h"
+#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <memory>
+#include <regex>

 namespace triton {

-void getCuCCAndVersionFromDevice(uint64_t device, int *cc, int *version,
-                                 std::string *ptxasPath) {
-  CUdevice dev = (CUdevice)device;
-  size_t major = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR>(dev);
-  size_t minor = cuGetInfo<CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR>(dev);
-  *cc = major * 10 + minor;
-  *ptxasPath = driver::path_to_ptxas(*version); // assign version
+// No-op stand-ins for the terminfo entry points `set_curterm`, `del_curterm`,
+// `tigetnum` and `setupterm`. Each stub ignores its arguments and returns 0.
+// NOTE(review): presumably these exist so that linked LLVM code referencing
+// terminfo resolves without libtinfo/ncurses -- confirm against the build.
+extern "C" {
+int set_curterm(char *nterm) { return 0; }
+int del_curterm(char *nterm) { return 0; }
+int tigetnum(char *capname) { return 0; }
+int setupterm(char *term, int fildes, int *errret) { return 0; }
 }

-std::tuple<std::string, size_t, int, std::string>
-translateTritonGPUToPTX(mlir::ModuleOp module, uint64_t device) {
-  int cc;
-  int version;
-  std::string ptxasPath;
-  getCuCCAndVersionFromDevice(device, &cc, &version, &ptxasPath);
+// Registers the NVPTX backend with LLVM by calling, in order, the target-info,
+// target, MC and asm-printer initializers. Called from llir_to_ptx() before
+// the target is looked up in the registry.
+static void init_llvm() {
+  LLVMInitializeNVPTXTargetInfo();
+  LLVMInitializeNVPTXTarget();
+  LLVMInitializeNVPTXTargetMC();
+  LLVMInitializeNVPTXAsmPrinter();
+}

-  llvm::LLVMContext ctx;
-  auto llModule = mlir::triton::translateTritonGPUToLLVMIR(&ctx, module);
-  auto ptxCode = driver::llir_to_ptx(llModule.get(), cc, version);
-  return std::make_tuple(ptxCode, cc, version, ptxasPath);
+// Replaces the first span of `str` that starts at the first occurrence of
+// `begin` and runs through the end of the next occurrence of `end` (searched
+// from the start of `begin`) with `target`.
+//
+// Returns true when a replacement was made, false when either delimiter is
+// missing (in which case `str` is left untouched).
+static bool find_and_replace(std::string &str, const std::string &begin,
+                             const std::string &end,
+                             const std::string &target) {
+  const size_t start_replace = str.find(begin);
+  if (start_replace == std::string::npos)
+    return false;
+  const size_t end_replace = str.find(end, start_replace);
+  if (end_replace == std::string::npos)
+    return false;
+  // Consume the whole `end` delimiter, not just its first character, so that
+  // multi-character (or empty) delimiters are handled correctly.
+  str.replace(start_replace, end_replace + end.size() - start_replace, target);
+  return true;
+}
+
+// Lowers an LLVM IR module to PTX assembly text using LLVM's NVPTX backend.
+//
+// `module` is modified in place: its target triple and data layout are
+// overwritten and every function is tagged `alwaysinline`.
+// `capability` is the compute capability as an integer (80 -> "sm_80").
+// `ptx` is the PTX ISA version encoded as major * 10 + minor (74 -> "7.4").
+//
+// Returns the emitted PTX with its `.version` and `.target` directives
+// rewritten to the requested values and the "begin/end inline asm" marker
+// comments stripped. Setup failures (missing option, unknown target, no
+// target machine, unsupported file type) abort via llvm::report_fatal_error.
+static std::string llir_to_ptx(llvm::Module *module, int capability, int ptx) {
+  // The LLVM version in use may not officially support the target hardware,
+  // so the backend is driven with a capped processor name and the requested
+  // target is patched into the emitted text afterwards.
+  const int max_nvvm_cc = 75;
+
+  // Turn on `nvptx-short-ptr`. The option table is taken by reference and
+  // queried with lookup() so that nothing is copied or inserted, and a missing
+  // option is reported even in builds where assert() is compiled out.
+  auto &options = llvm::cl::getRegisteredOptions();
+  llvm::cl::Option *short_ptr_opt = options.lookup("nvptx-short-ptr");
+  if (!short_ptr_opt)
+    llvm::report_fatal_error("LLVM option `nvptx-short-ptr` is not available");
+  static_cast<llvm::cl::opt<bool> *>(short_ptr_opt)->setValue(true);
+
+  // Requested target (`sm`) versus the capped one handed to LLVM (`proc`).
+  const std::string sm = "sm_" + std::to_string(capability);
+  const std::string proc =
+      "sm_" + std::to_string(std::min(capability, max_nvvm_cc));
+  const int ptx_major = ptx / 10;
+  const int ptx_minor = ptx % 10;
+  const std::string triple = "nvptx64-nvidia-cuda";
+  // No "+ptxNN" feature is requested; `.version` is rewritten after emission.
+  const std::string features = "";
+
+  init_llvm();
+
+  // Verify the incoming IR before handing it to the backend.
+  llvm::legacy::PassManager pm;
+  pm.add(llvm::createVerifierPass());
+  pm.run(*module);
+
+  // Look up the target; lookupTarget returns null and fills `error` on failure.
+  module->setTargetTriple(triple);
+  std::string error;
+  const llvm::Target *target =
+      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
+  if (!target) {
+    const std::string message = "failed to look up NVPTX target: " + error;
+    llvm::report_fatal_error(message.c_str());
+  }
+
+  llvm::TargetOptions opt;
+  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
+  opt.UnsafeFPMath = false;
+  opt.NoInfsFPMath = false;
+  opt.NoNaNsFPMath = true;
+  // createTargetMachine hands back a heap-allocated object owned by the
+  // caller; hold it in a unique_ptr so it is released on every exit path.
+  std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
+      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
+      llvm::None, llvm::CodeGenOpt::Aggressive));
+  if (!machine)
+    llvm::report_fatal_error("failed to create NVPTX target machine");
+  module->setDataLayout(machine->createDataLayout());
+
+  // Emit PTX assembly into an in-memory buffer. `pass` is declared last so it
+  // is destroyed before the stream, buffer and target machine it refers to.
+  for (llvm::Function &f : module->functions())
+    f.addFnAttr(llvm::Attribute::AlwaysInline);
+  llvm::SmallVector<char, 0> buffer;
+  llvm::raw_svector_ostream stream(buffer);
+  llvm::legacy::PassManager pass;
+  // addPassesToEmitFile returns true when the file type is not supported.
+  if (machine->addPassesToEmitFile(pass, stream, nullptr,
+                                   llvm::CodeGenFileType::CGFT_AssemblyFile))
+    llvm::report_fatal_error("NVPTX target cannot emit assembly files");
+  pass.run(*module);
+
+  // Post-process: restore the requested PTX version / SM target and drop the
+  // inline-asm marker comments.
+  std::string result(buffer.begin(), buffer.end());
+  find_and_replace(result, ".version", "\n",
+                   ".version " + std::to_string(ptx_major) + "." +
+                       std::to_string(ptx_minor) + "\n");
+  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
+  while (find_and_replace(result, "\t// begin inline asm", "\n", ""))
+    ;
+  while (find_and_replace(result, "\t// end inline asm", "\n", ""))
+    ;
+  return result;
+}
+
+// Public entry point: emits PTX text for `module`, where `cc` is the compute
+// capability and `version` the PTX version, both forwarded unchanged to
+// llir_to_ptx().
+std::string translateLLVMIRToPTX(llvm::Module &module, int cc, int version) {
+  return llir_to_ptx(&module, cc, version);
 }

 } // namespace triton
--- a/lib/driver/CMakeLists.txt
+++ b/lib/driver/CMakeLists.txt
@@ -1,5 +0,0 @@
-add_library(TritonDriver
-  dispatch.cc
-  error.cc
-  llvm.cc
-)
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -1,395 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "triton/driver/dispatch.h"
-
-namespace triton {
-namespace driver {
-
-// Helpers for function definition
-#define DEFINE0(init, hlib, ret, fname)                                        \
-  ret dispatch::fname() {                                                      \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname);              \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE1(init, hlib, ret, fname, t1)                                    \
-  ret dispatch::fname(t1 a) {                                                  \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a);           \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE2(init, hlib, ret, fname, t1, t2)                                \
-  ret dispatch::fname(t1 a, t2 b) {                                            \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b);        \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE3(init, hlib, ret, fname, t1, t2, t3)                            \
-  ret dispatch::fname(t1 a, t2 b, t3 c) {                                      \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c);     \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE4(init, hlib, ret, fname, t1, t2, t3, t4)                        \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d) {                                \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d);  \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE5(init, hlib, ret, fname, t1, t2, t3, t4, t5)                    \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e) {                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e);                                          \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE6(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6)                \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f) {                    \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f);                                       \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE7(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7)            \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g) {              \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g);                                    \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE8(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)        \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h) {        \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h);                                 \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE9(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)    \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i) {  \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i);                              \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE10(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10)                                                          \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j) {                                                 \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j);                           \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE11(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11)                                                     \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k) {                                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k);                        \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE13(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11, t12, t13)                                           \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k, t12 l, t13 m) {                            \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k, l, m);                  \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-#define DEFINE19(init, hlib, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9,   \
-                 t10, t11, t12, t13, t14, t15, t16, t17, t18, t19)             \
-  ret dispatch::fname(t1 a, t2 b, t3 c, t4 d, t5 e, t6 f, t7 g, t8 h, t9 i,    \
-                      t10 j, t11 k, t12 l, t13 m, t14 n, t15 o, t16 p, t17 q,  \
-                      t18 r, t19 s) {                                          \
-    return f_impl<dispatch::init>(hlib, fname, fname##_, #fname, a, b, c, d,   \
-                                  e, f, g, h, i, j, k, l, m, n, o, p, q, r,    \
-                                  s);                                          \
-  }                                                                            \
-  void *dispatch::fname##_;
-
-/* ------------------- *
- * CUDA
- * ------------------- */
-
-bool dispatch::cuinit() {
-  if (cuda_ == nullptr) {
-#ifdef _WIN32
-    cuda_ = dlopen("cudart64_110.dll", RTLD_LAZY);
-#else
-    cuda_ = dlopen("libcuda.so", RTLD_LAZY);
-    if (!cuda_)
-      cuda_ = dlopen("libcuda.so.1", RTLD_LAZY);
-#endif
-    if (!cuda_)
-      throw std::runtime_error("Could not find `libcuda.so`. Make sure it is "
-                               "in your LD_LIBRARY_PATH.");
-  }
-  if (cuda_ == nullptr)
-    return false;
-  CUresult (*fptr)(unsigned int);
-  cuInit_ = dlsym(cuda_, "cuInit");
-  *reinterpret_cast<void **>(&fptr) = cuInit_;
-  CUresult res = (*fptr)(0);
-  check(res);
-  return true;
-}
-
-#define CUDA_DEFINE1(ret, fname, t1) DEFINE1(cuinit, cuda_, ret, fname, t1)
-#define CUDA_DEFINE2(ret, fname, t1, t2)                                       \
-  DEFINE2(cuinit, cuda_, ret, fname, t1, t2)
-#define CUDA_DEFINE3(ret, fname, t1, t2, t3)                                   \
-  DEFINE3(cuinit, cuda_, ret, fname, t1, t2, t3)
-#define CUDA_DEFINE4(ret, fname, t1, t2, t3, t4)                               \
-  DEFINE4(cuinit, cuda_, ret, fname, t1, t2, t3, t4)
-#define CUDA_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                           \
-  DEFINE5(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5)
-#define CUDA_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                       \
-  DEFINE6(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define CUDA_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                   \
-  DEFINE7(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define CUDA_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)               \
-  DEFINE8(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define CUDA_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)           \
-  DEFINE9(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define CUDA_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)     \
-  DEFINE10(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define CUDA_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10,     \
-                      t11)                                                     \
-  DEFINE11(cuinit, cuda_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
-           t11)
-
-// context management
-CUDA_DEFINE1(CUresult, cuCtxDestroy_v2, CUcontext)
-CUDA_DEFINE3(CUresult, cuCtxCreate_v2, CUcontext *, unsigned int, CUdevice)
-CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice *)
-CUDA_DEFINE2(CUresult, cuCtxEnablePeerAccess, CUcontext, unsigned int)
-CUDA_DEFINE1(CUresult, cuInit, unsigned int)
-CUDA_DEFINE1(CUresult, cuDriverGetVersion, int *)
-// device management
-CUDA_DEFINE2(CUresult, cuDeviceGet, CUdevice *, int)
-CUDA_DEFINE3(CUresult, cuDeviceGetName, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetPCIBusId, char *, int, CUdevice)
-CUDA_DEFINE3(CUresult, cuDeviceGetAttribute, int *, CUdevice_attribute,
-             CUdevice)
-CUDA_DEFINE1(CUresult, cuDeviceGetCount, int *)
-
-// link management
-CUDA_DEFINE8(CUresult, cuLinkAddData_v2, CUlinkState, CUjitInputType, void *,
-             size_t, const char *, unsigned int, CUjit_option *, void **);
-CUDA_DEFINE4(CUresult, cuLinkCreate_v2, unsigned int, CUjit_option *, void **,
-             CUlinkState *);
-CUDA_DEFINE1(CUresult, cuLinkDestroy, CUlinkState);
-CUDA_DEFINE3(CUresult, cuLinkComplete, CUlinkState, void **, size_t *);
-// module management
-CUDA_DEFINE4(CUresult, cuModuleGetGlobal_v2, CUdeviceptr *, size_t *, CUmodule,
-             const char *)
-CUDA_DEFINE2(CUresult, cuModuleLoad, CUmodule *, const char *)
-CUDA_DEFINE1(CUresult, cuModuleUnload, CUmodule)
-CUDA_DEFINE2(CUresult, cuModuleLoadData, CUmodule *, const void *)
-CUDA_DEFINE5(CUresult, cuModuleLoadDataEx, CUmodule *, const void *,
-             unsigned int, CUjit_option *, void **)
-CUDA_DEFINE3(CUresult, cuModuleGetFunction, CUfunction *, CUmodule,
-             const char *)
-// stream management
-CUDA_DEFINE2(CUresult, cuStreamCreate, CUstream *, unsigned int)
-CUDA_DEFINE1(CUresult, cuStreamSynchronize, CUstream)
-CUDA_DEFINE1(CUresult, cuStreamDestroy_v2, CUstream)
-CUDA_DEFINE2(CUresult, cuStreamGetCtx, CUstream, CUcontext *)
-CUDA_DEFINE11(CUresult, cuLaunchKernel, CUfunction, unsigned int, unsigned int,
-              unsigned int, unsigned int, unsigned int, unsigned int,
-              unsigned int, CUstream, void **, void **)
-// function management
-CUDA_DEFINE3(CUresult, cuFuncGetAttribute, int *, CUfunction_attribute,
-             CUfunction)
-CUDA_DEFINE3(CUresult, cuFuncSetAttribute, CUfunction, CUfunction_attribute,
-             int)
-CUDA_DEFINE2(CUresult, cuFuncSetCacheConfig, CUfunction, CUfunc_cache)
-// memory management
-CUDA_DEFINE3(CUresult, cuMemcpyDtoH_v2, void *, CUdeviceptr, size_t)
-CUDA_DEFINE1(CUresult, cuMemFree_v2, CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemcpyDtoHAsync_v2, void *, CUdeviceptr, size_t,
-             CUstream)
-CUDA_DEFINE4(CUresult, cuMemcpyHtoDAsync_v2, CUdeviceptr, const void *, size_t,
-             CUstream)
-CUDA_DEFINE3(CUresult, cuMemcpyHtoD_v2, CUdeviceptr, const void *, size_t)
-CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr *, size_t)
-CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void *, CUpointer_attribute,
-             CUdeviceptr)
-CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t,
-             CUstream)
-// event management
-CUDA_DEFINE2(CUresult, cuEventCreate, CUevent *, unsigned int)
-CUDA_DEFINE3(CUresult, cuEventElapsedTime, float *, CUevent, CUevent)
-CUDA_DEFINE2(CUresult, cuEventRecord, CUevent, CUstream)
-CUDA_DEFINE1(CUresult, cuEventDestroy_v2, CUevent)
-
-/* ------------------- *
- * NVML
- * ------------------- */
-bool dispatch::nvmlinit() {
-#ifdef _WIN32
-  if (nvml_ == nullptr)
-    nvml_ = dlopen("nvml.dll", RTLD_LAZY);
-#else
-  if (nvml_ == nullptr)
-    nvml_ = dlopen("libnvidia-ml.so", RTLD_LAZY);
-#endif
-  nvmlReturn_t (*fptr)();
-  nvmlInit_v2_ = dlsym(nvml_, "nvmlInit_v2");
-  *reinterpret_cast<void **>(&fptr) = nvmlInit_v2_;
-  nvmlReturn_t res = (*fptr)();
-  check(res);
-  return res;
-}
-
-#define NVML_DEFINE0(ret, fname) DEFINE0(nvmlinit, nvml_, ret, fname)
-#define NVML_DEFINE1(ret, fname, t1) DEFINE1(nvmlinit, nvml_, ret, fname, t1)
-#define NVML_DEFINE2(ret, fname, t1, t2)                                       \
-  DEFINE2(nvmlinit, nvml_, ret, fname, t1, t2)
-#define NVML_DEFINE3(ret, fname, t1, t2, t3)                                   \
-  DEFINE3(nvmlinit, nvml_, ret, fname, t1, t2, t3)
-
-NVML_DEFINE2(nvmlReturn_t, nvmlDeviceGetHandleByPciBusId_v2, const char *,
-             nvmlDevice_t *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetClockInfo, nvmlDevice_t,
-             nvmlClockType_t, unsigned int *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceGetMaxClockInfo, nvmlDevice_t,
-             nvmlClockType_t, unsigned int *)
-NVML_DEFINE3(nvmlReturn_t, nvmlDeviceSetApplicationsClocks, nvmlDevice_t,
-             unsigned int, unsigned int)
-
-/* ------------------- *
- * HIP
- * ------------------- */
-bool dispatch::hipinit() {
-  if (hip_ == nullptr)
-    hip_ = dlopen("libamdhip64.so", RTLD_LAZY);
-  if (hip_ == nullptr)
-    return false;
-  hipError_t (*fptr)();
-  hipInit_ = dlsym(hip_, "hipInit");
-  *reinterpret_cast<void **>(&fptr) = hipInit_;
-  hipError_t res = (*fptr)();
-  check(res);
-  return res;
-}
-
-#define HIP_DEFINE1(ret, fname, t1) DEFINE1(hipinit, hip_, ret, fname, t1)
-#define HIP_DEFINE2(ret, fname, t1, t2)                                        \
-  DEFINE2(hipinit, hip_, ret, fname, t1, t2)
-#define HIP_DEFINE3(ret, fname, t1, t2, t3)                                    \
-  DEFINE3(hipinit, hip_, ret, fname, t1, t2, t3)
-#define HIP_DEFINE4(ret, fname, t1, t2, t3, t4)                                \
-  DEFINE4(hipinit, hip_, ret, fname, t1, t2, t3, t4)
-#define HIP_DEFINE5(ret, fname, t1, t2, t3, t4, t5)                            \
-  DEFINE5(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5)
-#define HIP_DEFINE6(ret, fname, t1, t2, t3, t4, t5, t6)                        \
-  DEFINE6(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6)
-#define HIP_DEFINE7(ret, fname, t1, t2, t3, t4, t5, t6, t7)                    \
-  DEFINE7(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7)
-#define HIP_DEFINE8(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)                \
-  DEFINE8(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8)
-#define HIP_DEFINE9(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)            \
-  DEFINE9(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9)
-#define HIP_DEFINE10(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)      \
-  DEFINE10(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10)
-#define HIP_DEFINE11(ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11) \
-  DEFINE11(hipinit, hip_, ret, fname, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, \
-           t11)
-
-// context management
-HIP_DEFINE1(hipError_t, hipCtxDestroy, hipCtx_t)
-HIP_DEFINE3(hipError_t, hipCtxCreate, hipCtx_t *, unsigned int, hipDevice_t)
-HIP_DEFINE1(hipError_t, hipCtxGetDevice, hipDevice_t *)
-HIP_DEFINE1(hipError_t, hipCtxPushCurrent, hipCtx_t)
-HIP_DEFINE1(hipError_t, hipCtxPopCurrent, hipCtx_t *)
-HIP_DEFINE2(hipError_t, hipCtxEnablePeerAccess, hipCtx_t, unsigned int)
-HIP_DEFINE1(hipError_t, hipInit, unsigned int)
-HIP_DEFINE1(hipError_t, hipDriverGetVersion, int *)
-// device management
-HIP_DEFINE2(hipError_t, hipGetDevice, hipDevice_t *, int)
-HIP_DEFINE3(hipError_t, hipDeviceGetName, char *, int, hipDevice_t)
-HIP_DEFINE3(hipError_t, hipDeviceGetPCIBusId, char *, int, hipDevice_t)
-HIP_DEFINE3(hipError_t, hipDeviceGetAttribute, int *, hipDeviceAttribute_t,
-            hipDevice_t)
-HIP_DEFINE1(hipError_t, hipGetDeviceCount, int *)
-// module management
-HIP_DEFINE4(hipError_t, hipModuleGetGlobal, hipDeviceptr_t *, size_t *,
-            hipModule_t, const char *)
-HIP_DEFINE2(hipError_t, hipModuleLoad, hipModule_t *, const char *)
-HIP_DEFINE1(hipError_t, hipModuleUnload, hipModule_t)
-HIP_DEFINE2(hipError_t, hipModuleLoadData, hipModule_t *, const void *)
-HIP_DEFINE5(hipError_t, hipModuleLoadDataEx, hipModule_t *, const void *,
-            unsigned int, hipJitOption *, void **)
-HIP_DEFINE3(hipError_t, hipModuleGetFunction, hipFunction_t *, hipModule_t,
-            const char *)
-// stream management
-HIP_DEFINE2(hipError_t, hipStreamCreate, hipStream_t *, unsigned int)
-HIP_DEFINE1(hipError_t, hipStreamSynchronize, hipStream_t)
-HIP_DEFINE1(hipError_t, hipStreamDestroy, hipStream_t)
-HIP_DEFINE11(hipError_t, hipModuleLaunchKernel, hipFunction_t, unsigned int,
-             unsigned int, unsigned int, unsigned int, unsigned int,
-             unsigned int, unsigned int, hipStream_t, void **, void **)
-// function management
-HIP_DEFINE2(hipError_t, hipFuncGetAttributes, hipFuncAttributes *, void *)
-HIP_DEFINE2(hipError_t, hipFuncSetCacheConfig, hipFunction_t, hipFuncCache_t)
-// memory management
-HIP_DEFINE3(hipError_t, hipMemcpyDtoH, void *, hipDeviceptr_t, size_t)
-HIP_DEFINE1(hipError_t, hipFree, hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemcpyDtoHAsync, void *, hipDeviceptr_t, size_t,
-            hipStream_t)
-HIP_DEFINE4(hipError_t, hipMemcpyHtoDAsync, hipDeviceptr_t, const void *,
-            size_t, hipStream_t)
-HIP_DEFINE3(hipError_t, hipMemcpyHtoD, hipDeviceptr_t, const void *, size_t)
-HIP_DEFINE2(hipError_t, hipMalloc, hipDeviceptr_t *, size_t)
-HIP_DEFINE3(hipError_t, hipPointerGetAttribute, void *, CUpointer_attribute,
-            hipDeviceptr_t)
-HIP_DEFINE4(hipError_t, hipMemsetD8Async, hipDeviceptr_t, unsigned char, size_t,
-            hipStream_t)
-// event management
-HIP_DEFINE2(hipError_t, hipEventCreate, hipEvent_t *, unsigned int)
-HIP_DEFINE3(hipError_t, hipEventElapsedTime, float *, hipEvent_t, hipEvent_t)
-HIP_DEFINE2(hipError_t, hipEventRecord, hipEvent_t, hipStream_t)
-HIP_DEFINE1(hipError_t, hipEventDestroy, hipEvent_t)
-
-/* ------------------- *
- * COMMON
- * ------------------- */
-
-// Release
-void dispatch::release() {
-  if (cuda_) {
-    dlclose(cuda_);
-    cuda_ = nullptr;
-  }
-}
-
-void *dispatch::cuda_;
-void *dispatch::nvml_;
-void *dispatch::nvmlInit_v2_;
-void *dispatch::hip_;
-
-} // namespace driver
-} // namespace triton
--- a/lib/driver/error.cc
+++ b/lib/driver/error.cc
@@ -1,270 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "triton/driver/error.h"
-
-namespace triton {
-namespace driver {
-
-void check(CUresult err) {
-  using namespace exception::cuda;
-  switch (err) {
-  case CUDA_SUCCESS:
-    break;
-  case CUDA_ERROR_INVALID_VALUE:
-    throw invalid_value();
-  case CUDA_ERROR_OUT_OF_MEMORY:
-    throw out_of_memory();
-  case CUDA_ERROR_NOT_INITIALIZED:
-    throw not_initialized();
-  case CUDA_ERROR_DEINITIALIZED:
-    throw deinitialized();
-  case CUDA_ERROR_PROFILER_DISABLED:
-    throw profiler_disabled();
-  case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
-    throw profiler_not_initialized();
-  case CUDA_ERROR_PROFILER_ALREADY_STARTED:
-    throw profiler_already_started();
-  case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
-    throw profiler_already_stopped();
-  case CUDA_ERROR_NO_DEVICE:
-    throw no_device();
-  case CUDA_ERROR_INVALID_DEVICE:
-    throw invalid_device();
-  case CUDA_ERROR_INVALID_IMAGE:
-    throw invalid_image();
-  case CUDA_ERROR_INVALID_CONTEXT:
-    throw invalid_context();
-  case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
-    throw context_already_current();
-  case CUDA_ERROR_MAP_FAILED:
-    throw map_failed();
-  case CUDA_ERROR_UNMAP_FAILED:
-    throw unmap_failed();
-  case CUDA_ERROR_ARRAY_IS_MAPPED:
-    throw array_is_mapped();
-  case CUDA_ERROR_ALREADY_MAPPED:
-    throw already_mapped();
-  case CUDA_ERROR_NO_BINARY_FOR_GPU:
-    throw no_binary_for_gpu();
-  case CUDA_ERROR_ALREADY_ACQUIRED:
-    throw already_acquired();
-  case CUDA_ERROR_NOT_MAPPED:
-    throw not_mapped();
-  case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
-    throw not_mapped_as_array();
-  case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
-    throw not_mapped_as_pointer();
-  case CUDA_ERROR_ECC_UNCORRECTABLE:
-    throw ecc_uncorrectable();
-  case CUDA_ERROR_UNSUPPORTED_LIMIT:
-    throw unsupported_limit();
-  case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
-    throw context_already_in_use();
-  case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
-    throw peer_access_unsupported();
-  case CUDA_ERROR_INVALID_PTX:
-    throw invalid_ptx();
-  case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
-    throw invalid_graphics_context();
-  case CUDA_ERROR_INVALID_SOURCE:
-    throw invalid_source();
-  case CUDA_ERROR_FILE_NOT_FOUND:
-    throw file_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
-    throw shared_object_symbol_not_found();
-  case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
-    throw shared_object_init_failed();
-  case CUDA_ERROR_OPERATING_SYSTEM:
-    throw operating_system();
-  case CUDA_ERROR_INVALID_HANDLE:
-    throw invalid_handle();
-  case CUDA_ERROR_NOT_FOUND:
-    throw not_found();
-  case CUDA_ERROR_NOT_READY:
-    throw not_ready();
-  case CUDA_ERROR_ILLEGAL_ADDRESS:
-    throw illegal_address();
-  case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
-    throw launch_out_of_resources();
-  case CUDA_ERROR_LAUNCH_TIMEOUT:
-    throw launch_timeout();
-  case CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING:
-    throw launch_incompatible_texturing();
-  case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
-    throw peer_access_already_enabled();
-  case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
-    throw peer_access_not_enabled();
-  case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
-    throw primary_context_active();
-  case CUDA_ERROR_CONTEXT_IS_DESTROYED:
-    throw context_is_destroyed();
-  case CUDA_ERROR_ASSERT:
-    throw assert_error();
-  case CUDA_ERROR_TOO_MANY_PEERS:
-    throw too_many_peers();
-  case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
-    throw host_memory_already_registered();
-  case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
-    throw host_memory_not_registered();
-  case CUDA_ERROR_HARDWARE_STACK_ERROR:
-    throw hardware_stack_error();
-  case CUDA_ERROR_ILLEGAL_INSTRUCTION:
-    throw illegal_instruction();
-  case CUDA_ERROR_MISALIGNED_ADDRESS:
-    throw misaligned_address();
-  case CUDA_ERROR_INVALID_ADDRESS_SPACE:
-    throw invalid_address_space();
-  case CUDA_ERROR_INVALID_PC:
-    throw invalid_pc();
-  case CUDA_ERROR_LAUNCH_FAILED:
-    throw launch_failed();
-  case CUDA_ERROR_NOT_PERMITTED:
-    throw not_permitted();
-  case CUDA_ERROR_NOT_SUPPORTED:
-    throw not_supported();
-  case CUDA_ERROR_UNKNOWN:
-    throw unknown();
-  default:
-    throw unknown();
-  }
-}
-
-void check(hipError_t error) {
-  using namespace exception::hip;
-  switch (error) {
-  case hipSuccess:
-    break;
-  case hipErrorInvalidValue:
-    throw invalid_value();
-  case hipErrorMemoryAllocation:
-    throw out_of_memory();
-  case hipErrorNotInitialized:
-    throw not_initialized();
-  case hipErrorDeinitialized:
-    throw deinitialized();
-  case hipErrorProfilerDisabled:
-    throw profiler_disabled();
-  case hipErrorProfilerNotInitialized:
-    throw profiler_not_initialized();
-  case hipErrorProfilerAlreadyStarted:
-    throw profiler_already_started();
-  case hipErrorProfilerAlreadyStopped:
-    throw profiler_already_stopped();
-  case hipErrorNoDevice:
-    throw no_device();
-  case hipErrorInvalidSymbol:
-    throw invalid_symbol();
-  case hipErrorInvalidDevice:
-    throw invalid_device();
-  case hipErrorInvalidImage:
-    throw invalid_image();
-  case hipErrorInvalidContext:
-    throw invalid_context();
-  case hipErrorContextAlreadyCurrent:
-    throw context_already_current();
-  case hipErrorMapFailed:
-    throw map_failed();
-  case hipErrorUnmapFailed:
-    throw unmap_failed();
-  case hipErrorArrayIsMapped:
-    throw array_is_mapped();
-  case hipErrorAlreadyMapped:
-    throw already_mapped();
-  case hipErrorNoBinaryForGpu:
-    throw no_binary_for_gpu();
-  case hipErrorAlreadyAcquired:
-    throw already_acquired();
-  case hipErrorNotMapped:
-    throw not_mapped();
-  case hipErrorNotMappedAsArray:
-    throw not_mapped_as_array();
-  case hipErrorNotMappedAsPointer:
-    throw not_mapped_as_pointer();
-  case hipErrorECCNotCorrectable:
-    throw ecc_uncorrectable();
-  case hipErrorUnsupportedLimit:
-    throw unsupported_limit();
-  case hipErrorContextAlreadyInUse:
-    throw context_already_in_use();
-  case hipErrorPeerAccessUnsupported:
-    throw peer_access_unsupported();
-  case hipErrorInvalidKernelFile:
-    throw invalid_ptx();
-  case hipErrorInvalidGraphicsContext:
-    throw invalid_graphics_context();
-  case hipErrorInvalidSource:
-    throw invalid_source();
-  case hipErrorFileNotFound:
-    throw file_not_found();
-  case hipErrorSharedObjectSymbolNotFound:
-    throw shared_object_symbol_not_found();
-  case hipErrorSharedObjectInitFailed:
-    throw shared_object_init_failed();
-  case hipErrorOperatingSystem:
-    throw operating_system();
-  case hipErrorInvalidResourceHandle:
-    throw invalid_handle();
-  case hipErrorNotFound:
-    throw not_found();
-  case hipErrorNotReady:
-    throw not_ready();
-  case hipErrorIllegalAddress:
-    throw illegal_address();
-  case hipErrorLaunchOutOfResources:
-    throw launch_out_of_resources();
-  case hipErrorLaunchTimeOut:
-    throw launch_timeout();
-  // case hipErrorLaunchIncompatibleTexturing  : throw
-  // launch_incompatible_texturing();
-  case hipErrorPeerAccessAlreadyEnabled:
-    throw peer_access_already_enabled();
-  case hipErrorPeerAccessNotEnabled:
-    throw peer_access_not_enabled();
-  // case hipErrorPrimaryContextActive         : throw primary_context_active();
-  // case hipErrorContextIsDestroyed           : throw context_is_destroyed();
-  case hipErrorAssert:
-    throw assert_error();
-  // case hipErrorTooManyPeers                 : throw too_many_peers();
-  case hipErrorHostMemoryAlreadyRegistered:
-    throw host_memory_already_registered();
-  case hipErrorHostMemoryNotRegistered:
-    throw host_memory_not_registered();
-  // case hipErrorHardwareStackError           : throw hardware_stack_error();
-  // case hipErrorIllegalInstruction            : throw illegal_instruction();
-  // case hipErrorMisalignedAddress             : throw misaligned_address();
-  // case hipErrorInvalidAddressSpace          : throw invalid_address_space();
-  // case hipErrorInvalidPc                     : throw invalid_pc();
-  case hipErrorLaunchFailure:
-    throw launch_failed();
-  // case hipErrorNotPermitted                  : throw not_permitted();
-  case hipErrorNotSupported:
-    throw not_supported();
-  case hipErrorUnknown:
-    throw unknown();
-  default:
-    throw unknown();
-  }
-}
-
-} // namespace driver
-} // namespace triton
--- a/lib/driver/llvm.cc
+++ b/lib/driver/llvm.cc
@@ -1,392 +0,0 @@
-/* Copyright 2015-2017 Philippe Tillet
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files
- * (the "Software"), to deal in the Software without restriction,
- * including without limitation the rights to use, copy, modify, merge,
- * publish, distribute, sublicense, and/or sell copies of the Software,
- * and to permit persons to whom the Software is furnished to do so,
- * subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
- * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-#include <fstream>
-
-#if defined __has_include
-#if __has_include(<unistd.h>)
-#include <unistd.h>
-#endif
-#endif
-
-#include "triton/driver/dispatch.h"
-#include "triton/driver/error.h"
-#include "triton/driver/llvm.h"
-#include "triton/tools/sha1.hpp"
-#include "triton/tools/sys/exec.hpp"
-#include "triton/tools/sys/getenv.hpp"
-#include "triton/tools/sys/mkdir.hpp"
-#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IRPrintingPasses.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <memory>
-#include <regex>
-
-// begin AMD stuff
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/Program.h"
-#include "llvm/Support/ToolOutputFile.h"
-// end AMD stuff
-
-extern "C" {
-int set_curterm(char *nterm) { return 0; }
-int del_curterm(char *nterm) { return 0; }
-int tigetnum(char *capname) { return 0; }
-int setupterm(char *term, int fildes, int *errret) { return 0; }
-}
-
-namespace triton {
-namespace driver {
-
-void init_llvm() {
-  LLVMInitializeNVPTXTargetInfo();
-  LLVMInitializeNVPTXTarget();
-  LLVMInitializeNVPTXTargetMC();
-  LLVMInitializeNVPTXAsmPrinter();
-  LLVMInitializeAMDGPUTargetInfo();
-  LLVMInitializeAMDGPUTarget();
-  LLVMInitializeAMDGPUTargetMC();
-  LLVMInitializeAMDGPUAsmPrinter();
-}
-
-/* ------------------------ */
-//         CUDA             //
-/* ------------------------ */
-static bool find_and_replace(std::string &str, const std::string &begin,
-                             const std::string &end,
-                             const std::string &target) {
-  size_t start_replace = str.find(begin);
-  if (start_replace == std::string::npos)
-    return false;
-  size_t end_replace = str.find(end, start_replace);
-  if (end_replace == std::string::npos)
-    return false;
-  str.replace(start_replace, end_replace + 1 - start_replace, target);
-  return true;
-}
-
-std::string path_to_ptxas(int &version) {
-  std::vector<std::string> rets;
-  std::string ret;
-  // search paths for ptxas
-  std::vector<std::string> ptxas_prefixes = {"", "/usr/local/cuda/bin/"};
-  std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH");
-  if (!triton_ptxas.empty())
-    ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas);
-  // see what path for ptxas are valid
-  std::vector<std::string> working_ptxas;
-  for (const std::string &prefix : ptxas_prefixes) {
-    std::string ptxas = prefix + "ptxas";
-    bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0;
-    if (works) {
-      working_ptxas.push_back(ptxas);
-      rets.push_back(ret);
-    }
-  }
-  // error if no working ptxas was found
-  if (working_ptxas.empty())
-    throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, "
-                             "/usr/local/cuda/bin/ or PATH"
-                             " but a working version could not be found.");
-  std::string ptxas = working_ptxas.front();
-  // parse version
-  std::regex version_regex("release (\\d+)\\.(\\d+)");
-  std::smatch match;
-  bool found = false;
-  // currently choosing the first ptxas. Other logics can be implemented in
-  // future
-  size_t i = 0;
-  while (i < rets.size()) {
-    if (std::regex_search(rets[i], match, version_regex)) {
-      int major = std::stoi(match[1]);
-      int minor = std::stoi(match[2]);
-      version = major * 1000 + minor * 10;
-      found = true;
-      break;
-    }
-    ++i;
-  }
-  if (not found) {
-    throw std::runtime_error("Error in parsing version");
-  }
-  return working_ptxas[i];
-}
-
-int vptx(int version) {
-  if (version >= 11040)
-    return 74;
-  if (version >= 11030)
-    return 73;
-  if (version >= 11020)
-    return 72;
-  if (version >= 11010)
-    return 71;
-  if (version >= 11000)
-    return 70;
-  if (version >= 10020)
-    return 65;
-  if (version >= 10010)
-    return 64;
-  if (version >= 10000)
-    return 63;
-  throw std::runtime_error("Triton requires CUDA 10+");
-}
-
-std::string llir_to_ptx(llvm::Module *module, int cc, int version) {
-  // LLVM version in use may not officially support target hardware
-  int max_nvvm_cc = 75;
-  int max_nvvm_ptx = 74;
-  // options
-  auto options = llvm::cl::getRegisteredOptions();
-  auto *short_ptr =
-      static_cast<llvm::cl::opt<bool> *>(options["nvptx-short-ptr"]);
-  assert(short_ptr);
-  short_ptr->setValue(true);
-  // compute capability
-  std::string sm = "sm_" + std::to_string(cc);
-  // max PTX version
-  int ptx = vptx(version);
-  int ptx_major = ptx / 10;
-  int ptx_minor = ptx % 10;
-  // create
-  llvm::SmallVector<char, 0> buffer;
-  std::string triple = "nvptx64-nvidia-cuda";
-  std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc));
-  std::string layout = "";
-  std::string features = "";
-  // std::string features = "+ptx" + std::to_string(std::min(ptx,
-  // max_nvvm_ptx));
-  init_llvm();
-  // verify and store llvm
-  llvm::legacy::PassManager pm;
-  pm.add(llvm::createVerifierPass());
-  pm.run(*module);
-  // module->print(llvm::outs(), nullptr);
-
-  // create machine
-  module->setTargetTriple(triple);
-  std::string error;
-  auto target =
-      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
-  llvm::TargetOptions opt;
-  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
-  opt.NoInfsFPMath = false;
-  opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
-      llvm::None, llvm::CodeGenOpt::Aggressive);
-  // set data layout
-  if (layout.empty())
-    module->setDataLayout(machine->createDataLayout());
-  else
-    module->setDataLayout(layout);
-  // emit machine code
-  for (llvm::Function &f : module->functions())
-    f.addFnAttr(llvm::Attribute::AlwaysInline);
-  llvm::legacy::PassManager pass;
-  llvm::raw_svector_ostream stream(buffer);
-  // emit
-  machine->addPassesToEmitFile(pass, stream, nullptr,
-                               llvm::CodeGenFileType::CGFT_AssemblyFile);
-  pass.run(*module);
-
-  // post-process
-  std::string result(buffer.begin(), buffer.end());
-  find_and_replace(result, ".version", "\n",
-                   ".version " + std::to_string(ptx_major) + "." +
-                       std::to_string(ptx_minor) + "\n");
-  find_and_replace(result, ".target", "\n", ".target " + sm + "\n");
-  while (find_and_replace(result, "\t// begin inline asm", "\n", ""))
-    ;
-  while (find_and_replace(result, "\t// end inline asm", "\n", ""))
-    ;
-  return result;
-}
-
-std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas,
-                         int cc) {
-  // compile ptx with ptxas
-  char _fsrc[L_tmpnam];
-  char _flog[L_tmpnam];
-  std::tmpnam(_fsrc);
-  std::tmpnam(_flog);
-  std::string fsrc = _fsrc;
-  std::string flog = _flog;
-  std::string fbin = fsrc + ".o";
-  const char *_fbin = fbin.c_str();
-  std::ofstream ofs(fsrc);
-  ofs << ptx << std::endl;
-  ofs.close();
-  std::string cmd;
-  int err;
-  cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc +
-        " -o " + fsrc + ".o 2> " + flog;
-  err = system(cmd.c_str());
-  if (err != 0) {
-    std::ifstream _log(_flog);
-    std::string log(std::istreambuf_iterator<char>(_log), {});
-    unlink(_fsrc);
-    unlink(_flog);
-    throw std::runtime_error("Internal Triton PTX codegen error: \n" + log);
-  }
-  CUmodule ret;
-  std::ifstream _cubin(_fbin, std::ios::binary);
-  std::string cubin(std::istreambuf_iterator<char>(_cubin), {});
-  _cubin.close();
-  unlink(_fsrc);
-  unlink(_flog);
-  unlink(_fbin);
-  dispatch::cuModuleLoadData(&ret, cubin.c_str());
-  return cubin;
-}
-
-/* ------------------------ */
-//         HIP              //
-/* ------------------------ */
-
-std::string llir_to_amdgpu(llvm::Module *module, const std::string &_proc) {
-  init_llvm();
-
-  //  proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo));
-  //  features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo));
-
-  // create
-  llvm::SmallVector<char, 0> buffer;
-  std::string triple = "amdgcn-amd-amdhsa";
-  std::string layout = "";
-  std::string features;
-  std::string proc = "gfx908";
-  // verify and store llvm
-  llvm::legacy::PassManager pm;
-  pm.add(llvm::createVerifierPass());
-  pm.run(*module);
-  // create machine
-  module->setTargetTriple(triple);
-  std::string error;
-  auto target =
-      llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error);
-  llvm::TargetOptions opt;
-  opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
-  opt.NoInfsFPMath = false;
-  opt.NoNaNsFPMath = true;
-  llvm::TargetMachine *machine = target->createTargetMachine(
-      module->getTargetTriple(), proc, features, opt, llvm::Reloc::PIC_,
-      llvm::None, llvm::CodeGenOpt::Aggressive);
-  // set data layout
-  if (layout.empty())
-    module->setDataLayout(machine->createDataLayout());
-  else
-    module->setDataLayout(layout);
-  // emit machine code
-  for (llvm::Function &f : module->functions())
-    f.addFnAttr(llvm::Attribute::AlwaysInline);
-  llvm::legacy::PassManager pass;
-  llvm::raw_svector_ostream stream(buffer);
-
-  // create dump files
-  std::string module_name = module->getModuleIdentifier();
-  std::error_code ec;
-
-  // Save GCN ISA binary.
-  std::string isabin_path =
-      std::string("/tmp/") + module_name + std::string(".o");
-  std::unique_ptr<llvm::raw_fd_ostream> isabin_fs(
-      new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text));
-  if (ec) {
-    std::cout << isabin_path << " was not created. error code: " << ec
-              << std::endl;
-  }
-
-  // emit
-  machine->addPassesToEmitFile(pass, *isabin_fs, nullptr,
-                               llvm::CGFT_ObjectFile);
-  pass.run(*module);
-  // Save GCN ISA.
-  std::string amdgcn_path =
-      std::string("/tmp/") + module_name + std::string(".gcn");
-  std::string result(buffer.begin(), buffer.end());
-  std::ofstream amdgcn(amdgcn_path);
-  amdgcn << result;
-  amdgcn.close();
-
-  // generate HASCO file
-  std::string hsaco_path =
-      std::string("/tmp/") + module_name + std::string(".hsaco");
-  std::string error_message;
-  int lld_result =
-      llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld",
-                                {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu",
-                                 "-shared", "-o", hsaco_path, isabin_path},
-                                llvm::None, {}, 0, 0, &error_message);
-  if (lld_result) {
-    std::cout << "ld.lld execute fail: " << std::endl;
-    std::cout << error_message << std::endl;
-    std::cout << lld_result << std::endl;
-  }
-
-  return hsaco_path;
-}
-
-hipModule_t amdgpu_to_hipmodule(const std::string &path) {
-  // Read HSACO.
-  std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate);
-  std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg();
-
-  std::vector<unsigned char> hsaco(hsaco_file_size);
-  hsaco_file.seekg(0, std::ios::beg);
-  hsaco_file.read(reinterpret_cast<char *>(&hsaco[0]), hsaco_file_size);
-  hsaco_file.close();
-  hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes,
-                        hipJitOptionErrorLogBuffer,
-                        hipJitOptionInfoLogBufferSizeBytes,
-                        hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose};
-  const unsigned int errbufsize = 8192;
-  const unsigned int logbufsize = 8192;
-  char _err[errbufsize];
-  char _log[logbufsize];
-  void *optval[] = {(void *)(uintptr_t)errbufsize, (void *)_err,
-                    (void *)(uintptr_t)logbufsize, (void *)_log, (void *)1};
-  hipModule_t ret;
-  dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval);
-  return ret;
-}
-
-} // namespace driver
-} // namespace triton