[CI] run clang-format (#24)

2022-07-26 17:25:03 -07:00
parent 25357083e6
commit 6d62d88d4f
62 changed files with 13673 additions and 11367 deletions
--- a/include/triton/Analysis/AxisInfo.h
+++ b/include/triton/Analysis/AxisInfo.h
@@ -10,7 +10,6 @@

 namespace mlir {

-
 //===----------------------------------------------------------------------===//
 // AxisInfo
 //===----------------------------------------------------------------------===//
@@ -25,26 +24,25 @@ public:

 public:
  // Default constructor
-  AxisInfo(): AxisInfo({}, {}, {}) { }
+  AxisInfo() : AxisInfo({}, {}, {}) {}
  // Construct contiguity info with known contiguity
  AxisInfo(ContiguityT knownContiguity, DivisibilityT knownDivisibility,
           ConstancyT knownConstancy)
-    : contiguity(knownContiguity), divisibility(knownDivisibility), 
-      constancy(knownConstancy), rank(contiguity.size()) { 
-      assert(knownDivisibility.size() == rank);
-      assert(knownConstancy.size() == rank);
-    }
-  
-  
+      : contiguity(knownContiguity), divisibility(knownDivisibility),
+        constancy(knownConstancy), rank(contiguity.size()) {
+    assert(knownDivisibility.size() == rank);
+    assert(knownConstancy.size() == rank);
+  }
+
  // Accessors
-  int getContiguity(size_t d) const { return contiguity[d];   }
-  const ContiguityT& getContiguity() const { return contiguity; }
+  int getContiguity(size_t d) const { return contiguity[d]; }
+  const ContiguityT &getContiguity() const { return contiguity; }

  int getDivisibility(size_t d) const { return divisibility[d]; }
-  const DivisibilityT& getDivisibility() const { return divisibility; }
+  const DivisibilityT &getDivisibility() const { return divisibility; }

-  int getConstancy(size_t d) const { return constancy[d];    }
-  const ConstancyT& getConstancy() const { return constancy; }
+  int getConstancy(size_t d) const { return constancy[d]; }
+  const ConstancyT &getConstancy() const { return constancy; }

  int getRank() const { return rank; }

@@ -56,13 +54,13 @@ public:
  }

  /// The pessimistic value state of the contiguity is unknown.
-  static AxisInfo getPessimisticValueState(MLIRContext *context) 
-  { return AxisInfo(); }
+  static AxisInfo getPessimisticValueState(MLIRContext *context) {
+    return AxisInfo();
+  }
  static AxisInfo getPessimisticValueState(Value value);

  // The gcd of both arguments for each dimension
-  static AxisInfo join(const AxisInfo &lhs,
-                       const AxisInfo &rhs);
+  static AxisInfo join(const AxisInfo &lhs, const AxisInfo &rhs);

 private:
  /// The _contiguity_ information maps the `d`-th
@@ -81,7 +79,7 @@ private:
  /// [19, 23, 27, 31]
  /// Would have contiguity [2, 1].
  ContiguityT contiguity;
-  
+
  /// The _divisibility_ information maps the `d`-th
  /// dimension to the largest power-of-two that
  /// divides the first element of all the values along it
@@ -107,39 +105,36 @@ private:
  /// [16, 16, 16, 16, 20, 20, 20, 20]
  /// would have constancy [1, 4]
  ConstancyT constancy;
-  
+
  // number of dimensions of the lattice
  int rank;
 };

-
-class AxisInfoAnalysis
-    : public ForwardDataFlowAnalysis<AxisInfo> {
+class AxisInfoAnalysis : public ForwardDataFlowAnalysis<AxisInfo> {

 private:
  static const int maxPow2Divisor = 65536;
- 
-  int highestPowOf2Divisor(int n){
-    if(n==0)
+
+  int highestPowOf2Divisor(int n) {
+    if (n == 0)
      return maxPow2Divisor;
    return (n & (~(n - 1)));
  }

-  AxisInfo visitBinaryOp(Operation* op, AxisInfo lhsInfo, AxisInfo rhsInfo,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getContiguity,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getDivisibility,
-                         const std::function<int(AxisInfo,AxisInfo,int)>& getConstancy);
+  AxisInfo visitBinaryOp(
+      Operation *op, AxisInfo lhsInfo, AxisInfo rhsInfo,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getContiguity,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getDivisibility,
+      const std::function<int(AxisInfo, AxisInfo, int)> &getConstancy);

 public:
  using ForwardDataFlowAnalysis<AxisInfo>::ForwardDataFlowAnalysis;

-  ChangeResult visitOperation(Operation *op,
-                      ArrayRef<LatticeElement<AxisInfo> *> operands) override;
-
+  ChangeResult
+  visitOperation(Operation *op,
+                 ArrayRef<LatticeElement<AxisInfo> *> operands) override;
 };

-
-}
- 
+} // namespace mlir

 #endif
--- a/include/triton/Conversion/Passes.h
+++ b/include/triton/Conversion/Passes.h
@@ -3,17 +3,13 @@

 #include "triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h"

-namespace mlir
-{
-namespace triton
-{
+namespace mlir {
+namespace triton {

 #define GEN_PASS_REGISTRATION
 #include "triton/Conversion/Passes.h.inc"

-
 } // namespace triton
 } // namespace mlir

-
 #endif
--- a/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h
+++ b/include/triton/Conversion/TritonToTritonGPU/TritonToTritonGPU.h
@@ -3,18 +3,17 @@

 #include <memory>

-namespace mlir{
+namespace mlir {

 class ModuleOp;
 template <typename T> class OperationPass;

-namespace triton{
+namespace triton {

-std::unique_ptr<OperationPass<ModuleOp>> 
+std::unique_ptr<OperationPass<ModuleOp>>
 createConvertTritonToTritonGPUPass(int numWarps = 4);

 }
 } // namespace mlir

-
 #endif
--- a/include/triton/Dialect/Triton/IR/Dialect.h
+++ b/include/triton/Dialect/Triton/IR/Dialect.h
@@ -1,17 +1,16 @@
 #ifndef TRITON_DIALECT_TRITON_IR_DIALECT_H_
 #define TRITON_DIALECT_TRITON_IR_DIALECT_H_

-
+#include "mlir/Dialect/SCF/SCF.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/Dialect.h"
 #include "mlir/Interfaces/ControlFlowInterfaces.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/Dialect/SCF/SCF.h"

-#include "triton/Dialect/Triton/IR/Traits.h"
-#include "triton/Dialect/Triton/IR/Types.h"
 #include "triton/Dialect/Triton/IR/Dialect.h.inc"
 #include "triton/Dialect/Triton/IR/OpsEnums.h.inc"
+#include "triton/Dialect/Triton/IR/Traits.h"
+#include "triton/Dialect/Triton/IR/Types.h"

 #define GET_OP_CLASSES
 #include "triton/Dialect/Triton/IR/Ops.h.inc"
--- a/include/triton/Dialect/Triton/IR/Traits.h
+++ b/include/triton/Dialect/Triton/IR/Traits.h
@@ -19,7 +19,7 @@ public:
  static LogicalResult verifyTrait(Operation *op) {
    // The rationale for this number is to prevent users from creating programs
    // that would have catastrophic register pressure and cause the compiler to
-    // hang. 
+    // hang.
    // Since H100 has 256KB registers, we should allow users to create tensors
    // of size up to 256K elements. It will spill for datatypes wider than 1B,
    // but we probably should limit number of elements (rather than bytes) to
@@ -31,8 +31,8 @@ public:
        for (int64_t s : tensorType.getShape())
          numElements *= s;
        if (numElements > maxElement)
-          return op->emitError("Maximum allowed number of elements is ") << maxElement << ", but "
-                 << *op << " has more than that";
+          return op->emitError("Maximum allowed number of elements is ")
+                 << maxElement << ", but " << *op << " has more than that";
        if ((numElements & (numElements - 1)) != 0)
          return op->emitError("Number of elements must be power-of-two, but ")
                 << *op << " doesn't follow the rule";
@@ -45,8 +45,8 @@ public:
        for (int64_t s : tensorType.getShape())
          numElements *= s;
        if (numElements > maxElement)
-          return op->emitError("Maximum allowed number of elements is ") << maxElement << ", but "
-                 << *op << " has more than that";
+          return op->emitError("Maximum allowed number of elements is ")
+                 << maxElement << ", but " << *op << " has more than that";
        if ((numElements & (numElements - 1)) != 0)
          return op->emitError("Number of elements must be power-of-two, but ")
                 << *op << " doesn't follow the rule";
@@ -57,7 +57,7 @@ public:
  }
 };

-}
-}
+} // namespace OpTrait
+} // namespace mlir

 #endif
--- a/include/triton/Dialect/Triton/Transforms/Passes.h
+++ b/include/triton/Dialect/Triton/Transforms/Passes.h
@@ -13,6 +13,6 @@ std::unique_ptr<Pass> createCombineOpsPass();
 #define GEN_PASS_REGISTRATION
 #include "triton/Dialect/Triton/Transforms/Passes.h.inc"

-}
+} // namespace mlir

 #endif
--- a/include/triton/Dialect/TritonGPU/IR/Dialect.h
+++ b/include/triton/Dialect/TritonGPU/IR/Dialect.h
@@ -15,5 +15,4 @@
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.h.inc"

-
 #endif // TRITON_DIALECT_TRITONGPU_IR_DIALECT_H_
--- a/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h
+++ b/include/triton/Dialect/TritonGPU/Transforms/TritonGPUConversion.h
@@ -14,6 +14,7 @@ namespace mlir {
 class TritonGPUTypeConverter : public TypeConverter {
 public:
  TritonGPUTypeConverter(MLIRContext *context, int numThreads);
+
 private:
  MLIRContext *context;
  int numThreads;
@@ -21,8 +22,10 @@ private:

 class TritonGPUConversionTarget : public ConversionTarget {
  TritonGPUTypeConverter &typeConverter;
+
 public:
-  explicit TritonGPUConversionTarget(MLIRContext &ctx, TritonGPUTypeConverter &typeConverter);
+  explicit TritonGPUConversionTarget(MLIRContext &ctx,
+                                     TritonGPUTypeConverter &typeConverter);

  /// update layouts & insert ConvertLayoutOp if necessary
  LogicalResult refineLayouts(ModuleOp mod, int numThreads);
--- a/include/triton/driver/dispatch.h
+++ b/include/triton/driver/dispatch.h
@@ -3,10 +3,10 @@
 #ifndef _TRITON_DRIVER_DISPATCH_H_
 #define _TRITON_DRIVER_DISPATCH_H_

-#include <type_traits>
 #include <dlfcn.h>
+#include <type_traits>

-//CUDA Backend
+// CUDA Backend
 #include "triton/external/CUDA/cuda.h"
 #include "triton/external/CUDA/nvml.h"

@@ -14,47 +14,43 @@
 //#define __HIP_PLATFORM_AMD__
 #include "triton/external/hip.h"

-//Exceptions
+// Exceptions
 #include <iostream>
 #include <stdexcept>

 namespace llvm {
 class PassRegistry;
 class Module;
-}
+} // namespace llvm

-namespace triton
-{
-namespace driver
-{
+namespace triton {
+namespace driver {

 class cu_context;

-template<class T> void check(T){}
+template <class T> void check(T) {}
 void check(CUresult err);
 void check(hipError_t err);

-class dispatch
-{
+class dispatch {
 protected:
-  template <class F>
-  struct return_type;
+  template <class F> struct return_type;

-  template <class R, class... A>
-  struct return_type<R (*)(A...)>
-  { typedef R type; };
+  template <class R, class... A> struct return_type<R (*)(A...)> {
+    typedef R type;
+  };

  typedef bool (*f_init_t)();

-  template<f_init_t initializer, typename FunPtrT, typename... Args>
-  static typename return_type<FunPtrT>::type f_impl(void*& lib_h, FunPtrT, void*& cache, const char * name, Args... args)
-  {
+  template <f_init_t initializer, typename FunPtrT, typename... Args>
+  static typename return_type<FunPtrT>::type
+  f_impl(void *&lib_h, FunPtrT, void *&cache, const char *name, Args... args) {
    initializer();
-    if(cache == nullptr){
+    if (cache == nullptr) {
      cache = dlsym(lib_h, name);
-			if(cache == 0)
-				throw std::runtime_error("dlsym unable to load function");
-		}
+      if (cache == 0)
+        throw std::runtime_error("dlsym unable to load function");
+    }
    FunPtrT fptr;
    *reinterpret_cast<void **>(&fptr) = cache;
    typename return_type<FunPtrT>::type res = (*fptr)(args...);
@@ -76,63 +72,99 @@ public:
  // context management
  static CUresult cuInit(unsigned int Flags);
  static CUresult cuCtxDestroy_v2(CUcontext ctx);
-  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags, CUdevice dev);
+  static CUresult cuCtxCreate_v2(CUcontext *pctx, unsigned int flags,
+                                 CUdevice dev);
  static CUresult cuCtxPushCurrent_v2(CUcontext ctx);
  static CUresult cuCtxPopCurrent_v2(CUcontext *pctx);
-  static CUresult cuCtxGetDevice(CUdevice* result);
-  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext, unsigned int flags);
+  static CUresult cuCtxGetDevice(CUdevice *result);
+  static CUresult cuCtxEnablePeerAccess(CUcontext peerContext,
+                                        unsigned int flags);
  static CUresult cuDriverGetVersion(int *driverVersion);
  // device management
  static CUresult cuDeviceGet(CUdevice *device, int ordinal);
  static CUresult cuDeviceGetName(char *name, int len, CUdevice dev);
  static CUresult cuDeviceGetPCIBusId(char *id, int len, CUdevice dev);
-  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib, CUdevice dev);
+  static CUresult cuDeviceGetAttribute(int *pi, CUdevice_attribute attrib,
+                                       CUdevice dev);
  static CUresult cuDeviceGetCount(int *count);
  // link management
-  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type, void* data, size_t size, const char* name, unsigned int numOptions, CUjit_option* options, void** optionValues);
-  static CUresult cuLinkCreate_v2(unsigned int  numOptions, CUjit_option* options, void** optionValues, CUlinkState* stateOut);
-  static CUresult cuLinkComplete(CUlinkState state, void** cubinOut, size_t* sizeOut);
+  static CUresult cuLinkAddData_v2(CUlinkState state, CUjitInputType type,
+                                   void *data, size_t size, const char *name,
+                                   unsigned int numOptions,
+                                   CUjit_option *options, void **optionValues);
+  static CUresult cuLinkCreate_v2(unsigned int numOptions,
+                                  CUjit_option *options, void **optionValues,
+                                  CUlinkState *stateOut);
+  static CUresult cuLinkComplete(CUlinkState state, void **cubinOut,
+                                 size_t *sizeOut);
  static CUresult cuLinkDestroy(CUlinkState state);
  // module management
-  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t* bytes, CUmodule hmod, const char *name);
+  static CUresult cuModuleGetGlobal_v2(CUdeviceptr *dptr, size_t *bytes,
+                                       CUmodule hmod, const char *name);
  static CUresult cuModuleLoad(CUmodule *module, const char *fname);
-  static CUresult cuModuleLoadData(CUmodule* module, const void* image);
+  static CUresult cuModuleLoadData(CUmodule *module, const void *image);
  static CUresult cuModuleUnload(CUmodule hmod);
-  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
-  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
+  static CUresult cuModuleLoadDataEx(CUmodule *module, const void *image,
+                                     unsigned int numOptions,
+                                     CUjit_option *options,
+                                     void **optionValues);
+  static CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod,
+                                      const char *name);
  // stream management
  static CUresult cuStreamCreate(CUstream *phStream, unsigned int Flags);
  static CUresult cuStreamSynchronize(CUstream hStream);
-  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext* pctx);
+  static CUresult cuStreamGetCtx(CUstream hStream, CUcontext *pctx);
  static CUresult cuStreamDestroy_v2(CUstream hStream);
-  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, CUstream hStream, void **kernelParams, void **extra);
+  static CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX,
+                                 unsigned int gridDimY, unsigned int gridDimZ,
+                                 unsigned int blockDimX, unsigned int blockDimY,
+                                 unsigned int blockDimZ,
+                                 unsigned int sharedMemBytes, CUstream hStream,
+                                 void **kernelParams, void **extra);
  // function management
-  static CUresult cuFuncGetAttribute(int* pi, CUfunction_attribute attrib, CUfunction hfunc);
-  static CUresult cuFuncSetAttribute(CUfunction hfunc, CUfunction_attribute attrib, int value);
+  static CUresult cuFuncGetAttribute(int *pi, CUfunction_attribute attrib,
+                                     CUfunction hfunc);
+  static CUresult cuFuncSetAttribute(CUfunction hfunc,
+                                     CUfunction_attribute attrib, int value);
  static CUresult cuFuncSetCacheConfig(CUfunction hfunc, CUfunc_cache config);
  // memory management
  static CUresult cuMemAlloc_v2(CUdeviceptr *dptr, size_t bytesize);
-  static CUresult cuPointerGetAttribute(void * data, CUpointer_attribute attribute, CUdeviceptr ptr);
-  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N, CUstream stream);
-  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount);
+  static CUresult cuPointerGetAttribute(void *data,
+                                        CUpointer_attribute attribute,
+                                        CUdeviceptr ptr);
+  static CUresult cuMemsetD8Async(CUdeviceptr dst, unsigned char x, size_t N,
+                                  CUstream stream);
+  static CUresult cuMemcpyDtoH_v2(void *dstHost, CUdeviceptr srcDevice,
+                                  size_t ByteCount);
  static CUresult cuMemFree_v2(CUdeviceptr dptr);
-  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount, CUstream hStream);
-  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost, size_t ByteCount);
+  static CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice,
+                                       size_t ByteCount, CUstream hStream);
+  static CUresult cuMemcpyHtoDAsync_v2(CUdeviceptr dstDevice,
+                                       const void *srcHost, size_t ByteCount,
+                                       CUstream hStream);
+  static CUresult cuMemcpyHtoD_v2(CUdeviceptr dstDevice, const void *srcHost,
+                                  size_t ByteCount);
  // event management
  static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags);
-  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart, CUevent hEnd);
+  static CUresult cuEventElapsedTime(float *pMilliseconds, CUevent hStart,
+                                     CUevent hEnd);
  static CUresult cuEventRecord(CUevent hEvent, CUstream hStream);
  static CUresult cuEventDestroy_v2(CUevent hEvent);

-
  /* ------------------- *
   * NVML
   * ------------------- */
-  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2( const char* pciBusId, nvmlDevice_t* device);
-  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device, nvmlClockType_t type, unsigned int *clock);
-  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device, unsigned int mem_clock, unsigned int sm_clock);
+  static nvmlReturn_t nvmlDeviceGetHandleByPciBusId_v2(const char *pciBusId,
+                                                       nvmlDevice_t *device);
+  static nvmlReturn_t nvmlDeviceGetClockInfo(nvmlDevice_t device,
+                                             nvmlClockType_t type,
+                                             unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceGetMaxClockInfo(nvmlDevice_t device,
+                                                nvmlClockType_t type,
+                                                unsigned int *clock);
+  static nvmlReturn_t nvmlDeviceSetApplicationsClocks(nvmlDevice_t device,
+                                                      unsigned int mem_clock,
+                                                      unsigned int sm_clock);

  /* ------------------- *
   * HIP
@@ -140,177 +172,198 @@ public:
  // context management
  static hipError_t hipInit(unsigned int Flags);
  static hipError_t hipCtxDestroy(hipCtx_t ctx);
-  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags, hipDevice_t dev);
+  static hipError_t hipCtxCreate(hipCtx_t *pctx, unsigned int flags,
+                                 hipDevice_t dev);
  static hipError_t hipCtxPushCurrent(hipCtx_t ctx);
  static hipError_t hipCtxPopCurrent(hipCtx_t *pctx);
-  static hipError_t hipCtxGetDevice(hipDevice_t* result);
-  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext, unsigned int flags);
+  static hipError_t hipCtxGetDevice(hipDevice_t *result);
+  static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerContext,
+                                           unsigned int flags);
  static hipError_t hipDriverGetVersion(int *driverVersion);
  // device management
  static hipError_t hipGetDevice(hipDevice_t *device, int ordinal);
  static hipError_t hipDeviceGetName(char *name, int len, hipDevice_t dev);
  static hipError_t hipDeviceGetPCIBusId(char *id, int len, hipDevice_t dev);
-  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib, hipDevice_t dev);
+  static hipError_t hipDeviceGetAttribute(int *pi, hipDeviceAttribute_t attrib,
+                                          hipDevice_t dev);
  static hipError_t hipGetDeviceCount(int *count);
  // module management
-  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t* bytes, hipModule_t hmod, const char *name);
+  static hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
+                                       hipModule_t hmod, const char *name);
  static hipError_t hipModuleLoad(hipModule_t *module, const char *fname);
-  static hipError_t hipModuleLoadData(hipModule_t* module, const void* image);
+  static hipError_t hipModuleLoadData(hipModule_t *module, const void *image);
  static hipError_t hipModuleUnload(hipModule_t hmod);
-  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues);
-  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name);
+  static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image,
+                                        unsigned int numOptions,
+                                        hipJitOption *options,
+                                        void **optionValues);
+  static hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod,
+                                         const char *name);
  // stream management
  static hipError_t hipStreamCreate(hipStream_t *phStream, unsigned int Flags);
  static hipError_t hipStreamSynchronize(hipStream_t hStream);
  static hipError_t hipStreamDestroy(hipStream_t hStream);
-  static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, unsigned int sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra);
+  static hipError_t
+  hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+                        unsigned int gridDimY, unsigned int gridDimZ,
+                        unsigned int blockDimX, unsigned int blockDimY,
+                        unsigned int blockDimZ, unsigned int sharedMemBytes,
+                        hipStream_t hStream, void **kernelParams, void **extra);
  // function management
-  static hipError_t hipFuncGetAttributes(hipFuncAttributes* attrib, void* hfunc);
-  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc, hipFuncAttribute attrib, int value);
-  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc, hipFuncCache_t config);
+  static hipError_t hipFuncGetAttributes(hipFuncAttributes *attrib,
+                                         void *hfunc);
+  static hipError_t hipFuncSetAttribute(hipFunction_t hfunc,
+                                        hipFuncAttribute attrib, int value);
+  static hipError_t hipFuncSetCacheConfig(hipFunction_t hfunc,
+                                          hipFuncCache_t config);
  // memory management
  static hipError_t hipMalloc(hipDeviceptr_t *dptr, size_t bytesize);
-  static hipError_t hipPointerGetAttribute(void * data, CUpointer_attribute attribute, hipDeviceptr_t ptr);
-  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x, size_t N, hipStream_t stream);
-  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount);
+  static hipError_t hipPointerGetAttribute(void *data,
+                                           CUpointer_attribute attribute,
+                                           hipDeviceptr_t ptr);
+  static hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char x,
+                                     size_t N, hipStream_t stream);
+  static hipError_t hipMemcpyDtoH(void *dstHost, hipDeviceptr_t srcDevice,
+                                  size_t ByteCount);
  static hipError_t hipFree(hipDeviceptr_t dptr);
-  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice, size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount, hipStream_t hStream);
-  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost, size_t ByteCount);
+  static hipError_t hipMemcpyDtoHAsync(void *dstHost, hipDeviceptr_t srcDevice,
+                                       size_t ByteCount, hipStream_t hStream);
+  static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dstDevice,
+                                       const void *srcHost, size_t ByteCount,
+                                       hipStream_t hStream);
+  static hipError_t hipMemcpyHtoD(hipDeviceptr_t dstDevice, const void *srcHost,
+                                  size_t ByteCount);
  // event management
  static hipError_t hipEventCreate(hipEvent_t *phEvent, unsigned int Flags);
-  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart, hipEvent_t hEnd);
+  static hipError_t hipEventElapsedTime(float *pMilliseconds, hipEvent_t hStart,
+                                        hipEvent_t hEnd);
  static hipError_t hipEventRecord(hipEvent_t hEvent, hipStream_t hStream);
  static hipError_t hipEventDestroy(hipEvent_t hEvent);

-
-
 private:
-
  // Libraries
-  static void* cuda_;
-  static void* nvml_;
-  static void* hip_;
-
+  static void *cuda_;
+  static void *nvml_;
+  static void *hip_;

  /* ------------------- *
   * CUDA
   * ------------------- */
  // context management
-  static void* cuCtxGetCurrent_;
-  static void* cuCtxSetCurrent_;
-  static void* cuCtxDestroy_v2_;
-  static void* cuCtxCreate_v2_;
-  static void* cuCtxGetDevice_;
-  static void* cuCtxPushCurrent_v2_;
-  static void* cuCtxPopCurrent_v2_;
-  static void* cuCtxEnablePeerAccess_;
-  static void* cuDriverGetVersion_;
-  static void* cuInit_;
+  static void *cuCtxGetCurrent_;
+  static void *cuCtxSetCurrent_;
+  static void *cuCtxDestroy_v2_;
+  static void *cuCtxCreate_v2_;
+  static void *cuCtxGetDevice_;
+  static void *cuCtxPushCurrent_v2_;
+  static void *cuCtxPopCurrent_v2_;
+  static void *cuCtxEnablePeerAccess_;
+  static void *cuDriverGetVersion_;
+  static void *cuInit_;
  // device management
-  static void* cuDeviceGet_;
-  static void* cuDeviceGetName_;
-  static void* cuDeviceGetPCIBusId_;
-  static void* cuDeviceGetAttribute_;
-  static void* cuDeviceGetCount_;
+  static void *cuDeviceGet_;
+  static void *cuDeviceGetName_;
+  static void *cuDeviceGetPCIBusId_;
+  static void *cuDeviceGetAttribute_;
+  static void *cuDeviceGetCount_;
  // link management
-  static void* cuLinkAddData_v2_;
-  static void* cuLinkCreate_v2_;
-  static void* cuLinkDestroy_;
-  static void* cuLinkComplete_;
+  static void *cuLinkAddData_v2_;
+  static void *cuLinkCreate_v2_;
+  static void *cuLinkDestroy_;
+  static void *cuLinkComplete_;
  // module management
-  static void* cuModuleGetGlobal_v2_;
-  static void* cuModuleLoad_;
-  static void* cuModuleUnload_;
-  static void* cuModuleLoadDataEx_;
-  static void* cuModuleLoadData_;
-  static void* cuModuleGetFunction_;
+  static void *cuModuleGetGlobal_v2_;
+  static void *cuModuleLoad_;
+  static void *cuModuleUnload_;
+  static void *cuModuleLoadDataEx_;
+  static void *cuModuleLoadData_;
+  static void *cuModuleGetFunction_;
  // stream management
-  static void* cuStreamCreate_;
-  static void* cuStreamSynchronize_;
-  static void* cuStreamDestroy_v2_;
-  static void* cuStreamGetCtx_;
-  static void* cuLaunchKernel_;
+  static void *cuStreamCreate_;
+  static void *cuStreamSynchronize_;
+  static void *cuStreamDestroy_v2_;
+  static void *cuStreamGetCtx_;
+  static void *cuLaunchKernel_;
  // function management
-  static void* cuFuncGetAttribute_;
-  static void* cuFuncSetAttribute_;
-  static void* cuFuncSetCacheConfig_;
+  static void *cuFuncGetAttribute_;
+  static void *cuFuncSetAttribute_;
+  static void *cuFuncSetCacheConfig_;
  // memory management
-  static void* cuMemcpyDtoH_v2_;
-  static void* cuMemFree_v2_;
-  static void* cuMemcpyDtoHAsync_v2_;
-  static void* cuMemcpyHtoDAsync_v2_;
-  static void* cuMemcpyHtoD_v2_;
-  static void* cuMemAlloc_v2_;
-  static void* cuMemsetD8Async_;
-  static void* cuPointerGetAttribute_;
+  static void *cuMemcpyDtoH_v2_;
+  static void *cuMemFree_v2_;
+  static void *cuMemcpyDtoHAsync_v2_;
+  static void *cuMemcpyHtoDAsync_v2_;
+  static void *cuMemcpyHtoD_v2_;
+  static void *cuMemAlloc_v2_;
+  static void *cuMemsetD8Async_;
+  static void *cuPointerGetAttribute_;
  // event management
-  static void* cuEventCreate_;
-  static void* cuEventElapsedTime_;
-  static void* cuEventRecord_;
-  static void* cuEventDestroy_v2_;
+  static void *cuEventCreate_;
+  static void *cuEventElapsedTime_;
+  static void *cuEventRecord_;
+  static void *cuEventDestroy_v2_;

  /* ------------------- *
   * NVML
   * ------------------- */
-  static void* nvmlInit_v2_;
-  static void* nvmlDeviceGetHandleByPciBusId_v2_;
-  static void* nvmlDeviceGetClockInfo_;
-  static void* nvmlDeviceGetMaxClockInfo_;
-  static void* nvmlDeviceSetApplicationsClocks_;
+  static void *nvmlInit_v2_;
+  static void *nvmlDeviceGetHandleByPciBusId_v2_;
+  static void *nvmlDeviceGetClockInfo_;
+  static void *nvmlDeviceGetMaxClockInfo_;
+  static void *nvmlDeviceSetApplicationsClocks_;

  /* ------------------- *
   * HIP
   * ------------------- */
  // context management
-  static void* hipInit_;
-  static void* hipCtxDestroy_;
-  static void* hipCtxCreate_;
-  static void* hipCtxPushCurrent_;
-  static void* hipCtxPopCurrent_;
-  static void* hipCtxGetDevice_;
-  static void* hipCtxEnablePeerAccess_;
-  static void* hipDriverGetVersion_;
+  static void *hipInit_;
+  static void *hipCtxDestroy_;
+  static void *hipCtxCreate_;
+  static void *hipCtxPushCurrent_;
+  static void *hipCtxPopCurrent_;
+  static void *hipCtxGetDevice_;
+  static void *hipCtxEnablePeerAccess_;
+  static void *hipDriverGetVersion_;
  // device management
-  static void* hipGetDevice_;
-  static void* hipDeviceGetName_;
-  static void* hipDeviceGetPCIBusId_;
-  static void* hipDeviceGetAttribute_;
-  static void* hipGetDeviceCount_;
+  static void *hipGetDevice_;
+  static void *hipDeviceGetName_;
+  static void *hipDeviceGetPCIBusId_;
+  static void *hipDeviceGetAttribute_;
+  static void *hipGetDeviceCount_;
  // module management
-  static void* hipModuleGetGlobal_;
-  static void* hipModuleLoad_;
-  static void* hipModuleLoadData_;
-  static void* hipModuleUnload_;
-  static void* hipModuleLoadDataEx_;
-  static void* hipModuleGetFunction_;
+  static void *hipModuleGetGlobal_;
+  static void *hipModuleLoad_;
+  static void *hipModuleLoadData_;
+  static void *hipModuleUnload_;
+  static void *hipModuleLoadDataEx_;
+  static void *hipModuleGetFunction_;
  // stream management
-  static void* hipStreamCreate_;
-  static void* hipStreamSynchronize_;
-  static void* hipStreamDestroy_;
-  static void* hipModuleLaunchKernel_;;
+  static void *hipStreamCreate_;
+  static void *hipStreamSynchronize_;
+  static void *hipStreamDestroy_;
+  static void *hipModuleLaunchKernel_;
+  ;
  // function management
-  static void* hipFuncGetAttributes_;
-  static void* hipFuncSetAttribute_;
-  static void* hipFuncSetCacheConfig_;
+  static void *hipFuncGetAttributes_;
+  static void *hipFuncSetAttribute_;
+  static void *hipFuncSetCacheConfig_;
  // memory management
-  static void* hipMalloc_;
-  static void* hipPointerGetAttribute_;
-  static void* hipMemsetD8Async_;
-  static void* hipMemcpyDtoH_;
-  static void* hipFree_;
-  static void* hipMemcpyDtoHAsync_;
-  static void* hipMemcpyHtoDAsync_;
-  static void* hipMemcpyHtoD_;
+  static void *hipMalloc_;
+  static void *hipPointerGetAttribute_;
+  static void *hipMemsetD8Async_;
+  static void *hipMemcpyDtoH_;
+  static void *hipFree_;
+  static void *hipMemcpyDtoHAsync_;
+  static void *hipMemcpyHtoDAsync_;
+  static void *hipMemcpyHtoD_;
  // event management
-  static void* hipEventCreate_;
-  static void* hipEventElapsedTime_;
-  static void* hipEventRecord_;
-  static void* hipEventDestroy_;
+  static void *hipEventCreate_;
+  static void *hipEventElapsedTime_;
+  static void *hipEventRecord_;
+  static void *hipEventDestroy_;
 };

-}
-}
-
+} // namespace driver
+} // namespace triton

 #endif
--- a/include/triton/driver/error.h
+++ b/include/triton/driver/error.h
@@ -3,223 +3,252 @@
 #ifndef _TRITON_DRIVER_ERROR_H_
 #define _TRITON_DRIVER_ERROR_H_

-#include <exception>
 #include "triton/driver/dispatch.h"
+#include <exception>

+namespace triton {

-namespace triton
-{
+namespace driver {

-  namespace driver
-  {
+namespace exception {

-  namespace exception
-  {
+namespace nvrtc {

-  namespace nvrtc
-  {
+#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg)                               \
+  class name : public std::exception {                                         \
+  public:                                                                      \
+    const char *what() const throw() override { return "NVRTC: Error- " msg; } \
+  }

-#define TRITON_CREATE_NVRTC_EXCEPTION(name, msg) \
-class name: public std::exception { public: const char * what() const throw() override { return "NVRTC: Error- " msg; } }
-
-  TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory              ,"out of memory");
-  TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure   ,"program creation failure");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_input              ,"invalid input");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_program            ,"invalid program");
-  TRITON_CREATE_NVRTC_EXCEPTION(invalid_option             ,"invalid option");
-  TRITON_CREATE_NVRTC_EXCEPTION(compilation                ,"compilation");
-  TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure  ,"builtin operation failure");
-  TRITON_CREATE_NVRTC_EXCEPTION(unknown_error              ,"unknown error");
+TRITON_CREATE_NVRTC_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_NVRTC_EXCEPTION(program_creation_failure,
+                              "program creation failure");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_input, "invalid input");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_program, "invalid program");
+TRITON_CREATE_NVRTC_EXCEPTION(invalid_option, "invalid option");
+TRITON_CREATE_NVRTC_EXCEPTION(compilation, "compilation");
+TRITON_CREATE_NVRTC_EXCEPTION(builtin_operation_failure,
+                              "builtin operation failure");
+TRITON_CREATE_NVRTC_EXCEPTION(unknown_error, "unknown error");

 #undef TRITON_CREATE_NVRTC_EXCEPTION
+} // namespace nvrtc
+
+namespace cuda {
+class base : public std::exception {};
+
+#define TRITON_CREATE_CUDA_EXCEPTION(name, msg)                                \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override { return "CUDA: Error- " msg; }  \
  }

-
-  namespace cuda
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_CUDA_EXCEPTION(name, msg) \
-class name: public base { public:const char * what() const throw() override { return "CUDA: Error- " msg; } }
-
-
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_value                   ,"invalid value");
-  TRITON_CREATE_CUDA_EXCEPTION(out_of_memory                   ,"out of memory");
-  TRITON_CREATE_CUDA_EXCEPTION(not_initialized                 ,"not initialized");
-  TRITON_CREATE_CUDA_EXCEPTION(deinitialized                   ,"deinitialized");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled               ,"profiler disabled");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started        ,"profiler already started");
-  TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
-  TRITON_CREATE_CUDA_EXCEPTION(no_device                       ,"no device");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_device                  ,"invalid device");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_image                   ,"invalid image");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_context                 ,"invalid context");
-  TRITON_CREATE_CUDA_EXCEPTION(context_already_current         ,"context already current");
-  TRITON_CREATE_CUDA_EXCEPTION(map_failed                      ,"map failed");
-  TRITON_CREATE_CUDA_EXCEPTION(unmap_failed                    ,"unmap failed");
-  TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped                 ,"array is mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(already_mapped                  ,"already mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
-  TRITON_CREATE_CUDA_EXCEPTION(already_acquired                ,"already acquired");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped                      ,"not mapped");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
-  TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
-  TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
-  TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit               ,"unsupported limit");
-  TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use          ,"context already in use");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx                     ,"invalid ptx");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_source                  ,"invalid source");
-  TRITON_CREATE_CUDA_EXCEPTION(file_not_found                  ,"file not found");
-  TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
-  TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
-  TRITON_CREATE_CUDA_EXCEPTION(operating_system                ,"operating system");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_handle                  ,"invalid handle");
-  TRITON_CREATE_CUDA_EXCEPTION(not_found                       ,"not found");
-  TRITON_CREATE_CUDA_EXCEPTION(not_ready                       ,"not ready");
-  TRITON_CREATE_CUDA_EXCEPTION(illegal_address                 ,"illegal address");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_timeout                  ,"launch timeout");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
-  TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
-  TRITON_CREATE_CUDA_EXCEPTION(primary_context_active          ,"primary context active");
-  TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed            ,"context is destroyed");
-  TRITON_CREATE_CUDA_EXCEPTION(assert_error                    ,"assert");
-  TRITON_CREATE_CUDA_EXCEPTION(too_many_peers                  ,"too many peers");
-  TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
-  TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered      ,"hot memory not registered");
-  TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error            ,"hardware stack error");
-  TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction             ,"illegal instruction");
-  TRITON_CREATE_CUDA_EXCEPTION(misaligned_address              ,"misaligned address");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space           ,"invalid address space");
-  TRITON_CREATE_CUDA_EXCEPTION(invalid_pc                      ,"invalid pc");
-  TRITON_CREATE_CUDA_EXCEPTION(launch_failed                   ,"launch failed");
-  TRITON_CREATE_CUDA_EXCEPTION(not_permitted                   ,"not permitted");
-  TRITON_CREATE_CUDA_EXCEPTION(not_supported                   ,"not supported");
-  TRITON_CREATE_CUDA_EXCEPTION(unknown                         ,"unknown");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUDA_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_CUDA_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUDA_EXCEPTION(deinitialized, "deinitialized");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_disabled, "profiler disabled");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_not_initialized,
+                             "profiler not initialized");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_already_started,
+                             "profiler already started");
+TRITON_CREATE_CUDA_EXCEPTION(profiler_already_stopped,
+                             "profiler already stopped");
+TRITON_CREATE_CUDA_EXCEPTION(no_device, "no device");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_device, "invalid device");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_image, "invalid image");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_context, "invalid context");
+TRITON_CREATE_CUDA_EXCEPTION(context_already_current,
+                             "context already current");
+TRITON_CREATE_CUDA_EXCEPTION(map_failed, "map failed");
+TRITON_CREATE_CUDA_EXCEPTION(unmap_failed, "unmap failed");
+TRITON_CREATE_CUDA_EXCEPTION(array_is_mapped, "array is mapped");
+TRITON_CREATE_CUDA_EXCEPTION(already_mapped, "already mapped");
+TRITON_CREATE_CUDA_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
+TRITON_CREATE_CUDA_EXCEPTION(already_acquired, "already acquired");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped, "not mapped");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_array, "not mapped as array");
+TRITON_CREATE_CUDA_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
+TRITON_CREATE_CUDA_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
+TRITON_CREATE_CUDA_EXCEPTION(unsupported_limit, "unsupported limit");
+TRITON_CREATE_CUDA_EXCEPTION(context_already_in_use, "context already in use");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_unsupported,
+                             "peer access unsupported");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_ptx, "invalid ptx");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_graphics_context,
+                             "invalid graphics context");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_source, "invalid source");
+TRITON_CREATE_CUDA_EXCEPTION(file_not_found, "file not found");
+TRITON_CREATE_CUDA_EXCEPTION(shared_object_symbol_not_found,
+                             "shared object symbol not found");
+TRITON_CREATE_CUDA_EXCEPTION(shared_object_init_failed,
+                             "shared object init failed");
+TRITON_CREATE_CUDA_EXCEPTION(operating_system, "operating system");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_handle, "invalid handle");
+TRITON_CREATE_CUDA_EXCEPTION(not_found, "not found");
+TRITON_CREATE_CUDA_EXCEPTION(not_ready, "not ready");
+TRITON_CREATE_CUDA_EXCEPTION(illegal_address, "illegal address");
+TRITON_CREATE_CUDA_EXCEPTION(launch_out_of_resources,
+                             "launch out of resources");
+TRITON_CREATE_CUDA_EXCEPTION(launch_timeout, "launch timeout");
+TRITON_CREATE_CUDA_EXCEPTION(launch_incompatible_texturing,
+                             "launch incompatible texturing");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_already_enabled,
+                             "peer access already enabled");
+TRITON_CREATE_CUDA_EXCEPTION(peer_access_not_enabled,
+                             "peer access not enabled");
+TRITON_CREATE_CUDA_EXCEPTION(primary_context_active, "primary context active");
+TRITON_CREATE_CUDA_EXCEPTION(context_is_destroyed, "context is destroyed");
+TRITON_CREATE_CUDA_EXCEPTION(assert_error, "assert");
+TRITON_CREATE_CUDA_EXCEPTION(too_many_peers, "too many peers");
+TRITON_CREATE_CUDA_EXCEPTION(host_memory_already_registered,
+                             "host memory already registered");
+TRITON_CREATE_CUDA_EXCEPTION(host_memory_not_registered,
+                             "hot memory not registered");
+TRITON_CREATE_CUDA_EXCEPTION(hardware_stack_error, "hardware stack error");
+TRITON_CREATE_CUDA_EXCEPTION(illegal_instruction, "illegal instruction");
+TRITON_CREATE_CUDA_EXCEPTION(misaligned_address, "misaligned address");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_address_space, "invalid address space");
+TRITON_CREATE_CUDA_EXCEPTION(invalid_pc, "invalid pc");
+TRITON_CREATE_CUDA_EXCEPTION(launch_failed, "launch failed");
+TRITON_CREATE_CUDA_EXCEPTION(not_permitted, "not permitted");
+TRITON_CREATE_CUDA_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUDA_EXCEPTION(unknown, "unknown");

 #undef TRITON_CREATE_CUDA_EXCEPTION
+} // namespace cuda
+
+namespace cublas {
+class base : public std::exception {};
+
+#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg)                              \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override {                                \
+      return "CUBLAS: Error- " msg;                                            \
+    }                                                                          \
  }

-  namespace cublas
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_CUBLAS_EXCEPTION(name, msg) \
-class name: public base { public: const char * what() const throw() override { return "CUBLAS: Error- " msg; } }
-
-  TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized              ,"not initialized");
-  TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed                 ,"alloc failed");
-  TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value                ,"invalid value");
-  TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch                ,"arch mismatch");
-  TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error                ,"mapping error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed             ,"execution failed");
-  TRITON_CREATE_CUBLAS_EXCEPTION(internal_error               ,"internal error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(not_supported                ,"not supported");
-  TRITON_CREATE_CUBLAS_EXCEPTION(license_error                ,"license error");
-  TRITON_CREATE_CUBLAS_EXCEPTION(unknown                      ,"unknown");
+TRITON_CREATE_CUBLAS_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUBLAS_EXCEPTION(alloc_failed, "alloc failed");
+TRITON_CREATE_CUBLAS_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUBLAS_EXCEPTION(arch_mismatch, "arch mismatch");
+TRITON_CREATE_CUBLAS_EXCEPTION(mapping_error, "mapping error");
+TRITON_CREATE_CUBLAS_EXCEPTION(execution_failed, "execution failed");
+TRITON_CREATE_CUBLAS_EXCEPTION(internal_error, "internal error");
+TRITON_CREATE_CUBLAS_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUBLAS_EXCEPTION(license_error, "license error");
+TRITON_CREATE_CUBLAS_EXCEPTION(unknown, "unknown");

 #undef TRITON_CREATE_CUBLAS_EXCEPTION
+} // namespace cublas
+
+namespace cudnn {
+#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg)                               \
+  class name : public std::exception {                                         \
+  public:                                                                      \
+    const char *what() const throw() override { return "CUDNN: Error- " msg; } \
  }

-  namespace cudnn
-  {
-#define TRITON_CREATE_CUDNN_EXCEPTION(name, msg) \
-class name: public std::exception { public: const char * what() const throw() override { return "CUDNN: Error- " msg; } }
+TRITON_CREATE_CUDNN_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed, "allocation failed");
+TRITON_CREATE_CUDNN_EXCEPTION(bad_param, "bad param");
+TRITON_CREATE_CUDNN_EXCEPTION(internal_error, "internal error");
+TRITON_CREATE_CUDNN_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch, "arch mismatch");
+TRITON_CREATE_CUDNN_EXCEPTION(mapping_error, "mapping error");
+TRITON_CREATE_CUDNN_EXCEPTION(execution_failed, "execution failed");
+TRITON_CREATE_CUDNN_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_CUDNN_EXCEPTION(license_error, "license error");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing,
+                              "prerequisite missing");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress, "runtime in progress");
+TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow, "runtime fp overflow");
+} // namespace cudnn

-  TRITON_CREATE_CUDNN_EXCEPTION(not_initialized              ,"not initialized");
-  TRITON_CREATE_CUDNN_EXCEPTION(alloc_failed                 ,"allocation failed");
-  TRITON_CREATE_CUDNN_EXCEPTION(bad_param                    ,"bad param");
-  TRITON_CREATE_CUDNN_EXCEPTION(internal_error               ,"internal error");
-  TRITON_CREATE_CUDNN_EXCEPTION(invalid_value                ,"invalid value");
-  TRITON_CREATE_CUDNN_EXCEPTION(arch_mismatch                ,"arch mismatch");
-  TRITON_CREATE_CUDNN_EXCEPTION(mapping_error                ,"mapping error");
-  TRITON_CREATE_CUDNN_EXCEPTION(execution_failed             ,"execution failed");
-  TRITON_CREATE_CUDNN_EXCEPTION(not_supported                ,"not supported");
-  TRITON_CREATE_CUDNN_EXCEPTION(license_error                ,"license error");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_prerequisite_missing ,"prerequisite missing");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_in_progress          ,"runtime in progress");
-  TRITON_CREATE_CUDNN_EXCEPTION(runtime_fp_overflow          ,"runtime fp overflow");
+namespace hip {
+class base : public std::exception {};
+
+#define TRITON_CREATE_HIP_EXCEPTION(name, msg)                                 \
+  class name : public base {                                                   \
+  public:                                                                      \
+    const char *what() const throw() override { return "HIP: Error- " msg; }   \
  }

-
-
-
-  namespace hip
-  {
-  class base: public std::exception{};
-
-#define TRITON_CREATE_HIP_EXCEPTION(name, msg) \
-class name: public base { public:const char * what() const throw() override { return "HIP: Error- " msg; } }
-
-
-  TRITON_CREATE_HIP_EXCEPTION(invalid_value                   ,"invalid value");
-  TRITON_CREATE_HIP_EXCEPTION(out_of_memory                   ,"out of memory");
-  TRITON_CREATE_HIP_EXCEPTION(not_initialized                 ,"not initialized");
-  TRITON_CREATE_HIP_EXCEPTION(deinitialized                   ,"deinitialized");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_disabled               ,"profiler disabled");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized        ,"profiler not initialized");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_already_started        ,"profiler already started");
-  TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped        ,"profiler already stopped");
-  TRITON_CREATE_HIP_EXCEPTION(no_device                       ,"no device");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_device                  ,"invalid device");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_image                   ,"invalid image");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_context                 ,"invalid context");
-  TRITON_CREATE_HIP_EXCEPTION(context_already_current         ,"context already current");
-  TRITON_CREATE_HIP_EXCEPTION(map_failed                      ,"map failed");
-  TRITON_CREATE_HIP_EXCEPTION(unmap_failed                    ,"unmap failed");
-  TRITON_CREATE_HIP_EXCEPTION(array_is_mapped                 ,"array is mapped");
-  TRITON_CREATE_HIP_EXCEPTION(already_mapped                  ,"already mapped");
-  TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu               ,"no binary for gpu");
-  TRITON_CREATE_HIP_EXCEPTION(already_acquired                ,"already acquired");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped                      ,"not mapped");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array             ,"not mapped as array");
-  TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer           ,"not mapped as pointer");
-  TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable               ,"ecc uncorrectable");
-  TRITON_CREATE_HIP_EXCEPTION(unsupported_limit               ,"unsupported limit");
-  TRITON_CREATE_HIP_EXCEPTION(context_already_in_use          ,"context already in use");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported         ,"peer access unsupported");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_ptx                     ,"invalid ptx");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context        ,"invalid graphics context");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_source                  ,"invalid source");
-  TRITON_CREATE_HIP_EXCEPTION(file_not_found                  ,"file not found");
-  TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found  ,"shared object symbol not found");
-  TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed       ,"shared object init failed");
-  TRITON_CREATE_HIP_EXCEPTION(operating_system                ,"operating system");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_handle                  ,"invalid handle");
-  TRITON_CREATE_HIP_EXCEPTION(not_found                       ,"not found");
-  TRITON_CREATE_HIP_EXCEPTION(not_ready                       ,"not ready");
-  TRITON_CREATE_HIP_EXCEPTION(illegal_address                 ,"illegal address");
-  TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources         ,"launch out of resources");
-  TRITON_CREATE_HIP_EXCEPTION(launch_timeout                  ,"launch timeout");
-  TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing   ,"launch incompatible texturing");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled     ,"peer access already enabled");
-  TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled         ,"peer access not enabled");
-  TRITON_CREATE_HIP_EXCEPTION(primary_context_active          ,"primary context active");
-  TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed            ,"context is destroyed");
-  TRITON_CREATE_HIP_EXCEPTION(assert_error                    ,"assert");
-  TRITON_CREATE_HIP_EXCEPTION(too_many_peers                  ,"too many peers");
-  TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered  ,"host memory already registered");
-  TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered      ,"hot memory not registered");
-  TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error            ,"hardware stack error");
-  TRITON_CREATE_HIP_EXCEPTION(illegal_instruction             ,"illegal instruction");
-  TRITON_CREATE_HIP_EXCEPTION(misaligned_address              ,"misaligned address");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_address_space           ,"invalid address space");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_pc                      ,"invalid pc");
-  TRITON_CREATE_HIP_EXCEPTION(launch_failed                   ,"launch failed");
-  TRITON_CREATE_HIP_EXCEPTION(not_permitted                   ,"not permitted");
-  TRITON_CREATE_HIP_EXCEPTION(not_supported                   ,"not supported");
-  TRITON_CREATE_HIP_EXCEPTION(invalid_symbol                   ,"invalid symbol");
-  TRITON_CREATE_HIP_EXCEPTION(unknown                         ,"unknown");
+TRITON_CREATE_HIP_EXCEPTION(invalid_value, "invalid value");
+TRITON_CREATE_HIP_EXCEPTION(out_of_memory, "out of memory");
+TRITON_CREATE_HIP_EXCEPTION(not_initialized, "not initialized");
+TRITON_CREATE_HIP_EXCEPTION(deinitialized, "deinitialized");
+TRITON_CREATE_HIP_EXCEPTION(profiler_disabled, "profiler disabled");
+TRITON_CREATE_HIP_EXCEPTION(profiler_not_initialized,
+                            "profiler not initialized");
+TRITON_CREATE_HIP_EXCEPTION(profiler_already_started,
+                            "profiler already started");
+TRITON_CREATE_HIP_EXCEPTION(profiler_already_stopped,
+                            "profiler already stopped");
+TRITON_CREATE_HIP_EXCEPTION(no_device, "no device");
+TRITON_CREATE_HIP_EXCEPTION(invalid_device, "invalid device");
+TRITON_CREATE_HIP_EXCEPTION(invalid_image, "invalid image");
+TRITON_CREATE_HIP_EXCEPTION(invalid_context, "invalid context");
+TRITON_CREATE_HIP_EXCEPTION(context_already_current, "context already current");
+TRITON_CREATE_HIP_EXCEPTION(map_failed, "map failed");
+TRITON_CREATE_HIP_EXCEPTION(unmap_failed, "unmap failed");
+TRITON_CREATE_HIP_EXCEPTION(array_is_mapped, "array is mapped");
+TRITON_CREATE_HIP_EXCEPTION(already_mapped, "already mapped");
+TRITON_CREATE_HIP_EXCEPTION(no_binary_for_gpu, "no binary for gpu");
+TRITON_CREATE_HIP_EXCEPTION(already_acquired, "already acquired");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped, "not mapped");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_array, "not mapped as array");
+TRITON_CREATE_HIP_EXCEPTION(not_mapped_as_pointer, "not mapped as pointer");
+TRITON_CREATE_HIP_EXCEPTION(ecc_uncorrectable, "ecc uncorrectable");
+TRITON_CREATE_HIP_EXCEPTION(unsupported_limit, "unsupported limit");
+TRITON_CREATE_HIP_EXCEPTION(context_already_in_use, "context already in use");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_unsupported, "peer access unsupported");
+TRITON_CREATE_HIP_EXCEPTION(invalid_ptx, "invalid ptx");
+TRITON_CREATE_HIP_EXCEPTION(invalid_graphics_context,
+                            "invalid graphics context");
+TRITON_CREATE_HIP_EXCEPTION(invalid_source, "invalid source");
+TRITON_CREATE_HIP_EXCEPTION(file_not_found, "file not found");
+TRITON_CREATE_HIP_EXCEPTION(shared_object_symbol_not_found,
+                            "shared object symbol not found");
+TRITON_CREATE_HIP_EXCEPTION(shared_object_init_failed,
+                            "shared object init failed");
+TRITON_CREATE_HIP_EXCEPTION(operating_system, "operating system");
+TRITON_CREATE_HIP_EXCEPTION(invalid_handle, "invalid handle");
+TRITON_CREATE_HIP_EXCEPTION(not_found, "not found");
+TRITON_CREATE_HIP_EXCEPTION(not_ready, "not ready");
+TRITON_CREATE_HIP_EXCEPTION(illegal_address, "illegal address");
+TRITON_CREATE_HIP_EXCEPTION(launch_out_of_resources, "launch out of resources");
+TRITON_CREATE_HIP_EXCEPTION(launch_timeout, "launch timeout");
+TRITON_CREATE_HIP_EXCEPTION(launch_incompatible_texturing,
+                            "launch incompatible texturing");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_already_enabled,
+                            "peer access already enabled");
+TRITON_CREATE_HIP_EXCEPTION(peer_access_not_enabled, "peer access not enabled");
+TRITON_CREATE_HIP_EXCEPTION(primary_context_active, "primary context active");
+TRITON_CREATE_HIP_EXCEPTION(context_is_destroyed, "context is destroyed");
+TRITON_CREATE_HIP_EXCEPTION(assert_error, "assert");
+TRITON_CREATE_HIP_EXCEPTION(too_many_peers, "too many peers");
+TRITON_CREATE_HIP_EXCEPTION(host_memory_already_registered,
+                            "host memory already registered");
+TRITON_CREATE_HIP_EXCEPTION(host_memory_not_registered,
+                            "hot memory not registered");
+TRITON_CREATE_HIP_EXCEPTION(hardware_stack_error, "hardware stack error");
+TRITON_CREATE_HIP_EXCEPTION(illegal_instruction, "illegal instruction");
+TRITON_CREATE_HIP_EXCEPTION(misaligned_address, "misaligned address");
+TRITON_CREATE_HIP_EXCEPTION(invalid_address_space, "invalid address space");
+TRITON_CREATE_HIP_EXCEPTION(invalid_pc, "invalid pc");
+TRITON_CREATE_HIP_EXCEPTION(launch_failed, "launch failed");
+TRITON_CREATE_HIP_EXCEPTION(not_permitted, "not permitted");
+TRITON_CREATE_HIP_EXCEPTION(not_supported, "not supported");
+TRITON_CREATE_HIP_EXCEPTION(invalid_symbol, "invalid symbol");
+TRITON_CREATE_HIP_EXCEPTION(unknown, "unknown");

 #undef TRITON_CREATE_CUDA_EXCEPTION
-  }
+} // namespace hip

-  }
-  }
-}
+} // namespace exception
+} // namespace driver
+} // namespace triton

 #endif
--- a/include/triton/driver/llvm.h
+++ b/include/triton/driver/llvm.h
@@ -1,20 +1,21 @@
-#include <string>
 #include "triton/driver/dispatch.h"
+#include <string>

-namespace llvm{
+namespace llvm {
 class Module;
 }

-namespace triton{
-namespace driver{
+namespace triton {
+namespace driver {

 void init_llvm();
-std::string path_to_ptxas(int& version);
-std::string llir_to_ptx(llvm::Module* module, int cc, int version);
-std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas_path, int cc);
-CUmodule ptx_to_cumodule(const std::string& ptx, int cc);
-std::string llir_to_amdgpu(llvm::Module* module, const std::string& proc);
-hipModule_t amdgpu_to_hipmodule(const std::string& path);
+std::string path_to_ptxas(int &version);
+std::string llir_to_ptx(llvm::Module *module, int cc, int version);
+std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas_path,
+                         int cc);
+CUmodule ptx_to_cumodule(const std::string &ptx, int cc);
+std::string llir_to_amdgpu(llvm::Module *module, const std::string &proc);
+hipModule_t amdgpu_to_hipmodule(const std::string &path);

-}
-}
+} // namespace driver
+} // namespace triton
--- a/include/triton/tools/bench.hpp
+++ b/include/triton/tools/bench.hpp
@@ -3,52 +3,55 @@
 #ifndef _TRITON_TOOLS_BENCH_H_
 #define _TRITON_TOOLS_BENCH_H_

-#include <chrono>
-#include <functional>
-#include <algorithm>
 #include "triton/driver/device.h"
 #include "triton/driver/stream.h"
+#include <algorithm>
+#include <chrono>
+#include <functional>

-namespace triton{
-namespace tools{
+namespace triton {
+namespace tools {

-class timer{
-    typedef std::chrono::high_resolution_clock high_resolution_clock;
-    typedef std::chrono::nanoseconds nanoseconds;
+class timer {
+  typedef std::chrono::high_resolution_clock high_resolution_clock;
+  typedef std::chrono::nanoseconds nanoseconds;

 public:
-    explicit timer(bool run = false)
-    { if (run) start(); }
+  explicit timer(bool run = false) {
+    if (run)
+      start();
+  }

-    void start()
-    { _start = high_resolution_clock::now(); }
+  void start() { _start = high_resolution_clock::now(); }

-    nanoseconds get() const
-    { return std::chrono::duration_cast<nanoseconds>(high_resolution_clock::now() - _start); }
+  nanoseconds get() const {
+    return std::chrono::duration_cast<nanoseconds>(
+        high_resolution_clock::now() - _start);
+  }

 private:
-    high_resolution_clock::time_point _start;
+  high_resolution_clock::time_point _start;
 };

-inline double bench(std::function<void()> const & op, driver::stream * stream, size_t warmup = 10, size_t repeat = 200)
-{
+inline double bench(std::function<void()> const &op, driver::stream *stream,
+                    size_t warmup = 10, size_t repeat = 200) {
  timer tmr;
  std::vector<size_t> times;
  double total_time = 0;
-  for(size_t i = 0; i < warmup; i++)
+  for (size_t i = 0; i < warmup; i++)
    op();
  stream->synchronize();
  tmr.start();
-  for(size_t i = 0; i < repeat; i++){
+  for (size_t i = 0; i < repeat; i++) {
    op();
  }
  stream->synchronize();
  return (float)tmr.get().count() / repeat;

-//  return *std::min_element(times.begin(), times.end());
+  //  return *std::min_element(times.begin(), times.end());
 }

-}
-}
+} // namespace tools
+} // namespace triton

 #endif
--- a/include/triton/tools/graph.h
+++ b/include/triton/tools/graph.h
@@ -3,16 +3,15 @@
 #ifndef _TRITON_TOOLS_THREAD_GRAPH_H_
 #define _TRITON_TOOLS_THREAD_GRAPH_H_

+#include <iostream>
 #include <map>
 #include <set>
 #include <vector>
-#include <iostream>

 namespace triton {
-namespace tools{
+namespace tools {

-template<class node_t>
-class graph {
+template <class node_t> class graph {
  typedef std::map<node_t, std::set<node_t>> edges_t;

 public:
@@ -21,27 +20,27 @@ public:

 private:
  void connected_components_impl(node_t x, std::set<node_t> &nodes,
-                                 nmap_t* nmap, cmap_t* cmap, int id) const {
-    if(nmap)
+                                 nmap_t *nmap, cmap_t *cmap, int id) const {
+    if (nmap)
      (*nmap)[x] = id;
-    if(cmap)
+    if (cmap)
      (*cmap)[id].push_back(x);
-    if(nodes.find(x) != nodes.end()) {
+    if (nodes.find(x) != nodes.end()) {
      nodes.erase(x);
-      for(const node_t &y: edges_.at(x))
+      for (const node_t &y : edges_.at(x))
        connected_components_impl(y, nodes, nmap, cmap, id);
    }
  }

 public:
  void connected_components(cmap_t *cmap, nmap_t *nmap) const {
-    if(cmap)
+    if (cmap)
      cmap->clear();
-    if(nmap)
+    if (nmap)
      nmap->clear();
    std::set<node_t> nodes = nodes_;
    unsigned id = 0;
-    while(!nodes.empty()){
+    while (!nodes.empty()) {
      connected_components_impl(*nodes.begin(), nodes, nmap, cmap, id++);
    }
  }
@@ -63,7 +62,7 @@ private:
  edges_t edges_;
 };

-}
-}
+} // namespace tools
+} // namespace triton

 #endif
--- a/include/triton/tools/sha1.hpp
+++ b/include/triton/tools/sha1.hpp
@@ -33,154 +33,140 @@
 #ifndef _TRITON_TOOLS_SHA1_HPP_
 #define _TRITON_TOOLS_SHA1_HPP_

-namespace sha1
+namespace sha1 {
+namespace // local
 {
-    namespace // local
-    {
-        // Rotate an integer value to left.
-        inline unsigned int rol(const unsigned int value,
-                const unsigned int steps)
-        {
-            return ((value << steps) | (value >> (32 - steps)));
-        }
+// Rotate an integer value to left.
+inline unsigned int rol(const unsigned int value, const unsigned int steps) {
+  return ((value << steps) | (value >> (32 - steps)));
+}

-        // Sets the first 16 integers in the buffert to zero.
-        // Used for clearing the W buffert.
-        inline void clearWBuffert(unsigned int* buffert)
-        {
-            for (int pos = 16; --pos >= 0;)
-            {
-                buffert[pos] = 0;
-            }
-        }
+// Sets the first 16 integers in the buffert to zero.
+// Used for clearing the W buffert.
+inline void clearWBuffert(unsigned int *buffert) {
+  for (int pos = 16; --pos >= 0;) {
+    buffert[pos] = 0;
+  }
+}

-        inline void innerHash(unsigned int* result, unsigned int* w)
-        {
-            unsigned int a = result[0];
-            unsigned int b = result[1];
-            unsigned int c = result[2];
-            unsigned int d = result[3];
-            unsigned int e = result[4];
+inline void innerHash(unsigned int *result, unsigned int *w) {
+  unsigned int a = result[0];
+  unsigned int b = result[1];
+  unsigned int c = result[2];
+  unsigned int d = result[3];
+  unsigned int e = result[4];

-            int round = 0;
+  int round = 0;

-            #define sha1macro(func,val) \
-      { \
-                const unsigned int t = rol(a, 5) + (func) + e + val + w[round]; \
-        e = d; \
-        d = c; \
-        c = rol(b, 30); \
-        b = a; \
-        a = t; \
-      }
+#define sha1macro(func, val)                                                   \
+  {                                                                            \
+    const unsigned int t = rol(a, 5) + (func) + e + val + w[round];            \
+    e = d;                                                                     \
+    d = c;                                                                     \
+    c = rol(b, 30);                                                            \
+    b = a;                                                                     \
+    a = t;                                                                     \
+  }

-            while (round < 16)
-            {
-                sha1macro((b & c) | (~b & d), 0x5a827999)
-                ++round;
-            }
-            while (round < 20)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro((b & c) | (~b & d), 0x5a827999)
-                ++round;
-            }
-            while (round < 40)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro(b ^ c ^ d, 0x6ed9eba1)
-                ++round;
-            }
-            while (round < 60)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)
-                ++round;
-            }
-            while (round < 80)
-            {
-                w[round] = rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
-                sha1macro(b ^ c ^ d, 0xca62c1d6)
-                ++round;
-            }
+  while (round < 16) {
+    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
+  }
+  while (round < 20) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro((b & c) | (~b & d), 0x5a827999)++ round;
+  }
+  while (round < 40) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro(b ^ c ^ d, 0x6ed9eba1)++ round;
+  }
+  while (round < 60) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro((b & c) | (b & d) | (c & d), 0x8f1bbcdc)++ round;
+  }
+  while (round < 80) {
+    w[round] =
+        rol((w[round - 3] ^ w[round - 8] ^ w[round - 14] ^ w[round - 16]), 1);
+    sha1macro(b ^ c ^ d, 0xca62c1d6)++ round;
+  }

-            #undef sha1macro
+#undef sha1macro

-            result[0] += a;
-            result[1] += b;
-            result[2] += c;
-            result[3] += d;
-            result[4] += e;
-        }
-    } // namespace
+  result[0] += a;
+  result[1] += b;
+  result[2] += c;
+  result[3] += d;
+  result[4] += e;
+}
+} // namespace

-    inline void calc(const void* src, const int bytelength, unsigned char* hash)
-    {
-        // Init the result array.
-        unsigned int result[5] = { 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 };
+inline void calc(const void *src, const int bytelength, unsigned char *hash) {
+  // Init the result array.
+  unsigned int result[5] = {0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476,
+                            0xc3d2e1f0};

-        // Cast the void src pointer to be the byte array we can work with.
-        const unsigned char* sarray = (const unsigned char*) src;
+  // Cast the void src pointer to be the byte array we can work with.
+  const unsigned char *sarray = (const unsigned char *)src;

-        // The reusable round buffer
-        unsigned int w[80];
+  // The reusable round buffer
+  unsigned int w[80];

-        // Loop through all complete 64byte blocks.
-        const int endOfFullBlocks = bytelength - 64;
-        int endCurrentBlock;
-        int currentBlock = 0;
+  // Loop through all complete 64byte blocks.
+  const int endOfFullBlocks = bytelength - 64;
+  int endCurrentBlock;
+  int currentBlock = 0;

-        while (currentBlock <= endOfFullBlocks)
-        {
-            endCurrentBlock = currentBlock + 64;
+  while (currentBlock <= endOfFullBlocks) {
+    endCurrentBlock = currentBlock + 64;

-            // Init the round buffer with the 64 byte block data.
-            for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4)
-            {
-                // This line will swap endian on big endian and keep endian on little endian.
-                w[roundPos++] = (unsigned int) sarray[currentBlock + 3]
-                        | (((unsigned int) sarray[currentBlock + 2]) << 8)
-                        | (((unsigned int) sarray[currentBlock + 1]) << 16)
-                        | (((unsigned int) sarray[currentBlock]) << 24);
-            }
-            innerHash(result, w);
-        }
-
-        // Handle the last and not full 64 byte block if existing.
-        endCurrentBlock = bytelength - currentBlock;
-        clearWBuffert(w);
-        int lastBlockBytes = 0;
-        for (;lastBlockBytes < endCurrentBlock; ++lastBlockBytes)
-        {
-            w[lastBlockBytes >> 2] |= (unsigned int) sarray[lastBlockBytes + currentBlock] << ((3 - (lastBlockBytes & 3)) << 3);
-        }
-        w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
-        if (endCurrentBlock >= 56)
-        {
-            innerHash(result, w);
-            clearWBuffert(w);
-        }
-        w[15] = bytelength << 3;
-        innerHash(result, w);
-
-        // Store hash in result pointer, and make sure we get in in the correct order on both endian models.
-        for (int hashByte = 20; --hashByte >= 0;)
-        {
-            hash[hashByte] = (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
-        }
+    // Init the round buffer with the 64 byte block data.
+    for (int roundPos = 0; currentBlock < endCurrentBlock; currentBlock += 4) {
+      // This line will swap endian on big endian and keep endian on little
+      // endian.
+      w[roundPos++] = (unsigned int)sarray[currentBlock + 3] |
+                      (((unsigned int)sarray[currentBlock + 2]) << 8) |
+                      (((unsigned int)sarray[currentBlock + 1]) << 16) |
+                      (((unsigned int)sarray[currentBlock]) << 24);
    }
+    innerHash(result, w);
+  }

-    inline void toHexString(const unsigned char* hash, char* hexstring)
-    {
-        const char hexDigits[] = { "0123456789abcdef" };
+  // Handle the last and not full 64 byte block if existing.
+  endCurrentBlock = bytelength - currentBlock;
+  clearWBuffert(w);
+  int lastBlockBytes = 0;
+  for (; lastBlockBytes < endCurrentBlock; ++lastBlockBytes) {
+    w[lastBlockBytes >> 2] |=
+        (unsigned int)sarray[lastBlockBytes + currentBlock]
+        << ((3 - (lastBlockBytes & 3)) << 3);
+  }
+  w[lastBlockBytes >> 2] |= 0x80 << ((3 - (lastBlockBytes & 3)) << 3);
+  if (endCurrentBlock >= 56) {
+    innerHash(result, w);
+    clearWBuffert(w);
+  }
+  w[15] = bytelength << 3;
+  innerHash(result, w);

-        for (int hashByte = 20; --hashByte >= 0;)
-        {
-            hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
-            hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
-        }
-        hexstring[40] = 0;
-    }
+  // Store hash in result pointer, and make sure we get in in the correct order
+  // on both endian models.
+  for (int hashByte = 20; --hashByte >= 0;) {
+    hash[hashByte] =
+        (result[hashByte >> 2] >> (((3 - hashByte) & 0x3) << 3)) & 0xff;
+  }
+}
+
+inline void toHexString(const unsigned char *hash, char *hexstring) {
+  const char hexDigits[] = {"0123456789abcdef"};
+
+  for (int hashByte = 20; --hashByte >= 0;) {
+    hexstring[hashByte << 1] = hexDigits[(hash[hashByte] >> 4) & 0xf];
+    hexstring[(hashByte << 1) + 1] = hexDigits[hash[hashByte] & 0xf];
+  }
+  hexstring[40] = 0;
+}
 } // namespace sha1

 #endif
--- a/include/triton/tools/sys/exec.hpp
+++ b/include/triton/tools/sys/exec.hpp
@@ -7,11 +7,8 @@
 #include <stdexcept>
 #include <string>

-namespace triton
-{
-namespace tools
-{
-
+namespace triton {
+namespace tools {

 #ifdef _WIN32
 #define popen _popen
@@ -19,12 +16,12 @@ namespace tools
 #endif

 #ifndef WEXITSTATUS
-#define WEXITSTATUS(stat_val) ((unsigned)(stat_val) & 255)
+#define WEXITSTATUS(stat_val) ((unsigned)(stat_val)&255)
 #endif

-int exec(const std::string& cmd, std::string& result) {
+int exec(const std::string &cmd, std::string &result) {
  char buffer[128];
-  FILE* pipe = popen(cmd.c_str(), "r");
+  FILE *pipe = popen(cmd.c_str(), "r");
  if (!pipe)
    return 0;
  result.clear();
@@ -37,10 +34,9 @@ int exec(const std::string& cmd, std::string& result) {
  }
  int status = pclose(pipe);
  return WEXITSTATUS(status);
-
 }

-}
-}
+} // namespace tools
+} // namespace triton

 #endif
--- a/include/triton/tools/sys/getenv.hpp
+++ b/include/triton/tools/sys/getenv.hpp
@@ -22,26 +22,23 @@
 #ifndef TDL_TOOLS_SYS_GETENV_HPP
 #define TDL_TOOLS_SYS_GETENV_HPP

-#include <string>
 #include <cstdlib>
+#include <string>

-namespace triton
-{
+namespace triton {

-namespace tools
-{
-
-    inline std::string getenv(const char * name)
-    {
-        const char * cstr = std::getenv(name);
-        if(!cstr)
-            return "";
-        std::string result(cstr);
-        return result;
-    }
+namespace tools {

+inline std::string getenv(const char *name) {
+  const char *cstr = std::getenv(name);
+  if (!cstr)
+    return "";
+  std::string result(cstr);
+  return result;
 }

-}
+} // namespace tools
+
+} // namespace triton

 #endif
--- a/include/triton/tools/sys/mkdir.hpp
+++ b/include/triton/tools/sys/mkdir.hpp
@@ -22,55 +22,49 @@
 #ifndef TDL_TOOLS_SYS_MKDIR_HPP
 #define TDL_TOOLS_SYS_MKDIR_HPP

-#include <cstring>
-#include <string>
 #include <cstdlib>
-#include <sys/stat.h>
+#include <cstring>
 #include <errno.h>
+#include <string>
+#include <sys/stat.h>
 #if defined(_WIN32)
-  #include <direct.h>
+#include <direct.h>
 #endif

-namespace triton
-{
+namespace triton {

-namespace tools
-{
-
-    inline int mkdir(std::string const & path)
-    {
-        #if defined(_WIN32)
-            return _mkdir(path.c_str());
-        #else
-            return ::mkdir(path.c_str(), 0777);
-        #endif
-    }
-
-    inline int mkpath(std::string const & path)
-    {
-        int status = 0;
-        size_t pp = 0;
-        size_t sp;
-        while ((sp = path.find('/', pp)) != std::string::npos)
-        {
-            if (sp != pp){
-                status = mkdir(path.substr(0, sp));
-            }
-            pp = sp + 1;
-        }
-        return (status==0 || errno==EEXIST)?0:-1;
-    }
-
-    inline int mtime(std::string const & path)
-    {
-      struct stat st;
-      if(stat(path.c_str(), &st) != 0)
-        return 0;
-      return st.st_mtime;
-    }
+namespace tools {

+inline int mkdir(std::string const &path) {
+#if defined(_WIN32)
+  return _mkdir(path.c_str());
+#else
+  return ::mkdir(path.c_str(), 0777);
+#endif
 }

+inline int mkpath(std::string const &path) {
+  int status = 0;
+  size_t pp = 0;
+  size_t sp;
+  while ((sp = path.find('/', pp)) != std::string::npos) {
+    if (sp != pp) {
+      status = mkdir(path.substr(0, sp));
+    }
+    pp = sp + 1;
+  }
+  return (status == 0 || errno == EEXIST) ? 0 : -1;
 }

+inline int mtime(std::string const &path) {
+  struct stat st;
+  if (stat(path.c_str(), &st) != 0)
+    return 0;
+  return st.st_mtime;
+}
+
+} // namespace tools
+
+} // namespace triton
+
 #endif
--- a/include/triton/tools/thread_pool.h
+++ b/include/triton/tools/thread_pool.h
@@ -3,88 +3,79 @@
 #ifndef _TRITON_TOOLS_THREAD_POOL_H_
 #define _TRITON_TOOLS_THREAD_POOL_H_

-#include <vector>
-#include <queue>
-#include <memory>
-#include <thread>
-#include <mutex>
 #include <condition_variable>
-#include <future>
 #include <functional>
+#include <future>
+#include <memory>
+#include <mutex>
+#include <queue>
 #include <stdexcept>
+#include <thread>
+#include <vector>

 class ThreadPool {
 public:
-    ThreadPool(size_t threads)
-        :   stop(false) {
-      for(size_t i = 0;i < threads;++i)
-          workers.emplace_back(
-              [this] {
-                for(;;){
-                  std::function<void()> task;
-                  {
-                    std::unique_lock<std::mutex> lock(this->queue_mutex);
-                    this->condition.wait(lock,
-                      [this]{ return this->stop || !this->tasks.empty(); });
-                    if(this->stop && this->tasks.empty())
-                      return;
-                    task = std::move(this->tasks.front());
-                    this->tasks.pop();
-                  }
-                  task();
-                }
-              }
-          );
-    }
+  ThreadPool(size_t threads) : stop(false) {
+    for (size_t i = 0; i < threads; ++i)
+      workers.emplace_back([this] {
+        for (;;) {
+          std::function<void()> task;
+          {
+            std::unique_lock<std::mutex> lock(this->queue_mutex);
+            this->condition.wait(
+                lock, [this] { return this->stop || !this->tasks.empty(); });
+            if (this->stop && this->tasks.empty())
+              return;
+            task = std::move(this->tasks.front());
+            this->tasks.pop();
+          }
+          task();
+        }
+      });
+  }

+  template <class F, class... Args>
+  auto enqueue(F &&f, Args &&... args)
+      -> std::future<typename std::result_of<F(Args...)>::type> {
+    using return_type = typename std::result_of<F(Args...)>::type;

-    template<class F, class... Args>
-    auto enqueue(F&& f, Args&&... args)
-        -> std::future<typename std::result_of<F(Args...)>::type>
+    auto task = std::make_shared<std::packaged_task<return_type()>>(
+        std::bind(std::forward<F>(f), std::forward<Args>(args)...));
+
+    std::future<return_type> res = task->get_future();
    {
-        using return_type = typename std::result_of<F(Args...)>::type;
+      std::unique_lock<std::mutex> lock(queue_mutex);

-        auto task = std::make_shared< std::packaged_task<return_type()> >(
-                std::bind(std::forward<F>(f), std::forward<Args>(args)...)
-            );
+      // don't allow enqueueing after stopping the pool
+      if (stop)
+        throw std::runtime_error("enqueue on stopped ThreadPool");

-        std::future<return_type> res = task->get_future();
-        {
-            std::unique_lock<std::mutex> lock(queue_mutex);
-
-            // don't allow enqueueing after stopping the pool
-            if(stop)
-                throw std::runtime_error("enqueue on stopped ThreadPool");
-
-            tasks.emplace([task](){ (*task)(); });
-        }
-        condition.notify_one();
-        return res;
+      tasks.emplace([task]() { (*task)(); });
    }
+    condition.notify_one();
+    return res;
+  }

-
-    ~ThreadPool() {
-        {
-          std::unique_lock<std::mutex> lock(queue_mutex);
-          stop = true;
-        }
-        condition.notify_all();
-        for(std::thread &worker: workers)
-          worker.join();
+  ~ThreadPool() {
+    {
+      std::unique_lock<std::mutex> lock(queue_mutex);
+      stop = true;
    }
-
+    condition.notify_all();
+    for (std::thread &worker : workers)
+      worker.join();
+  }

 private:
-    // need to keep track of threads so we can join them
-    std::vector< std::thread > workers;
-    // the task queue
-    std::queue< std::function<void()> > tasks;
+  // need to keep track of threads so we can join them
+  std::vector<std::thread> workers;
+  // the task queue
+  std::queue<std::function<void()>> tasks;

-    // synchronization
-    std::mutex queue_mutex;
-    std::condition_variable condition;
-    bool stop;
+  // synchronization
+  std::mutex queue_mutex;
+  std::condition_variable condition;
+  bool stop;
 };

-
 #endif