[GENERAL] Merged v1.0alpha into master. Added features are:

- A100 support via mma.16816
- Thread swizzling for conflict-free shared memory accesses without padding
- Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering
- Added debugging capabilities in the Python binding
- Compilation error for kernels that spill
2021-01-11 19:20:34 -05:00
parent c0bc7ed8b0
commit 083bbd1e8d
75 changed files with 2688 additions and 4512 deletions
--- a/lib/driver/dispatch.cc
+++ b/lib/driver/dispatch.cc
@@ -93,6 +93,7 @@ namespace driver

 bool dispatch::cuinit(){
  if(cuda_==nullptr){
+    putenv((char*)"CUDA_CACHE_DISABLE=1");
    std::string libcuda = tools::getenv("TRITON_LIBCUDA");
    if(libcuda.empty())
      cuda_ = dlopen("libcuda.so", RTLD_LAZY);