[GENERAL] Merged v1.0alpha into master. Added features are:
- A100 support via mma.16816
- Thread swizzling for conflict-free shared memory accesses without padding
- Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering
- Added debugging capabilities in the Python binding
- Compilation error for kernels that spill
This commit is contained in:
@@ -93,6 +93,7 @@ namespace driver
|
||||
|
||||
bool dispatch::cuinit(){
|
||||
if(cuda_==nullptr){
|
||||
putenv((char*)"CUDA_CACHE_DISABLE=1");
|
||||
std::string libcuda = tools::getenv("TRITON_LIBCUDA");
|
||||
if(libcuda.empty())
|
||||
cuda_ = dlopen("libcuda.so", RTLD_LAZY);
|
||||
|
Reference in New Issue
Block a user