From 33e6f0df7f848b8ec300b7a9b7049a6ec757c455 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Wed, 12 Oct 2022 12:02:30 -0700 Subject: [PATCH] [DRIVER] Bumped CUDA requirement to 11.4+. This is to avoid bad performance surprises as older `ptxas` are much slower. (#769) This also makes codegen simpler by avoiding special handling of eviction policies --- lib/driver/llvm.cc | 628 +++++++++++++++++++++++---------------------- 1 file changed, 322 insertions(+), 306 deletions(-) diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc index 555a2b14e..a73e6541d 100644 --- a/lib/driver/llvm.cc +++ b/lib/driver/llvm.cc @@ -1,27 +1,27 @@ /* Copyright 2015-2017 Philippe Tillet -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files -* (the "Software"), to deal in the Software without restriction, -* including without limitation the rights to use, copy, modify, merge, -* publish, distribute, sublicense, and/or sell copies of the Software, -* and to permit persons to whom the Software is furnished to do so, -* subject to the following conditions: -* -* The above copyright notice and this permission notice shall be -* included in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -*/ + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files + * (the "Software"), to deal in the Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, + * subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ #include #if __has_include() - #include +#include #endif #include #include @@ -59,302 +59,318 @@ #include "llvm/Analysis/TargetLibraryInfo.h" // end AMD stuff -extern "C"{ - int set_curterm(char* nterm){ return 0; } - int del_curterm(char* nterm){ return 0; } +extern "C" +{ + int set_curterm(char *nterm) { return 0; } + int del_curterm(char *nterm) { return 0; } int tigetnum(char *capname) { return 0; } int setupterm(char *term, int fildes, int *errret) { return 0; } } -namespace triton{ -namespace driver{ - -void init_llvm() { - LLVMInitializeNVPTXTargetInfo(); - LLVMInitializeNVPTXTarget(); - LLVMInitializeNVPTXTargetMC(); - LLVMInitializeNVPTXAsmPrinter(); - LLVMInitializeAMDGPUTargetInfo(); - LLVMInitializeAMDGPUTarget(); - LLVMInitializeAMDGPUTargetMC(); - LLVMInitializeAMDGPUAsmPrinter(); -} - - -/* ------------------------ */ -// CUDA // -/* ------------------------ */ -static bool find_and_replace(std::string& str, const std::string& begin, const std::string& end, const std::string& target){ - size_t start_replace = str.find(begin); - size_t end_replace = str.find(end, start_replace); - if(start_replace == std::string::npos) - return false; - str.replace(start_replace, end_replace + 1 - start_replace, target); - return true; -} - -std::string path_to_ptxas(int& version) { - std::vector rets; - std::string ret; - // search paths for ptxas - std::vector ptxas_prefixes = {"", "/usr/local/cuda/bin/"}; - std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH"); - if(!triton_ptxas.empty()) - ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas); - // see what path for ptxas are valid - std::vector working_ptxas; - for(std::string prefix: ptxas_prefixes){ - std::string ptxas = prefix + "ptxas"; - bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0; - if(works) { - working_ptxas.push_back(ptxas); - rets.push_back(ret); - } - } - // error if no working ptxas was found - if(working_ptxas.empty()) - throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, /usr/local/cuda/bin/ or PATH" - " but a working version could not be found."); - std::string ptxas = working_ptxas.front(); - // parse version - std::regex version_regex("release (\\d+)\\.(\\d+)"); - std::smatch match; - bool found = false; - // currently choosing the first ptxas. Other logics can be implemented in future - for(std::string ret : rets) { - if(std::regex_search(ret, match, version_regex)){ - int major = std::stoi(match[1]); - int minor = std::stoi(match[2]); - version = major*1000 + minor*10; - found = true; - break; - } - } - if ( not found) { - throw std::runtime_error("Error in parsing version"); - } - return ptxas; -} - - -int vptx(int version){ - if(version >= 11040) return 74; - if(version >= 11030) return 73; - if(version >= 11020) return 72; - if(version >= 11010) return 71; - if(version >= 11000) return 70; - if(version >= 10020) return 65; - if(version >= 10010) return 64; - if(version >= 10000) return 63; - throw std::runtime_error("Triton requires CUDA 10+"); -} - -std::string llir_to_ptx(llvm::Module* module, int cc, int version){ - // LLVM version in use may not officially support target hardware - int max_nvvm_cc = 75; - int max_nvvm_ptx = 74; - // options - auto options = llvm::cl::getRegisteredOptions(); - auto* short_ptr = static_cast*>(options["nvptx-short-ptr"]); - assert(short_ptr); - short_ptr->setValue(true); - // compute capability - std::string sm = "sm_" + std::to_string(cc); - // max PTX version - int ptx = vptx(version); - int ptx_major = ptx / 10; - int ptx_minor = ptx % 10; - // create - llvm::SmallVector buffer; - std::string triple = "nvptx64-nvidia-cuda"; - std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc)); - std::string layout = ""; - std::string features = ""; - // std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx)); - init_llvm(); - // verify and store llvm - llvm::legacy::PassManager pm; -// pm.add(llvm::createPrintModulePass(llvm::outs())); - pm.add(llvm::createVerifierPass()); - pm.run(*module); - // module->print(llvm::outs(), nullptr); - - // create machine - module->setTargetTriple(triple); - std::string error; - llvm::TargetMachine* machine; - auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetOptions opt; - opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; - opt.UnsafeFPMath = false; - opt.NoInfsFPMath = false; - opt.NoNaNsFPMath = true; - machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, - llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); - // set data layout - if(layout.empty()) - module->setDataLayout(machine->createDataLayout()); - else - module->setDataLayout(layout); - // emit machine code - for (llvm::Function &f : module->functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); - llvm::legacy::PassManager pass; - llvm::raw_svector_ostream stream(buffer); - // emit - machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile); - pass.run(*module); - - // post-process - std::string result(buffer.begin(), buffer.end()); - find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n"); - find_and_replace(result, ".target", "\n", ".target " + sm + "\n"); - while(find_and_replace(result, "\t// begin inline asm", "\n", "")); - while(find_and_replace(result, "\t// end inline asm", "\n", "")); - return result; -} - - -std::string ptx_to_cubin(const std::string& ptx, const std::string& ptxas, int cc) { - // compile ptx with ptxas - char _fsrc[L_tmpnam]; - char _flog[L_tmpnam]; - std::tmpnam(_fsrc); - std::tmpnam(_flog); - std::string fsrc = _fsrc; - std::string flog = _flog; - std::string fbin = fsrc + ".o"; - const char* _fbin = fbin.c_str(); - std::ofstream ofs(fsrc); - ofs << ptx << std::endl; - ofs.close(); - std::string cmd; - int err; - cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog; - err = system(cmd.c_str()); - if(err != 0){ - std::ifstream _log(_flog); - std::string log(std::istreambuf_iterator(_log), {}); - unlink(_fsrc); - unlink(_flog); - throw std::runtime_error("Internal Triton PTX codegen error: \n" + log); - } - std::ifstream _cubin(_fbin, std::ios::binary ); - std::string cubin(std::istreambuf_iterator(_cubin), {}); - _cubin.close(); - unlink(_fsrc); - unlink(_flog); - unlink(_fbin); - return cubin; -} - -/* ------------------------ */ -// HIP // -/* ------------------------ */ - -std::string llir_to_amdgpu(llvm::Module* module, const std::string& _proc) { - init_llvm(); - -// proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo)); -// features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo)); - - // create - llvm::SmallVector buffer; - std::string triple = "amdgcn-amd-amdhsa"; - std::string layout = ""; - std::string features; - std::string proc = "gfx908"; - // verify and store llvm - llvm::legacy::PassManager pm; - pm.add(llvm::createVerifierPass()); - pm.run(*module); - // create machine - module->setTargetTriple(triple); - std::string error; - auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); - llvm::TargetOptions opt; - opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; - opt.UnsafeFPMath = false; - opt.NoInfsFPMath = false; - opt.NoNaNsFPMath = true; - llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, - llvm::Reloc::PIC_, llvm::None, - llvm::CodeGenOpt::Aggressive); - // set data layout - if(layout.empty()) - module->setDataLayout(machine->createDataLayout()); - else - module->setDataLayout(layout); - // emit machine code - for (llvm::Function &f : module->functions()) - f.addFnAttr(llvm::Attribute::AlwaysInline); - llvm::legacy::PassManager pass; - llvm::raw_svector_ostream stream(buffer); - - // create dump files - std::string module_name = module->getModuleIdentifier(); - std::error_code ec; - - // Save GCN ISA binary. - std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o"); - std::unique_ptr isabin_fs( - new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text)); - if (ec) +namespace triton +{ + namespace driver { - std::cout << isabin_path << " was not created. error code: " << ec << std::endl; - } - // emit - machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile); - pass.run(*module); - // Save GCN ISA. - std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn"); - std::string result(buffer.begin(), buffer.end()); - std::ofstream amdgcn(amdgcn_path); - amdgcn << result; - amdgcn.close(); + void init_llvm() + { + LLVMInitializeNVPTXTargetInfo(); + LLVMInitializeNVPTXTarget(); + LLVMInitializeNVPTXTargetMC(); + LLVMInitializeNVPTXAsmPrinter(); + LLVMInitializeAMDGPUTargetInfo(); + LLVMInitializeAMDGPUTarget(); + LLVMInitializeAMDGPUTargetMC(); + LLVMInitializeAMDGPUAsmPrinter(); + } - // generate HASCO file - std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco"); - std::string error_message; - int lld_result = - llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld", - {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path}, - llvm::None, {}, 0, 0, &error_message); - if (lld_result) - { - std::cout << "ld.lld execute fail: " << std::endl; - std::cout << error_message << std::endl; - std::cout << lld_result << std::endl; - } + /* ------------------------ */ + // CUDA // + /* ------------------------ */ + static bool find_and_replace(std::string &str, const std::string &begin, const std::string &end, const std::string &target) + { + size_t start_replace = str.find(begin); + size_t end_replace = str.find(end, start_replace); + if (start_replace == std::string::npos) + return false; + str.replace(start_replace, end_replace + 1 - start_replace, target); + return true; + } - return hsaco_path; -} + std::string path_to_ptxas(int &version) + { + std::vector rets; + std::string ret; + // search paths for ptxas + std::vector ptxas_prefixes = {"", "/usr/local/cuda/bin/"}; + std::string triton_ptxas = tools::getenv("TRITON_PTXAS_PATH"); + if (!triton_ptxas.empty()) + ptxas_prefixes.insert(ptxas_prefixes.begin(), triton_ptxas); + // see what path for ptxas are valid + std::vector working_ptxas; + for (std::string prefix : ptxas_prefixes) + { + std::string ptxas = prefix + "ptxas"; + bool works = tools::exec(ptxas + " --version 2>&1", ret) == 0; + if (works) + { + working_ptxas.push_back(ptxas); + rets.push_back(ret); + } + } + // error if no working ptxas was found + if (working_ptxas.empty()) + throw std::runtime_error("`ptxas` was searched in TRITON_PTXAS_PATH, /usr/local/cuda/bin/ or PATH" + " but a working version could not be found."); + std::string ptxas = working_ptxas.front(); + // parse version + std::regex version_regex("release (\\d+)\\.(\\d+)"); + std::smatch match; + bool found = false; + // currently choosing the first ptxas. Other logics can be implemented in future + for (std::string ret : rets) + { + if (std::regex_search(ret, match, version_regex)) + { + int major = std::stoi(match[1]); + int minor = std::stoi(match[2]); + version = major * 1000 + minor * 10; + found = true; + break; + } + } + if (not found) + { + throw std::runtime_error("Error in parsing version"); + } + return ptxas; + } + int vptx(int version) + { + if (version >= 11040) + return 74; + // if(version >= 11030) return 73; + // if(version >= 11020) return 72; + // if(version >= 11010) return 71; + // if(version >= 11000) return 70; + // if(version >= 10020) return 65; + // if(version >= 10010) return 64; + // if(version >= 10000) return 63; + throw std::runtime_error("Triton requires CUDA 11.4+"); + } -hipModule_t amdgpu_to_hipmodule(const std::string& path) { - // Read HSACO. - std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate); - std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg(); + std::string llir_to_ptx(llvm::Module *module, int cc, int version) + { + // LLVM version in use may not officially support target hardware + int max_nvvm_cc = 75; + int max_nvvm_ptx = 74; + // options + auto options = llvm::cl::getRegisteredOptions(); + auto *short_ptr = static_cast *>(options["nvptx-short-ptr"]); + assert(short_ptr); + short_ptr->setValue(true); + // compute capability + std::string sm = "sm_" + std::to_string(cc); + // max PTX version + int ptx = vptx(version); + int ptx_major = ptx / 10; + int ptx_minor = ptx % 10; + // create + llvm::SmallVector buffer; + std::string triple = "nvptx64-nvidia-cuda"; + std::string proc = "sm_" + std::to_string(std::min(cc, max_nvvm_cc)); + std::string layout = ""; + std::string features = ""; + // std::string features = "+ptx" + std::to_string(std::min(ptx, max_nvvm_ptx)); + init_llvm(); + // verify and store llvm + llvm::legacy::PassManager pm; + // pm.add(llvm::createPrintModulePass(llvm::outs())); + pm.add(llvm::createVerifierPass()); + pm.run(*module); + // module->print(llvm::outs(), nullptr); - std::vector hsaco(hsaco_file_size); - hsaco_file.seekg(0, std::ios::beg); - hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); - hsaco_file.close(); - hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer, + // create machine + module->setTargetTriple(triple); + std::string error; + llvm::TargetMachine *machine; + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetOptions opt; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, + llvm::Reloc::PIC_, llvm::None, llvm::CodeGenOpt::Aggressive); + // set data layout + if (layout.empty()) + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + // emit machine code + for (llvm::Function &f : module->functions()) + f.addFnAttr(llvm::Attribute::AlwaysInline); + llvm::legacy::PassManager pass; + llvm::raw_svector_ostream stream(buffer); + // emit + machine->addPassesToEmitFile(pass, stream, nullptr, llvm::CodeGenFileType::CGFT_AssemblyFile); + pass.run(*module); + + // post-process + std::string result(buffer.begin(), buffer.end()); + find_and_replace(result, ".version", "\n", ".version " + std::to_string(ptx_major) + "." + std::to_string(ptx_minor) + "\n"); + find_and_replace(result, ".target", "\n", ".target " + sm + "\n"); + while (find_and_replace(result, "\t// begin inline asm", "\n", "")) + ; + while (find_and_replace(result, "\t// end inline asm", "\n", "")) + ; + return result; + } + + std::string ptx_to_cubin(const std::string &ptx, const std::string &ptxas, int cc) + { + // compile ptx with ptxas + char _fsrc[L_tmpnam]; + char _flog[L_tmpnam]; + std::tmpnam(_fsrc); + std::tmpnam(_flog); + std::string fsrc = _fsrc; + std::string flog = _flog; + std::string fbin = fsrc + ".o"; + const char *_fbin = fbin.c_str(); + std::ofstream ofs(fsrc); + ofs << ptx << std::endl; + ofs.close(); + std::string cmd; + int err; + cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog; + err = system(cmd.c_str()); + if (err != 0) + { + std::ifstream _log(_flog); + std::string log(std::istreambuf_iterator(_log), {}); + unlink(_fsrc); + unlink(_flog); + throw std::runtime_error("Internal Triton PTX codegen error: \n" + log); + } + std::ifstream _cubin(_fbin, std::ios::binary); + std::string cubin(std::istreambuf_iterator(_cubin), {}); + _cubin.close(); + unlink(_fsrc); + unlink(_flog); + unlink(_fbin); + return cubin; + } + + /* ------------------------ */ + // HIP // + /* ------------------------ */ + + std::string llir_to_amdgpu(llvm::Module *module, const std::string &_proc) + { + init_llvm(); + + // proc = std::get<0>(GetFeatureStrFromGCNArchName(rocminfo)); + // features = std::get<1>(GetFeatureStrFromGCNArchName(rocminfo)); + + // create + llvm::SmallVector buffer; + std::string triple = "amdgcn-amd-amdhsa"; + std::string layout = ""; + std::string features; + std::string proc = "gfx908"; + // verify and store llvm + llvm::legacy::PassManager pm; + pm.add(llvm::createVerifierPass()); + pm.run(*module); + // create machine + module->setTargetTriple(triple); + std::string error; + auto target = llvm::TargetRegistry::lookupTarget(module->getTargetTriple(), error); + llvm::TargetOptions opt; + opt.AllowFPOpFusion = llvm::FPOpFusion::Fast; + opt.UnsafeFPMath = false; + opt.NoInfsFPMath = false; + opt.NoNaNsFPMath = true; + llvm::TargetMachine *machine = target->createTargetMachine(module->getTargetTriple(), proc, features, opt, + llvm::Reloc::PIC_, llvm::None, + llvm::CodeGenOpt::Aggressive); + // set data layout + if (layout.empty()) + module->setDataLayout(machine->createDataLayout()); + else + module->setDataLayout(layout); + // emit machine code + for (llvm::Function &f : module->functions()) + f.addFnAttr(llvm::Attribute::AlwaysInline); + llvm::legacy::PassManager pass; + llvm::raw_svector_ostream stream(buffer); + + // create dump files + std::string module_name = module->getModuleIdentifier(); + std::error_code ec; + + // Save GCN ISA binary. + std::string isabin_path = std::string("/tmp/") + module_name + std::string(".o"); + std::unique_ptr isabin_fs( + new llvm::raw_fd_ostream(isabin_path, ec, llvm::sys::fs::OF_Text)); + if (ec) + { + std::cout << isabin_path << " was not created. error code: " << ec << std::endl; + } + + // emit + machine->addPassesToEmitFile(pass, *isabin_fs, nullptr, llvm::CGFT_ObjectFile); + pass.run(*module); + // Save GCN ISA. + std::string amdgcn_path = std::string("/tmp/") + module_name + std::string(".gcn"); + std::string result(buffer.begin(), buffer.end()); + std::ofstream amdgcn(amdgcn_path); + amdgcn << result; + amdgcn.close(); + + // generate HASCO file + std::string hsaco_path = std::string("/tmp/") + module_name + std::string(".hsaco"); + std::string error_message; + int lld_result = + llvm::sys::ExecuteAndWait("/opt/rocm/llvm/bin/ld.lld", + {"/opt/rocm/llvm/bin/ld.lld", "-flavor", "gnu", "-shared", "-o", hsaco_path, isabin_path}, + llvm::None, {}, 0, 0, &error_message); + if (lld_result) + { + std::cout << "ld.lld execute fail: " << std::endl; + std::cout << error_message << std::endl; + std::cout << lld_result << std::endl; + } + + return hsaco_path; + } + + hipModule_t amdgpu_to_hipmodule(const std::string &path) + { + // Read HSACO. + std::ifstream hsaco_file(path, std::ios::binary | std::ios::ate); + std::ifstream::pos_type hsaco_file_size = hsaco_file.tellg(); + + std::vector hsaco(hsaco_file_size); + hsaco_file.seekg(0, std::ios::beg); + hsaco_file.read(reinterpret_cast(&hsaco[0]), hsaco_file_size); + hsaco_file.close(); + hipJitOption opt[] = {hipJitOptionErrorLogBufferSizeBytes, hipJitOptionErrorLogBuffer, hipJitOptionInfoLogBufferSizeBytes, hipJitOptionInfoLogBuffer, hipJitOptionLogVerbose}; - const unsigned int errbufsize = 8192; - const unsigned int logbufsize = 8192; - char _err[errbufsize]; - char _log[logbufsize]; - void* optval[] = {(void*)(uintptr_t)errbufsize, - (void*)_err, (void*)(uintptr_t)logbufsize, - (void*)_log, (void*)1}; - hipModule_t ret; - dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval); - return ret; -} + const unsigned int errbufsize = 8192; + const unsigned int logbufsize = 8192; + char _err[errbufsize]; + char _log[logbufsize]; + void *optval[] = {(void *)(uintptr_t)errbufsize, + (void *)_err, (void *)(uintptr_t)logbufsize, + (void *)_log, (void *)1}; + hipModule_t ret; + dispatch::hipModuleLoadDataEx(&ret, hsaco.data(), 5, opt, optval); + return ret; + } -} // namespace driver -} // namespace triton + } // namespace driver +} // namespace triton