diff --git a/lib/driver/llvm.cc b/lib/driver/llvm.cc index f25b763ca..be8200b86 100644 --- a/lib/driver/llvm.cc +++ b/lib/driver/llvm.cc @@ -218,63 +218,6 @@ std::string ptx_to_cubin(const std::string& ptx, int cc) { return cubin; } -//CUmodule ptx_to_cumodule(const std::string& ptx, int cc) { -// // JIT compile source-code -// try{ -// // use ptxas if present in PATH. Otherwise, use JIT from the driver -// std::string ptxas = "ptxas"; -// std::string version; -// int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0; - -// // Use PTXAS via system call -// if(use_system_ptxas){ -// // compile ptx with ptxas -// char _fsrc[] = "/tmp/triton_k_XXXXXX"; -// char _flog[] = "/tmp/triton_l_XXXXXX"; -// mkstemp(_fsrc); -// mkstemp(_flog); -// std::string fsrc = _fsrc; -// std::string flog = _flog; -// std::string fbin = fsrc + ".o"; -// const char* _fbin = fbin.c_str(); -// std::ofstream ofs(fsrc); -// ofs << ptx; -// ofs.close(); -// std::string cmd; -// int err; -// cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog; -// err = system(cmd.c_str()); -// CUmodule ret; -// std::ifstream _cubin(_fbin, std::ios::binary ); -// std::string cubin(std::istreambuf_iterator(_cubin), {}); -// _cubin.close(); -// dispatch::cuModuleLoadData(&ret, cubin.c_str()); -// unlink(_fsrc); -// unlink(_flog); -// unlink(_fbin); -// return ret; -// } - -// // Use PTXAS included in driver -// CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER, -// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER, -// CU_JIT_LOG_VERBOSE}; -// unsigned int errbufsize = 8192; -// unsigned int logbufsize = 8192; -// char _err[errbufsize]; -// char _log[logbufsize]; -// void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1}; -// CUmodule ret; -// dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval); -// return ret; -// } -// catch(exception::cuda::invalid_ptx const &){ -// std::cout << ptx << std::endl; -// std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl; -// throw; -// } -//} - /* ------------------------ */ // HIP // /* ------------------------ */ diff --git a/python/triton/code_gen.py b/python/triton/code_gen.py index dc2b375b8..6e932859e 100644 --- a/python/triton/code_gen.py +++ b/python/triton/code_gen.py @@ -472,6 +472,7 @@ class CodeGenerator(ast.NodeVisitor): def visit_Str(self, node): return triton.language.constexpr(ast.literal_eval(node)) + def visit_Attribute(self, node): lhs = self.visit(node.value) return getattr(lhs, node.attr) diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py index ca8b463fe..d684106f1 100644 --- a/python/tutorials/01-vector-add.py +++ b/python/tutorials/01-vector-add.py @@ -38,8 +38,8 @@ def add_kernel( offsets = block_start + tl.arange(0, BLOCK_SIZE) # Create a mask to guard memory operations against out-of-bounds accesses mask = offsets < n_elements - # Load x and y from DRAM, masking out any extra elements in case - # the input is not a multiple of the block size + # Load x and y from DRAM, masking out any extra elements in case the input is not a + # multiple of the block size x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = x + y