Merge branch 'master' into v2.0
This commit is contained in:
@@ -218,63 +218,6 @@ std::string ptx_to_cubin(const std::string& ptx, int cc) {
|
|||||||
return cubin;
|
return cubin;
|
||||||
}
|
}
|
||||||
|
|
||||||
//CUmodule ptx_to_cumodule(const std::string& ptx, int cc) {
|
|
||||||
// // JIT compile source-code
|
|
||||||
// try{
|
|
||||||
// // use ptxas if present in PATH. Otherwise, use JIT from the driver
|
|
||||||
// std::string ptxas = "ptxas";
|
|
||||||
// std::string version;
|
|
||||||
// int use_system_ptxas = tools::exec(ptxas + " --version 2>&1", version) == 0;
|
|
||||||
|
|
||||||
// // Use PTXAS via system call
|
|
||||||
// if(use_system_ptxas){
|
|
||||||
// // compile ptx with ptxas
|
|
||||||
// char _fsrc[] = "/tmp/triton_k_XXXXXX";
|
|
||||||
// char _flog[] = "/tmp/triton_l_XXXXXX";
|
|
||||||
// mkstemp(_fsrc);
|
|
||||||
// mkstemp(_flog);
|
|
||||||
// std::string fsrc = _fsrc;
|
|
||||||
// std::string flog = _flog;
|
|
||||||
// std::string fbin = fsrc + ".o";
|
|
||||||
// const char* _fbin = fbin.c_str();
|
|
||||||
// std::ofstream ofs(fsrc);
|
|
||||||
// ofs << ptx;
|
|
||||||
// ofs.close();
|
|
||||||
// std::string cmd;
|
|
||||||
// int err;
|
|
||||||
// cmd = ptxas + " -v --gpu-name=sm_" + std::to_string(cc) + " " + fsrc + " -o " + fsrc + ".o 2> " + flog;
|
|
||||||
// err = system(cmd.c_str());
|
|
||||||
// CUmodule ret;
|
|
||||||
// std::ifstream _cubin(_fbin, std::ios::binary );
|
|
||||||
// std::string cubin(std::istreambuf_iterator<char>(_cubin), {});
|
|
||||||
// _cubin.close();
|
|
||||||
// dispatch::cuModuleLoadData(&ret, cubin.c_str());
|
|
||||||
// unlink(_fsrc);
|
|
||||||
// unlink(_flog);
|
|
||||||
// unlink(_fbin);
|
|
||||||
// return ret;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// // Use PTXAS included in driver
|
|
||||||
// CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER,
|
|
||||||
// CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, CU_JIT_INFO_LOG_BUFFER,
|
|
||||||
// CU_JIT_LOG_VERBOSE};
|
|
||||||
// unsigned int errbufsize = 8192;
|
|
||||||
// unsigned int logbufsize = 8192;
|
|
||||||
// char _err[errbufsize];
|
|
||||||
// char _log[logbufsize];
|
|
||||||
// void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)_err, (void*)(uintptr_t)logbufsize, (void*)_log, (void*)1};
|
|
||||||
// CUmodule ret;
|
|
||||||
// dispatch::cuModuleLoadDataEx(&ret, ptx.data(), 5, opt, optval);
|
|
||||||
// return ret;
|
|
||||||
// }
|
|
||||||
// catch(exception::cuda::invalid_ptx const &){
|
|
||||||
// std::cout << ptx << std::endl;
|
|
||||||
// std::cerr << "It appears that Triton produced invalid PTX code:" << std::endl;
|
|
||||||
// throw;
|
|
||||||
// }
|
|
||||||
//}
|
|
||||||
|
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
// HIP //
|
// HIP //
|
||||||
/* ------------------------ */
|
/* ------------------------ */
|
||||||
|
@@ -472,6 +472,7 @@ class CodeGenerator(ast.NodeVisitor):
|
|||||||
def visit_Str(self, node):
|
def visit_Str(self, node):
|
||||||
return triton.language.constexpr(ast.literal_eval(node))
|
return triton.language.constexpr(ast.literal_eval(node))
|
||||||
|
|
||||||
|
|
||||||
def visit_Attribute(self, node):
|
def visit_Attribute(self, node):
|
||||||
lhs = self.visit(node.value)
|
lhs = self.visit(node.value)
|
||||||
return getattr(lhs, node.attr)
|
return getattr(lhs, node.attr)
|
||||||
|
@@ -38,8 +38,8 @@ def add_kernel(
|
|||||||
offsets = block_start + tl.arange(0, BLOCK_SIZE)
|
offsets = block_start + tl.arange(0, BLOCK_SIZE)
|
||||||
# Create a mask to guard memory operations against out-of-bounds accesses
|
# Create a mask to guard memory operations against out-of-bounds accesses
|
||||||
mask = offsets < n_elements
|
mask = offsets < n_elements
|
||||||
# Load x and y from DRAM, masking out any extra elements in case
|
# Load x and y from DRAM, masking out any extra elements in case the input is not a
|
||||||
# the input is not a multiple of the block size
|
# multiple of the block size
|
||||||
x = tl.load(x_ptr + offsets, mask=mask)
|
x = tl.load(x_ptr + offsets, mask=mask)
|
||||||
y = tl.load(y_ptr + offsets, mask=mask)
|
y = tl.load(y_ptr + offsets, mask=mask)
|
||||||
output = x + y
|
output = x + y
|
||||||
|
Reference in New Issue
Block a user