write hipmodule bytes
@@ -501,6 +501,7 @@ void init_triton_codegen(py::module &&m) {
       std::string hipmodule;
       std::string name;
       {
+        std::cout << "triton.cc: compile_ttir_to_amdgpu:" << std::endl;
         // Scope where the GIL is released
         py::gil_scoped_release allow_threads;
         name = ir.get_function_list()[0]->get_name();
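
The compile binding wraps its native work in a scope guarded by pybind11's py::gil_scoped_release, so other Python threads can run while code generation happens. A minimal standalone sketch of that pattern (module and function names here are illustrative, not part of this commit):

#include <pybind11/pybind11.h>
#include <string>
namespace py = pybind11;

// Stand-in for a long-running native compile step.
std::string compile_slowly(const std::string &src) {
  std::string result;
  {
    // Scope where the GIL is released; no Python objects may be touched here.
    py::gil_scoped_release allow_threads;
    result = src + "\n; compiled";  // placeholder work
  }
  // The GIL is re-acquired when allow_threads goes out of scope.
  return result;
}

PYBIND11_MODULE(example_gil, m) {
  m.def("compile_slowly", &compile_slowly);
}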
@@ -524,26 +525,27 @@ void init_triton_codegen(py::module &&m) {
       int version;
       // std::string ptxas_path = drv::path_to_ptxas(version);
       // Triton-IR -> AMDGCN LLVM-IR
+      std::cout << "\t" << ttir.str() << std::endl;
+      std::cout << "\t" << tmp << std::endl;
       triton::codegen::amd_cl_target target;
       auto llvm = triton::codegen::add_passes_to_emit_bin(
           ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map);
       llvm::raw_string_ostream llir(tmp);
       llir << *llvm;
       llir.flush();
-      // LLVM-IR -> AMD HSACO
+      // LLVM-IR -> AMDGPU
       std::string amdgpu = drv::llir_to_amdgpu(llvm.get(), "gfx90a");
-      // HSACO -> GCN
+      std::cout << "amdgpu = " << amdgpu << std::endl;
+      // AMDGPU -> Binary
       hipModule_t hipmodule = drv::amdgpu_to_hipmodule(amdgpu);
+      std::cout << "hipmodule = " << hipmodule << std::endl;
     }
     asm_map_t asm_map;
     asm_map["ttir"] = py::cast(ttir.str());
     asm_map["llir"] = py::cast(tmp);
     asm_map["amdgpu"] = py::cast(amdgpu);
+    asm_map["hipmodule"] = py::bytes(hipmodule);
-    if(!hipmodule.empty()){
-      py::bytes bytes(hipmodule);
-      asm_map["hipmodule"] = bytes;
-    }
     return std::make_tuple(name, asm_map, n_shared_bytes);
   },
   py::return_value_policy::take_ownership);
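
The module bytes are handed back to Python through py::bytes rather than a plain str cast, so arbitrary binary data survives the round trip into asm_map and later into the cache. A small hedged sketch of that mechanism in isolation (the blob contents and names below are made up for illustration):

#include <pybind11/pybind11.h>
#include <string>
namespace py = pybind11;

// Return an opaque binary blob to Python without NUL truncation.
py::bytes blob_as_bytes() {
  // Placeholder bytes, not a real code object; note the embedded NULs.
  std::string blob("\x7f" "ELF\0\0\0\0", 8);
  return py::bytes(blob);  // py::bytes keeps the full length, NULs included
}

PYBIND11_MODULE(example_bytes, m) {
  m.def("blob_as_bytes", &blob_as_bytes);
}

On the Python side, len(example_bytes.blob_as_bytes()) reports 8, whereas casting the same buffer to a Python str would require valid UTF-8 and can fail for arbitrary binary data.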
@@ -893,7 +893,7 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(),
     if output == "ttir":
         return module
 
-    assert (output == "cubin" or output == "hsaco")
+    assert (output == "cubin" or output == "hipmodule")
     if torch.version.hip is not None:
         backend = _triton.runtime.backend.ROCM
     else:
@@ -1285,15 +1285,23 @@ def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: i
 
     if torch.version.hip is not None:
         asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
-                                            extern_libs, "hsaco", cc)
+                                            extern_libs, "hipmodule", cc)
+        # cache AMD assembly and binary
+        fn_cache_manager.put(asm["hipmodule"], cubin_name)
+        fn_cache_manager.put(asm["amdgpu"], ptx_name, binary=False)
     else:
         asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
                                             extern_libs, "cubin", cc)
-    metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages}
+    # cache Nvidia assembly and binary
     fn_cache_manager.put(asm["cubin"], cubin_name)
     fn_cache_manager.put(asm["ptx"], ptx_name, binary=False)
 
+    # cache triton and llvm ir
     fn_cache_manager.put(asm["ttir"], ttir_name, binary=False)
     fn_cache_manager.put(asm["llir"], llir_name, binary=False)
 
+    # cache metadata
+    metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages}
     fn_cache_manager.put(json.dumps(metadata), data_name, binary=False)
 
     if warm_cache_only:
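
Once the "hipmodule" blob is in the cache, a consumer would typically load it back through the HIP module API. A hedged sketch of that loading path (file name and kernel name are placeholders; this is not code added by this commit):

#include <hip/hip_runtime.h>
#include <fstream>
#include <iterator>
#include <vector>

int main() {
  // Read a previously cached code object from disk (path is hypothetical).
  std::ifstream in("kernel.hsaco", std::ios::binary);
  std::vector<char> image((std::istreambuf_iterator<char>(in)),
                          std::istreambuf_iterator<char>());

  // Load the binary image and look up a kernel by name.
  hipModule_t module;
  if (hipModuleLoadData(&module, image.data()) != hipSuccess) return 1;
  hipFunction_t kernel;
  if (hipModuleGetFunction(&kernel, module, "my_kernel") != hipSuccess) return 1;

  // ... launch via hipModuleLaunchKernel(kernel, ...), then release the module.
  hipModuleUnload(module);
  return 0;
}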
@@ -10,8 +10,8 @@ chmod -R 777 $LOG_DIR
 bash scripts/amd/clean.sh
 # bash scripts/amd/deps.sh
 bash scripts/amd/build.sh
-# bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
-bash scripts/amd/debug.sh
+bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
+# bash scripts/amd/debug.sh
 # bash scripts/amd/backtrace.sh 2>&1 |tee $LOG_DIR/backtrace.log
 
 