From 8da4323514c02c528e07afc5bc4ef35b56ba8a45 Mon Sep 17 00:00:00 2001 From: Michael Melesse Date: Mon, 24 Oct 2022 17:58:25 +0000 Subject: [PATCH] write hipmodule bytes --- python/src/triton.cc | 16 +++++++++------- python/triton/compiler.py | 18 +++++++++++++----- scripts/amd/run.sh | 4 ++-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/python/src/triton.cc b/python/src/triton.cc index 888ca179c..52867f9fe 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -501,6 +501,7 @@ void init_triton_codegen(py::module &&m) { std::string hipmodule; std::string name; { + std::cout << "triton.cc: compile_ttir_to_amdgpu:" << std::endl; // Scope where the GIL is released py::gil_scoped_release allow_threads; name = ir.get_function_list()[0]->get_name(); @@ -524,26 +525,27 @@ void init_triton_codegen(py::module &&m) { int version; // std::string ptxas_path = drv::path_to_ptxas(version); // Triton-IR -> AMDGCN LLVM-IR + std::cout << "\t" << ttir.str() << std::endl; + std::cout << "\t" << tmp << std::endl; triton::codegen::amd_cl_target target; auto llvm = triton::codegen::add_passes_to_emit_bin( ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map); llvm::raw_string_ostream llir(tmp); llir << *llvm; llir.flush(); - // LLVM-IR -> AMD HSACO + // LLVM-IR -> AMDGPU std::string amdgpu = drv::llir_to_amdgpu(llvm.get(), "gfx90a"); - // HSACO -> GCN + std::cout << "amdgpu = " << amdgpu << std::endl; + // AMDGPU -> Binary hipModule_t hipmodule = drv::amdgpu_to_hipmodule(amdgpu); + std::cout << "hipmodule = " << hipmodule << std::endl; } asm_map_t asm_map; asm_map["ttir"] = py::cast(ttir.str()); asm_map["llir"] = py::cast(tmp); asm_map["amdgpu"] = py::cast(amdgpu); - - if(!hipmodule.empty()){ - py::bytes bytes(hipmodule); - asm_map["hipmodule"] = bytes; - } + asm_map["hipmodule"] = py::bytes(hipmodule); + return std::make_tuple(name, asm_map, n_shared_bytes); }, py::return_value_policy::take_ownership); diff --git a/python/triton/compiler.py b/python/triton/compiler.py index ce5c77e93..c04540ea4 100644 --- a/python/triton/compiler.py +++ b/python/triton/compiler.py @@ -893,7 +893,7 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(), if output == "ttir": return module - assert (output == "cubin" or output == "hsaco") + assert (output == "cubin" or output == "hipmodule") if torch.version.hip is not None: backend = _triton.runtime.backend.ROCM else: @@ -1285,15 +1285,23 @@ def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: i if torch.version.hip is not None: asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages, - extern_libs, "hsaco", cc) + extern_libs, "hipmodule", cc) + # cache AMD assembly and binary + fn_cache_manager.put(asm["hipmodule"], cubin_name) + fn_cache_manager.put(asm["amdgpu"], ptx_name, binary=False) else: asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages, extern_libs, "cubin", cc) - metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages} - fn_cache_manager.put(asm["cubin"], cubin_name) - fn_cache_manager.put(asm["ptx"], ptx_name, binary=False) + # cache Nvidia assembly and binary + fn_cache_manager.put(asm["cubin"], cubin_name) + fn_cache_manager.put(asm["ptx"], ptx_name, binary=False) + + # cache triton and llvm ir fn_cache_manager.put(asm["ttir"], ttir_name, binary=False) fn_cache_manager.put(asm["llir"], llir_name, binary=False) + + # cache metadata + metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages} fn_cache_manager.put(json.dumps(metadata), data_name, binary=False) if warm_cache_only: diff --git a/scripts/amd/run.sh b/scripts/amd/run.sh index fcfaac2a3..3bdea1a4e 100644 --- a/scripts/amd/run.sh +++ b/scripts/amd/run.sh @@ -10,8 +10,8 @@ chmod -R 777 $LOG_DIR bash scripts/amd/clean.sh # bash scripts/amd/deps.sh bash scripts/amd/build.sh -# bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log -bash scripts/amd/debug.sh +bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log +# bash scripts/amd/debug.sh # bash scripts/amd/backtrace.sh 2>&1 |tee $LOG_DIR/backtrace.log