Write hipmodule bytes

This commit is contained in:
Michael Melesse
2022-10-24 17:58:25 +00:00
parent eb89e9bdd9
commit 8da4323514
3 changed files with 24 additions and 14 deletions

View File

@@ -501,6 +501,7 @@ void init_triton_codegen(py::module &&m) {
std::string hipmodule;
std::string name;
{
std::cout << "triton.cc: compile_ttir_to_amdgpu:" << std::endl;
// Scope where the GIL is released
py::gil_scoped_release allow_threads;
name = ir.get_function_list()[0]->get_name();
@@ -524,26 +525,27 @@ void init_triton_codegen(py::module &&m) {
int version;
// std::string ptxas_path = drv::path_to_ptxas(version);
// Triton-IR -> AMDGCN LLVM-IR
std::cout << "\t" << ttir.str() << std::endl;
std::cout << "\t" << tmp << std::endl;
triton::codegen::amd_cl_target target;
auto llvm = triton::codegen::add_passes_to_emit_bin(
ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map);
llvm::raw_string_ostream llir(tmp);
llir << *llvm;
llir.flush();
// LLVM-IR -> AMD HSACO
// LLVM-IR -> AMDGPU
std::string amdgpu = drv::llir_to_amdgpu(llvm.get(), "gfx90a");
// HSACO -> GCN
std::cout << "amdgpu = " << amdgpu << std::endl;
// AMDGPU -> Binary
hipModule_t hipmodule = drv::amdgpu_to_hipmodule(amdgpu);
std::cout << "hipmodule = " << hipmodule << std::endl;
}
asm_map_t asm_map;
asm_map["ttir"] = py::cast(ttir.str());
asm_map["llir"] = py::cast(tmp);
asm_map["amdgpu"] = py::cast(amdgpu);
if(!hipmodule.empty()){
py::bytes bytes(hipmodule);
asm_map["hipmodule"] = bytes;
}
asm_map["hipmodule"] = py::bytes(hipmodule);
return std::make_tuple(name, asm_map, n_shared_bytes);
},
py::return_value_policy::take_ownership);

View File

@@ -893,7 +893,7 @@ def _compile(fn, signature: str, device: int = -1, constants=dict(),
if output == "ttir":
return module
assert (output == "cubin" or output == "hsaco")
assert (output == "cubin" or output == "hipmodule")
if torch.version.hip is not None:
backend = _triton.runtime.backend.ROCM
else:
@@ -1285,15 +1285,23 @@ def compile(fn, signature: str, device: int = -1, constants=dict(), num_warps: i
if torch.version.hip is not None:
asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
extern_libs, "hsaco", cc)
extern_libs, "hipmodule", cc)
# cache AMD assembly and binary
fn_cache_manager.put(asm["hipmodule"], cubin_name)
fn_cache_manager.put(asm["amdgpu"], ptx_name, binary=False)
else:
asm, shared, kernel_name = _compile(fn, signature, device, constants, configs[0], num_warps, num_stages,
extern_libs, "cubin", cc)
metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages}
fn_cache_manager.put(asm["cubin"], cubin_name)
fn_cache_manager.put(asm["ptx"], ptx_name, binary=False)
# cache Nvidia assembly and binary
fn_cache_manager.put(asm["cubin"], cubin_name)
fn_cache_manager.put(asm["ptx"], ptx_name, binary=False)
# cache triton and llvm ir
fn_cache_manager.put(asm["ttir"], ttir_name, binary=False)
fn_cache_manager.put(asm["llir"], llir_name, binary=False)
# cache metadata
metadata = {"name": kernel_name, "shared": shared, "num_warps": num_warps, "num_stages": num_stages}
fn_cache_manager.put(json.dumps(metadata), data_name, binary=False)
if warm_cache_only:

View File

@@ -10,8 +10,8 @@ chmod -R 777 $LOG_DIR
bash scripts/amd/clean.sh
# bash scripts/amd/deps.sh
bash scripts/amd/build.sh
# bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
bash scripts/amd/debug.sh
bash scripts/amd/test.sh 2>&1 |tee $LOG_DIR/test.log
# bash scripts/amd/debug.sh
# bash scripts/amd/backtrace.sh 2>&1 |tee $LOG_DIR/backtrace.log