From 6e50f8b2c0b66d59e798a7e75edda80fc39bf8eb Mon Sep 17 00:00:00 2001
From: Michael Melesse
Date: Fri, 28 Oct 2022 17:46:52 +0000
Subject: [PATCH] print irs

---
 python/src/triton.cc                   | 8 ++++++--
 python/test/unit/language/test_core.py | 7 ++++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/python/src/triton.cc b/python/src/triton.cc
index da8b33227..9393ead43 100644
--- a/python/src/triton.cc
+++ b/python/src/triton.cc
@@ -525,18 +525,22 @@ void init_triton_codegen(py::module &&m) {
         int version;
         // std::string ptxas_path = drv::path_to_ptxas(version);
         // Triton-IR -> AMDGCN LLVM-IR
+        std::cout << "ttir:" << std::endl;
         std::cout << "\t" << ttir.str() << std::endl;
-        std::cout << "\t" << tmp << std::endl;
         triton::codegen::amd_cl_target target;
         auto llvm = triton::codegen::add_passes_to_emit_bin(
             ir, ctx, &target, num_warps, num_stages, n_shared_bytes, extern_lib_map);
         llvm::raw_string_ostream llir(tmp);
         llir << *llvm;
+        std::cout << "llir:" << std::endl;
+        std::cout << "\t" << llir.str() << std::endl;
         llir.flush();
         // LLVM-IR -> AMDGPU
-        std::tuple amdgpu = drv::llir_to_amdgcn(llvm.get(), "gfx90a");
+        std::tuple amdgpu = drv::llir_to_amdgcn(llvm.get(), "gfx90a");
         amdgcn = std::get<0>(amdgpu);
         hsaco_path = std::get<1>(amdgpu);
+        std::cout << "amdgcn:" << std::endl;
+        std::cout << "\t" << amdgcn << std::endl;
       }
       asm_map_t asm_map;
       asm_map["ttir"] = py::cast(ttir.str());
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
index 9b037165f..4284baa03 100644
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -113,14 +113,14 @@ def check_type_supported(dtype):
         pytest.skip("bfloat16 is only supported on NVGPU with cc >= 80")


-@pytest.mark.parametrize("dtype_x", [dtype_x for dtype_x in dtypes])
+@pytest.mark.parametrize("dtype_x", [dtype_x for dtype_x in dtypes] + ["bfloat16"])
 def test_empty_kernel(dtype_x, device='cuda'):
     SIZE = 128

     @triton.jit
     def kernel(X, SIZE: tl.constexpr):
         pass
-    # check_type_supported(dtype_x)
+    check_type_supported(dtype_x)
     x = to_triton(numpy_random(SIZE, dtype_str=dtype_x), device=device, dst_type=dtype_x)
     kernel[(1, )](x, SIZE=SIZE, num_warps=4)

@@ -885,9 +885,9 @@ def test_f16_to_f8_rounding():
                           for dtype in dtypes_with_bfloat16
                           for shape in [32, 64, 128, 512]])
 def test_reduce1d(op, dtype_str, shape, device='cuda'):
+    check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
     if torch.version.hip is not None:
         pytest.skip(f"test_reduce1d currently has segfaults on ROCM")
-    check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested

     # triton kernel
     @triton.jit
@@ -946,6 +946,7 @@ reduce_configs2 = [

 @pytest.mark.parametrize("op, dtype_str, shape, axis", reduce_configs1 + reduce_configs2)
 def test_reduce2d(op, dtype_str, shape, axis, device='cuda'):
+    check_type_supported(dtype_str)  # bfloat16 on cc < 80 will not be tested
     if torch.version.hip is not None:
         pytest.skip(f"test_reduce2d currently has segfaults on ROCM")
     # triton kernel
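
Note: the std::cout calls added above run inside the codegen path, so any @triton.jit kernel launch that triggers compilation will dump the Triton IR, LLVM IR, and AMDGCN to stdout. A minimal sketch of how to exercise them (assumes a ROCm build of this branch and a GPU visible to PyTorch; it simply mirrors test_empty_kernel from the patched test file):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def kernel(X, SIZE: tl.constexpr):
        pass

    # The first launch compiles the kernel, which prints ttir/llir/amdgcn
    # via the std::cout statements added in init_triton_codegen.
    x = torch.empty(128, dtype=torch.float32, device='cuda')
    kernel[(1, )](x, SIZE=128, num_warps=4)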