[OPTIMIZER] Fixed up order of shared layouts (#881)

Author: Philippe Tillet
Date: 2022-11-21 06:25:02 +01:00
Committed by: GitHub
Parent: 4d64ffb5fe
Commit: 23f71daa27

6 changed files with 27 additions and 27 deletions

@@ -186,9 +186,9 @@ def get_proper_err(a, b, golden):
     [128, 256, 128, 4, 128, 256, 32, False, False],
     [256, 128, 64, 4, 256, 128, 16, False, False],
     [128, 64, 128, 4, 128, 64, 32, False, False],
-    # TODO[goostavz]: fix these cases
-    #[128, 64, 128, 4, 128, 64, 32, True, False],
-    #[128, 64, 128, 4, 128, 64, 32, False, True],
+    # trans
+    [128, 64, 128, 4, 128, 64, 32, True, False],
+    [128, 64, 128, 4, 128, 64, 32, False, True],
 ])
 def test_gemm(SIZE_M, SIZE_N, SIZE_K, NUM_WARPS, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, TRANS_A, TRANS_B):
     if (TRANS_A):
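
The two re-enabled rows run test_gemm with a transposed A or B operand (the trailing TRANS_A / TRANS_B booleans). A minimal sketch, assuming a torch-based setup, of how such flags are typically materialized into operand layouts; the helper name make_operands is hypothetical and not part of this commit:

    import torch

    # Hypothetical helper (not in this commit): build operands whose memory
    # layout reflects the TRANS_A / TRANS_B flags. Transposing a contiguous
    # tensor yields a non-contiguous (column-major) view, which is the case
    # the re-enabled test rows exercise.
    def make_operands(SIZE_M, SIZE_N, SIZE_K, TRANS_A, TRANS_B):
        if TRANS_A:
            a = torch.randn((SIZE_K, SIZE_M), device='cuda', dtype=torch.float16).T
        else:
            a = torch.randn((SIZE_M, SIZE_K), device='cuda', dtype=torch.float16)
        if TRANS_B:
            b = torch.randn((SIZE_N, SIZE_K), device='cuda', dtype=torch.float16).T
        else:
            b = torch.randn((SIZE_K, SIZE_N), device='cuda', dtype=torch.float16)
        return a, b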

@@ -882,6 +882,7 @@ def ttir_to_ttgir(mod, num_warps, num_stages):
     pm.enable_debug()
     # Convert blocked layout to mma layout for dot ops so that pipeline
     # can get shared memory swizzled correctly.
+    pm.add_coalesce_pass()
     pm.add_triton_gpu_combine_pass()
     pm.add_tritongpu_pipeline_pass(num_stages)
     # Prefetch must be done after pipeline pass because pipeline pass
@@ -889,7 +890,6 @@ def ttir_to_ttgir(mod, num_warps, num_stages):
     pm.add_tritongpu_prefetch_pass()
     pm.add_canonicalizer_pass()
     pm.add_cse_pass()
-    pm.add_coalesce_pass()
     pm.add_triton_gpu_combine_pass()
     pm.add_licm_pass()
     pm.add_triton_gpu_combine_pass()
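
Read together, the two hunks move pm.add_coalesce_pass() from after CSE to before the combine and pipeline passes, so that shared layouts are fixed up before the pipeline pass decides on shared-memory swizzling. A condensed sketch of the resulting pass order, assembled only from the calls visible in these hunks (pass-manager setup and any passes outside the hunks are elided):

    # Pass order after this commit, reconstructed from the hunks above.
    pm.enable_debug()
    pm.add_coalesce_pass()                      # moved here: coalesce before combine/pipeline
    pm.add_triton_gpu_combine_pass()            # blocked -> mma layout for dot ops
    pm.add_tritongpu_pipeline_pass(num_stages)  # relies on correctly swizzled shared memory
    pm.add_tritongpu_prefetch_pass()            # must run after the pipeline pass
    pm.add_canonicalizer_pass()
    pm.add_cse_pass()                           # coalesce is no longer re-run after CSE
    pm.add_triton_gpu_combine_pass()
    pm.add_licm_pass()
    pm.add_triton_gpu_combine_pass()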