[OPTIMIZER] Fixed up order of shared layouts (#881)

Author: Philippe Tillet
Date: 2022-11-21 06:25:02 +01:00
Committed by: GitHub
Parent: 4d64ffb5fe
Commit: 23f71daa27

6 changed files with 27 additions and 27 deletions

@@ -186,9 +186,9 @@ def get_proper_err(a, b, golden):
     [128, 256, 128, 4, 128, 256, 32, False, False],
     [256, 128, 64, 4, 256, 128, 16, False, False],
     [128, 64, 128, 4, 128, 64, 32, False, False],
-    # TODO[goostavz]: fix these cases
-    #[128, 64, 128, 4, 128, 64, 32, True, False],
-    #[128, 64, 128, 4, 128, 64, 32, False, True],
+    # trans
+    [128, 64, 128, 4, 128, 64, 32, True, False],
+    [128, 64, 128, 4, 128, 64, 32, False, True],
 ])
 def test_gemm(SIZE_M, SIZE_N, SIZE_K, NUM_WARPS, BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, TRANS_A, TRANS_B):
     if (TRANS_A):
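
The two re-enabled rows run test_gemm with a transposed A or B operand (the trailing TRANS_A / TRANS_B booleans). A minimal sketch, assuming a torch-based setup, of how such flags are typically materialized into operand layouts; the helper name make_operands is hypothetical and not part of this commit:

    import torch

    # Hypothetical helper (not in this commit): build operands whose memory
    # layout reflects the TRANS_A / TRANS_B flags. Transposing a contiguous
    # tensor yields a non-contiguous (column-major) view, which is the case
    # the re-enabled test rows exercise.
    def make_operands(SIZE_M, SIZE_N, SIZE_K, TRANS_A, TRANS_B):
        if TRANS_A:
            a = torch.randn((SIZE_K, SIZE_M), device='cuda', dtype=torch.float16).T
        else:
            a = torch.randn((SIZE_M, SIZE_K), device='cuda', dtype=torch.float16)
        if TRANS_B:
            b = torch.randn((SIZE_N, SIZE_K), device='cuda', dtype=torch.float16).T
        else:
            b = torch.randn((SIZE_K, SIZE_N), device='cuda', dtype=torch.float16)
        return a, b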

@@ -882,6 +882,7 @@ def ttir_to_ttgir(mod, num_warps, num_stages):
     pm.enable_debug()
     # Convert blocked layout to mma layout for dot ops so that pipeline
     # can get shared memory swizzled correctly.
+    pm.add_coalesce_pass()
     pm.add_triton_gpu_combine_pass()
     pm.add_tritongpu_pipeline_pass(num_stages)
     # Prefetch must be done after pipeline pass because pipeline pass
@@ -889,7 +890,6 @@ def ttir_to_ttgir(mod, num_warps, num_stages):
     pm.add_tritongpu_prefetch_pass()
     pm.add_canonicalizer_pass()
     pm.add_cse_pass()
-    pm.add_coalesce_pass()
     pm.add_triton_gpu_combine_pass()
     pm.add_licm_pass()
     pm.add_triton_gpu_combine_pass()
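
Read together, the two hunks move pm.add_coalesce_pass() from after CSE to before the combine and pipeline passes, so that shared layouts are fixed up before the pipeline pass decides on shared-memory swizzling. A condensed sketch of the resulting pass order, assembled only from the calls visible in these hunks (pass-manager setup and any passes outside the hunks are elided):

    # Pass order after this commit, reconstructed from the hunks above.
    pm.enable_debug()
    pm.add_coalesce_pass()                      # moved here: coalesce before combine/pipeline
    pm.add_triton_gpu_combine_pass()            # blocked -> mma layout for dot ops
    pm.add_tritongpu_pipeline_pass(num_stages)  # relies on correctly swizzled shared memory
    pm.add_tritongpu_prefetch_pass()            # must run after the pipeline pass
    pm.add_canonicalizer_pass()
    pm.add_cse_pass()                           # coalesce is no longer re-run after CSE
    pm.add_triton_gpu_combine_pass()
    pm.add_licm_pass()
    pm.add_triton_gpu_combine_pass()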