[CODEGEN] Fixed over-aggressive division handling in alignment pass (#280)

2021-09-15 00:40:17 -07:00
parent da5063d898
commit 313d6488f6
4 changed files with 15 additions and 16 deletions
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -52,7 +52,7 @@ def test_matmul(M, N, K):
    cur_sm_clock = nvsmi(['clocks.current.sm'])[0]
    ref_sm_clock = 1350
    max_gpu_perf = 1e-6*80*8*128*cur_sm_clock
-    assert cur_sm_clock == ref_sm_clock, f'GPU SMs must run at {ref_sm_clock} MHz'
+    assert abs(cur_sm_clock - ref_sm_clock) < 5, f'GPU SMs must run at {ref_sm_clock} MHz'
    a = torch.randn((M, K), dtype=torch.float16, device='cuda')
    b = torch.randn((K, N), dtype=torch.float16, device='cuda')
    fn = lambda: triton.ops.matmul(a, b)
@@ -95,7 +95,7 @@ def test_elementwise(N):
    cur_mem_clock = nvsmi(['clocks.current.memory'])[0]
    ref_mem_clock = 877
    max_gpu_perf = 512*2*ref_mem_clock*1e-3
-    assert cur_mem_clock == ref_mem_clock, f'GPU memmory must run at {ref_mem_clock} MHz'
+    assert abs(cur_mem_clock - ref_mem_clock) < 5, f'GPU memmory must run at {ref_mem_clock} MHz'
    z = torch.empty((N, ), dtype=torch.float16, device='cuda')
    x = torch.randn_like(z)
    y = torch.randn_like(z)