diff --git a/python/test/regression/test_performance.py b/python/test/regression/test_performance.py
index 16811eaa9..afec19019 100644
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -128,7 +128,7 @@ elementwise_data = {
     1024 * 16: 0.0219,
     1024 * 64: 0.0791,
     1024 * 256: 0.243,
-    1024 * 1024: 0.534,
+    1024 * 1024: 0.530,
     1024 * 4096: 0.796,
     1024 * 16384: 0.905,
     1024 * 65536: 0.939,
diff --git a/python/triton/testing.py b/python/triton/testing.py
index b474baaf8..c83c1e682 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -115,7 +115,9 @@ def nvsmi(attrs):
     return ret
 
 
-def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0.8], record_clocks=False):
+def do_bench(fn, warmup=25, rep=100, grad_to_none=None,
+             percentiles=(0.5, 0.2, 0.8),
+             record_clocks=False, fast_flush=False):
     """
     Benchmark the runtime of the provided function. By default, return the median runtime of :code:`fn` along with
     the 20-th and 80-th performance percentile.
@@ -130,6 +132,8 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0
     :type grad_to_none: torch.tensor, optional
     :param percentiles: Performance percentile to return in addition to the median.
     :type percentiles: list[float]
+    :param fast_flush: Use faster kernel to flush L2 between measurements
+    :type fast_flush: bool
     """
 
     # Estimate the runtime of the function
@@ -151,7 +155,10 @@ def do_bench(fn, warmup=25, rep=100, grad_to_none=None, percentiles=[0.5, 0.2, 0
     # doesn't contain any input data before the run
     start_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
     end_event = [torch.cuda.Event(enable_timing=True) for i in range(n_repeat)]
-    cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')
+    if fast_flush:
+        cache = torch.empty(int(256e6 // 4), dtype=torch.int, device='cuda')
+    else:
+        cache = torch.empty(int(256e6), dtype=torch.int8, device='cuda')
     # Warm-up
     for _ in range(n_warmup):
         fn()
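For reference, a minimal sketch of how the new flag would be used. Assumptions not shown in the diff: `from triton.testing import do_bench` works as written, the default `percentiles=(0.5, 0.2, 0.8)` yields a `(median, 20th, 80th)` tuple of millisecond timings, and the flush buffer is zeroed before each timed call elsewhere in `do_bench` (outside this hunk). The tensors `x` and `y` are a hypothetical workload.

```python
import torch
from triton.testing import do_bench

# Hypothetical workload: a memory-bound elementwise add, whose timing is
# sensitive to whether its inputs are still resident in L2 between runs.
x = torch.randn(1024 * 1024, device='cuda')
y = torch.randn_like(x)

# fast_flush=True allocates the 256 MB L2-flush buffer as int32 rather than
# int8: the same number of bytes is written per flush, but the fill kernel
# touches a quarter as many elements, so each measurement pays less overhead.
med_ms, p20_ms, p80_ms = do_bench(lambda: x + y, fast_flush=True)
print(f"median {med_ms:.4f} ms (20th {p20_ms:.4f}, 80th {p80_ms:.4f})")
```

Keeping the default at `fast_flush=False` preserves the existing int8 flush behavior for callers that don't opt in.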