From ff62f7fffca07687c62c92aefa40a1e875e65d61 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 26 Feb 2021 02:37:05 -0500
Subject: [PATCH] [PYTHON] bugfix in bench_cross_entropy

---
 python/bench/bench_cross_entropy.py |  2 +-
 python/triton/testing.py            | 30 ++++++++----------------------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/python/bench/bench_cross_entropy.py b/python/bench/bench_cross_entropy.py
index 7d98fad9f..238b6b0ec 100644
--- a/python/bench/bench_cross_entropy.py
+++ b/python/bench/bench_cross_entropy.py
@@ -30,7 +30,7 @@ def bench_op(M, N, dtype, mode, provider):
     if mode == 'backward':
         y = op(x, idx)
         dy = torch.randn_like(y)
-        ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True))
+        ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), grad_to_none=x)
     return num_gb / ms * 1e3
 
 if __name__ == '__main__':
diff --git a/python/triton/testing.py b/python/triton/testing.py
index 2415e8f43..0148eb6b9 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -1,24 +1,17 @@
 import torch
 
-
 def sparsify_tensor(x, mask, block):
-    ret = torch.empty(
-        (x.size(0), mask.sum(), block, block), dtype=x.dtype, device=x.device
-    )
+    ret = torch.empty((x.size(0), mask.sum(), block, block), dtype=x.dtype, device=x.device)
     for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))):
-        ret[:, idx, :, :] = x[
-            :, h, i * block : (i + 1) * block, j * block : (j + 1) * block
-        ]
+        ret[:, idx, :, :] = x[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block]
     return ret
 
-
 def mask_tensor(x, mask, block, value=0):
     ret = x.clone()
     for h, i, j in zip(*(mask == 0).nonzero(as_tuple=True)):
-        ret[:, h, i * block : (i + 1) * block, j * block : (j + 1) * block] = value
+        ret[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] = value
     return ret
 
-
 def allclose(x, y):
     assert x.dtype == y.dtype
     diff = abs(x - y)
@@ -28,8 +21,7 @@ def allclose(x, y):
     err = torch.max(diff) / torch.max(x_max, y_max)
     return err < tol
 
-
-def do_bench(fn, flops=0, warmup=10, rep=50):
+def do_bench(fn, flops=0, warmup=10, rep=50, grad_to_none=None):
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
     ret = fn()
@@ -38,17 +30,16 @@
     torch.cuda.synchronize()
     start_event.record()
     for i in range(rep):
+        if grad_to_none is not None:
+            grad_to_none.grad = None
         fn()
     end_event.record()
     torch.cuda.synchronize()
     time_ms = start_event.elapsed_time(end_event) / rep
     return time_ms
 
-
 class Benchmark:
-    def __init__(
-        self, x_names, x_vals, y_name, y_vals, y_lines, ylabel, loglog, plot_name, args
-    ):
+    def __init__(self, x_names, x_vals, y_name, y_vals, y_lines, ylabel, loglog, plot_name, args):
         self.x_names = x_names
         self.x_vals = x_vals
         self.y_name = y_name
@@ -59,7 +50,6 @@ class Benchmark:
         self.plot_name = plot_name
         self.args = args
 
-
 class Mark:
     def __init__(self, fn, benchmarks):
         self.fn = fn
@@ -73,10 +63,7 @@ class Mark:
         df = pd.DataFrame(columns=[bench.x_names[0]] + bench.y_lines)
         for x in bench.x_vals:
             x_args = {x_name: x for x_name in bench.x_names}
-            row = [
-                self.fn(**x_args, **{bench.y_name: y}, **bench.args)
-                for y in bench.y_vals
-            ]
+            row = [self.fn(**x_args, **{bench.y_name: y}, **bench.args) for y in bench.y_vals]
             df.loc[len(df)] = [x] + row
         if with_plot and bench.plot_name:
             xlabel = " = ".join(bench.x_names)
@@ -93,7 +80,6 @@ class Mark:
         for bench in self.benchmarks:
             self._run(bench, result_path, with_plot)
 
-
 def perf_report(benchmarks):
     wrapper = lambda fn: Mark(fn, benchmarks)
     return wrapper
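
Why the benchmark needed grad_to_none: calling y.backward(dy, retain_graph=True) repeatedly accumulates into x.grad, so every rep after the first also pays for PyTorch's gradient-accumulation kernel and an extra read of the existing x.grad buffer, skewing the GB/s number. With grad_to_none=x, do_bench resets x.grad to None before each timed call so every rep runs the same code path. Below is a minimal usage sketch, not part of the patch: it assumes a CUDA device and the patched triton.testing.do_bench, and uses a softmax backward purely as a stand-in for the cross-entropy op benchmarked in bench_cross_entropy.py.

# Sketch only: assumes a CUDA device and the patched triton.testing.do_bench above.
import torch
import triton

x = torch.randn(8192, 1024, device='cuda', requires_grad=True)
y = torch.softmax(x, dim=-1)
dy = torch.randn_like(y)

# Without grad_to_none=x, every rep after the first would hit the
# gradient-accumulation path because x.grad already exists.
ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), grad_to_none=x)
print('backward: %.3f ms' % ms)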