From ff62f7fffca07687c62c92aefa40a1e875e65d61 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Fri, 26 Feb 2021 02:37:05 -0500
Subject: [PATCH] [PYTHON] bugfix in bench_cross_entropy

---
 python/bench/bench_cross_entropy.py |  2 +-
 python/triton/testing.py            | 30 ++++++++----------------------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/python/bench/bench_cross_entropy.py b/python/bench/bench_cross_entropy.py
index 7d98fad9f..238b6b0ec 100644
--- a/python/bench/bench_cross_entropy.py
+++ b/python/bench/bench_cross_entropy.py
@@ -30,7 +30,7 @@ def bench_op(M, N, dtype, mode, provider):
     if mode == 'backward':
         y = op(x, idx)
         dy = torch.randn_like(y)
-        ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True))
+        ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), grad_to_none=x)
     return num_gb / ms * 1e3
 
 if __name__ == '__main__':
diff --git a/python/triton/testing.py b/python/triton/testing.py
index 2415e8f43..0148eb6b9 100644
--- a/python/triton/testing.py
+++ b/python/triton/testing.py
@@ -1,24 +1,17 @@
 import torch
 
-
 def sparsify_tensor(x, mask, block):
-    ret = torch.empty(
-        (x.size(0), mask.sum(), block, block), dtype=x.dtype, device=x.device
-    )
+    ret = torch.empty((x.size(0), mask.sum(), block, block), dtype=x.dtype, device=x.device)
     for idx, (h, i, j) in enumerate(zip(*mask.nonzero(as_tuple=True))):
-        ret[:, idx, :, :] = x[
-            :, h, i * block : (i + 1) * block, j * block : (j + 1) * block
-        ]
+        ret[:, idx, :, :] = x[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block]
     return ret
 
-
 def mask_tensor(x, mask, block, value=0):
     ret = x.clone()
     for h, i, j in zip(*(mask == 0).nonzero(as_tuple=True)):
-        ret[:, h, i * block : (i + 1) * block, j * block : (j + 1) * block] = value
+        ret[:, h, i * block:(i + 1) * block, j * block:(j + 1) * block] = value
     return ret
 
-
 def allclose(x, y):
     assert x.dtype == y.dtype
     diff = abs(x - y)
@@ -28,8 +21,7 @@ def allclose(x, y):
     err = torch.max(diff) / torch.max(x_max, y_max)
     return err < tol
 
-
-def do_bench(fn, flops=0, warmup=10, rep=50):
+def do_bench(fn, flops=0, warmup=10, rep=50, grad_to_none=None):
     start_event = torch.cuda.Event(enable_timing=True)
     end_event = torch.cuda.Event(enable_timing=True)
     ret = fn()
@@ -38,17 +30,16 @@
     torch.cuda.synchronize()
     start_event.record()
     for i in range(rep):
+        if grad_to_none is not None:
+            grad_to_none.grad = None
         fn()
     end_event.record()
     torch.cuda.synchronize()
     time_ms = start_event.elapsed_time(end_event) / rep
     return time_ms
 
-
 class Benchmark:
-    def __init__(
-        self, x_names, x_vals, y_name, y_vals, y_lines, ylabel, loglog, plot_name, args
-    ):
+    def __init__(self, x_names, x_vals, y_name, y_vals, y_lines, ylabel, loglog, plot_name, args):
         self.x_names = x_names
         self.x_vals = x_vals
         self.y_name = y_name
@@ -59,7 +50,6 @@ class Benchmark:
         self.plot_name = plot_name
         self.args = args
 
-
 class Mark:
     def __init__(self, fn, benchmarks):
         self.fn = fn
@@ -73,10 +63,7 @@ class Mark:
         df = pd.DataFrame(columns=[bench.x_names[0]] + bench.y_lines)
         for x in bench.x_vals:
             x_args = {x_name: x for x_name in bench.x_names}
-            row = [
-                self.fn(**x_args, **{bench.y_name: y}, **bench.args)
-                for y in bench.y_vals
-            ]
+            row = [self.fn(**x_args, **{bench.y_name: y}, **bench.args) for y in bench.y_vals]
             df.loc[len(df)] = [x] + row
         if with_plot and bench.plot_name:
             xlabel = " = ".join(bench.x_names)
@@ -93,7 +80,6 @@ class Mark:
         for bench in self.benchmarks:
             self._run(bench, result_path, with_plot)
 
-
 def perf_report(benchmarks):
     wrapper = lambda fn: Mark(fn, benchmarks)
     return wrapper
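
Why the benchmark needed grad_to_none: calling y.backward(dy, retain_graph=True) repeatedly accumulates into x.grad, so every rep after the first also pays for PyTorch's gradient-accumulation kernel and an extra read of the existing x.grad buffer, skewing the GB/s number. With grad_to_none=x, do_bench resets x.grad to None before each timed call so every rep runs the same code path. Below is a minimal usage sketch, not part of the patch: it assumes a CUDA device and the patched triton.testing.do_bench, and uses a softmax backward purely as a stand-in for the cross-entropy op benchmarked in bench_cross_entropy.py.

# Sketch only: assumes a CUDA device and the patched triton.testing.do_bench above.
import torch
import triton

x = torch.randn(8192, 1024, device='cuda', requires_grad=True)
y = torch.softmax(x, dim=-1)
dy = torch.randn_like(y)

# Without grad_to_none=x, every rep after the first would hit the
# gradient-accumulation path because x.grad already exists.
ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True), grad_to_none=x)
print('backward: %.3f ms' % ms)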