[DOCS] Improved plots in tutorials

2021-03-11 00:29:16 -05:00
parent eacbb73968
commit 50e58d73db
8 changed files with 122 additions and 82 deletions
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -147,33 +147,35 @@ print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.
 # Benchmarking
 # --------------------------
 # We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
+# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom op
+# for different problem sizes.

-import matplotlib.pyplot as plt

-# There are three tensors of 4N bytes each. So the bandwidth of a given kernel
-# is 12N / time_ms * 1e-6 GB/s
-gbps = lambda N, ms: 12 * N / ms * 1e-6
-# We want to benchmark small and large vector alike
-sizes = [2**i for i in range(12, 25, 1)]
-triton_bw = []
-torch_bw = []
-for N in sizes:
-    x = torch.rand(N, device='cuda', dtype=torch.float32)
-    y = torch.rand(N, device='cuda', dtype=torch.float32)
-    # Triton provide a do_bench utility function that can be used to benchmark
-    # arbitrary workloads. It supports a `warmup` parameter that is used to stabilize
-    # GPU clock speeds as well as a `rep` parameter that controls the number of times
-    # the benchmark is repeated. Importantly, we set `clear_l2 = True` to make sure
-    # that the L2 cache does not contain any element of x before each kernel call when
-    # N is small.
-    do_bench = lambda fn: gbps(N, triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-    triton_bw += [do_bench(lambda: add(x, y))]
-    torch_bw += [do_bench(lambda: x + y)]
-# We plot the results as a semi-log
-plt.semilogx(sizes, triton_bw, label='Triton')
-plt.semilogx(sizes, torch_bw, label='Torch')
-plt.legend()
-plt.show()
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['size'],  # argument names to use as an x-axis for the plot
+        x_vals=[2**i for i in range(12, 28, 1)],  # different possible values for `x_names`
+        x_log=True,  # x axis is logarithmic
+        y_name='provider',  # argument name whose value corresponds to a different line in the plot
+        y_vals=['torch', 'triton'],  # possible keys for `y_name`
+        y_lines=["Torch", "Triton"],  # label name for the lines
+        ylabel="GB/s",  # label name for the y-axis
+        plot_name="vector-add-performance",  # name for the plot. Used also as a file name for saving the plot.
+        args={}  # values for function arguments not in `x_names` and `y_name`
+    )
+)
+def benchmark(size, provider):
+    x = torch.rand(size, device='cuda', dtype=torch.float32)
+    y = torch.rand(size, device='cuda', dtype=torch.float32)
+    if provider == 'torch':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
+    if provider == 'triton':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
+    gbps = lambda ms: 12 * size / ms * 1e-6
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+

 # %%
-# Seems like our simple element-wise operation operates at peak bandwidth. While this is a fairly low bar for a custom GPU programming language, this is a good start before we move to more advanced operations.
+# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
+# `save_path='/path/to/results/'` to save them to disk along with raw CSV data.
+benchmark.run(show_plots=True)
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -179,27 +179,32 @@ print(torch.allclose(y_tri, y_ref))
 # Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
 # We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.

-import matplotlib.pyplot as plt

-M = 4096
-Ns = [256 * i for i in range(2, 50)]
-tri_bw = []
-ref_bw = []
-def_bw = []
-for N in Ns:
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['N'],  # argument names to use as an x-axis for the plot
+        x_vals=[256 * i for i in range(2, 50)],  # different possible values for `x_names`
+        y_name='provider',  # argument name whose value corresponds to a different line in the plot
+        y_vals=['torch', 'triton', 'naive'],  # possible keys for `y_name`
+        y_lines=["Torch", "Triton", 'Naive'],  # label name for the lines
+        ylabel="GB/s",  # label name for the y-axis
+        plot_name="softmax-performance",  # name for the plot. Used also as a file name for saving the plot.
+        args={'M': 4096}  # values for function arguments not in `x_names` and `y_name`
+    )
+)
+def benchmark(M, N, provider):
    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
-    gbps = lambda ms: x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
-    do_bench = lambda fn: gbps(triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-    tri_bw += [do_bench(lambda: softmax(x))]
-    ref_bw += [do_bench(lambda: torch.softmax(x, axis=1))]
-    def_bw += [do_bench(lambda: naive_softmax(x))]
-plt.xlabel('N')
-plt.ylabel('Bandwidth (GB/s)')
-plt.plot(Ns, tri_bw, label='Triton')
-plt.plot(Ns, ref_bw, label='Torch')
-plt.plot(Ns, def_bw, label='Naive')
-plt.legend()
-plt.show()
+    if provider == 'torch':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
+    if provider == 'triton':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))
+    if provider == 'naive':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))
+    gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+benchmark.run(show_plots=True)

 # %%
 # In the above plot, we can see that: