[GH-PAGES] Updated website

This commit is contained in:
Philippe Tillet
2021-03-11 11:58:42 -05:00
parent 7a7ed5da3b
commit 8316c4bbb1
17 changed files with 205 additions and 164 deletions

View File

@@ -122,7 +122,7 @@
     },
     "outputs": [],
     "source": [
-        "import matplotlib.pyplot as plt\n\nM = 4096\nNs = [256 * i for i in range(2, 50)]\ntri_bw = []\nref_bw = []\ndef_bw = []\nfor N in Ns:\n    x = torch.randn(M, N, device='cuda', dtype=torch.float32)\n    gbps = lambda ms: x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)\n    do_bench = lambda fn: gbps(triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))\n    tri_bw += [do_bench(lambda: softmax(x))]\n    ref_bw += [do_bench(lambda: torch.softmax(x, axis=1))]\n    def_bw += [do_bench(lambda: naive_softmax(x))]\nplt.xlabel('N')\nplt.ylabel('Bandwidth (GB/s)')\nplt.plot(Ns, tri_bw, label='Triton')\nplt.plot(Ns, ref_bw, label='Torch')\nplt.plot(Ns, def_bw, label='Naive')\nplt.legend()\nplt.show()"
+        "@triton.testing.perf_report(\n    triton.testing.Benchmark(\n        x_names=['N'],  # argument names to use as an x-axis for the plot\n        x_vals=[256 * i for i in range(2, 50)],  # different possible values for `x_name`\n        y_name='provider',  # argument name whose value corresponds to a different line in the plot\n        y_vals=['torch', 'triton', 'naive'],  # possible keys for `y_name`\n        y_lines=[\"Torch\", \"Triton\", 'Naive'],  # label name for the lines\n        ylabel=\"GB/s\",  # label name for the y-axis\n        plot_name=\"softmax-performance\",  # name for the plot. Used also as a file name for saving the plot.\n        args={'M': 4096}  # values for function arguments not in `x_names` and `y_name`\n    )\n)\ndef benchmark(M, N, provider):\n    x = torch.randn(M, N, device='cuda', dtype=torch.float32)\n    if provider == 'torch':\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))\n    if provider == 'triton':\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))\n    if provider == 'naive':\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))\n    gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)\n    return gbps(ms), gbps(max_ms), gbps(min_ms)\n\n\nbenchmark.run(show_plots=True)"
     ]
 },
 {
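
Aside: as the replacement cell above shows, `triton.testing.do_bench` in this version returns a `(ms, min_ms, max_ms)` triple of timings in milliseconds. A minimal standalone usage sketch, assuming a CUDA device is available:

    import torch
    import triton

    x = torch.rand(2**20, device='cuda')
    y = torch.rand(2**20, device='cuda')
    # Time an arbitrary callable; all three values are in milliseconds.
    ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
    print(f'{ms:.4f} ms (min {min_ms:.4f} ms, max {max_ms:.4f} ms)')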

View File

@@ -147,33 +147,35 @@ print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.
 # Benchmarking
 # --------------------------
 # We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
+# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom op
+# for different problem sizes.
 
-import matplotlib.pyplot as plt
-
-# There are three tensors of 4N bytes each. So the bandwidth of a given kernel
-# is 12N / time_ms * 1e-6 GB/s
-gbps = lambda N, ms: 12 * N / ms * 1e-6
-# We want to benchmark small and large vector alike
-sizes = [2**i for i in range(12, 25, 1)]
-triton_bw = []
-torch_bw = []
-for N in sizes:
-    x = torch.rand(N, device='cuda', dtype=torch.float32)
-    y = torch.rand(N, device='cuda', dtype=torch.float32)
-    # Triton provide a do_bench utility function that can be used to benchmark
-    # arbitrary workloads. It supports a `warmup` parameter that is used to stabilize
-    # GPU clock speeds as well as a `rep` parameter that controls the number of times
-    # the benchmark is repeated. Importantly, we set `clear_l2 = True` to make sure
-    # that the L2 cache does not contain any element of x before each kernel call when
-    # N is small.
-    do_bench = lambda fn: gbps(N, triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-    triton_bw += [do_bench(lambda: add(x, y))]
-    torch_bw += [do_bench(lambda: x + y)]
-# We plot the results as a semi-log
-plt.semilogx(sizes, triton_bw, label='Triton')
-plt.semilogx(sizes, torch_bw, label='Torch')
-plt.legend()
-plt.show()
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['size'],  # argument names to use as an x-axis for the plot
+        x_vals=[2**i for i in range(12, 28, 1)],  # different possible values for `x_name`
+        x_log=True,  # x axis is logarithmic
+        y_name='provider',  # argument name whose value corresponds to a different line in the plot
+        y_vals=['torch', 'triton'],  # possible keys for `y_name`
+        y_lines=["Torch", "Triton"],  # label name for the lines
+        ylabel="GB/s",  # label name for the y-axis
+        plot_name="vector-add-performance",  # name for the plot. Used also as a file name for saving the plot.
+        args={}  # values for function arguments not in `x_names` and `y_name`
+    )
+)
+def benchmark(size, provider):
+    x = torch.rand(size, device='cuda', dtype=torch.float32)
+    y = torch.rand(size, device='cuda', dtype=torch.float32)
+    if provider == 'torch':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
+    if provider == 'triton':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
+    gbps = lambda ms: 12 * size / ms * 1e-6
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
 
 # %%
-# Seems like our simple element-wise operation operates at peak bandwidth. While this is a fairly low bar for a custom GPU programming language, this is a good start before we move to more advanced operations.
+# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
+# `save_path='/path/to/results/'` to save them to disk along with raw CSV data.
+benchmark.run(show_plots=True)
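
Aside: to make the new `12 * size / ms * 1e-6` conversion concrete, vector addition touches three float32 tensors (two reads, one write) of 4 * size bytes each, so each call moves 12 * size bytes. A back-of-the-envelope sketch with a made-up timing value:

    size = 2**26                    # largest problem size in the sweep above
    ms = 3.0                        # hypothetical kernel time, in milliseconds
    bytes_moved = 12 * size         # 3 tensors x 4 bytes per float32 element
    gbs = bytes_moved / ms * 1e-6   # bytes/ms -> GB/s, with 1 GB = 1e9 bytes
    print(f'{gbs:.1f} GB/s')        # ~268.4 GB/s for these made-up numbers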

View File

@@ -179,27 +179,32 @@ print(torch.allclose(y_tri, y_ref))
 # Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
 # We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.
 
-import matplotlib.pyplot as plt
-
-M = 4096
-Ns = [256 * i for i in range(2, 50)]
-tri_bw = []
-ref_bw = []
-def_bw = []
-for N in Ns:
-    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
-    gbps = lambda ms: x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
-    do_bench = lambda fn: gbps(triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-    tri_bw += [do_bench(lambda: softmax(x))]
-    ref_bw += [do_bench(lambda: torch.softmax(x, axis=1))]
-    def_bw += [do_bench(lambda: naive_softmax(x))]
-plt.xlabel('N')
-plt.ylabel('Bandwidth (GB/s)')
-plt.plot(Ns, tri_bw, label='Triton')
-plt.plot(Ns, ref_bw, label='Torch')
-plt.plot(Ns, def_bw, label='Naive')
-plt.legend()
-plt.show()
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['N'],  # argument names to use as an x-axis for the plot
+        x_vals=[256 * i for i in range(2, 50)],  # different possible values for `x_name`
+        y_name='provider',  # argument name whose value corresponds to a different line in the plot
+        y_vals=['torch', 'triton', 'naive'],  # possible keys for `y_name`
+        y_lines=["Torch", "Triton", 'Naive'],  # label name for the lines
+        ylabel="GB/s",  # label name for the y-axis
+        plot_name="softmax-performance",  # name for the plot. Used also as a file name for saving the plot.
+        args={'M': 4096}  # values for function arguments not in `x_names` and `y_name`
+    )
+)
+def benchmark(M, N, provider):
+    x = torch.randn(M, N, device='cuda', dtype=torch.float32)
+    if provider == 'torch':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
+    if provider == 'triton':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))
+    if provider == 'naive':
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))
+    gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+benchmark.run(show_plots=True)
 
 # %%
 # In the above plot, we can see that:
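
Aside: the factor 2 in the new `gbps` lambda reflects that an ideally fused softmax reads the M x N input once and writes the M x N output once (the naive version moves extra temporary data, which is why it plots lower). A quick sketch of the arithmetic with a hypothetical timing:

    M, N = 4096, 1024          # one shape from the sweep above
    elem_bytes = 4             # float32
    ms = 0.5                   # hypothetical kernel time, in milliseconds
    # one read + one write of M*N elements = 2 * M * N * 4 bytes per call
    gbs = 2 * M * N * elem_bytes * 1e-9 / (ms * 1e-3)
    print(f'{gbs:.1f} GB/s')   # ~67.1 GB/s for these made-up numbers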

View File

@@ -79,7 +79,7 @@
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-        "## Benchmarking\nWe can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.\n\n"
+        "## Benchmarking\nWe can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.\nTo make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom op\nfor different problem sizes.\n\n"
     ]
 },
 {
@@ -90,14 +90,25 @@
     },
     "outputs": [],
     "source": [
-        "import matplotlib.pyplot as plt\n\n# There are three tensors of 4N bytes each. So the bandwidth of a given kernel\n# is 12N / time_ms * 1e-6 GB/s\ngbps = lambda N, ms: 12 * N / ms * 1e-6\n# We want to benchmark small and large vector alike\nsizes = [2**i for i in range(12, 25, 1)]\ntriton_bw = []\ntorch_bw = []\nfor N in sizes:\n    x = torch.rand(N, device='cuda', dtype=torch.float32)\n    y = torch.rand(N, device='cuda', dtype=torch.float32)\n    # Triton provide a do_bench utility function that can be used to benchmark\n    # arbitrary workloads. It supports a `warmup` parameter that is used to stabilize\n    # GPU clock speeds as well as a `rep` parameter that controls the number of times\n    # the benchmark is repeated. Importantly, we set `clear_l2 = True` to make sure\n    # that the L2 cache does not contain any element of x before each kernel call when\n    # N is small.\n    do_bench = lambda fn: gbps(N, triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))\n    triton_bw += [do_bench(lambda: add(x, y))]\n    torch_bw += [do_bench(lambda: x + y)]\n# We plot the results as a semi-log\nplt.semilogx(sizes, triton_bw, label='Triton')\nplt.semilogx(sizes, torch_bw, label='Torch')\nplt.legend()\nplt.show()"
+        "@triton.testing.perf_report(\n    triton.testing.Benchmark(\n        x_names=['size'],  # argument names to use as an x-axis for the plot\n        x_vals=[2**i for i in range(12, 28, 1)],  # different possible values for `x_name`\n        x_log=True,  # x axis is logarithmic\n        y_name='provider',  # argument name whose value corresponds to a different line in the plot\n        y_vals=['torch', 'triton'],  # possible keys for `y_name`\n        y_lines=[\"Torch\", \"Triton\"],  # label name for the lines\n        ylabel=\"GB/s\",  # label name for the y-axis\n        plot_name=\"vector-add-performance\",  # name for the plot. Used also as a file name for saving the plot.\n        args={}  # values for function arguments not in `x_names` and `y_name`\n    )\n)\ndef benchmark(size, provider):\n    x = torch.rand(size, device='cuda', dtype=torch.float32)\n    y = torch.rand(size, device='cuda', dtype=torch.float32)\n    if provider == 'torch':\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)\n    if provider == 'triton':\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))\n    gbps = lambda ms: 12 * size / ms * 1e-6\n    return gbps(ms), gbps(max_ms), gbps(min_ms)"
     ]
 },
 {
     "cell_type": "markdown",
     "metadata": {},
     "source": [
-        "Seems like our simple element-wise operation operates at peak bandwidth. While this is a fairly low bar for a custom GPU programming language, this is a good start before we move to more advanced operations.\n"
+        "We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or\n`save_path='/path/to/results/'` to save them to disk along with raw CSV data.\n\n"
+    ]
+},
+{
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+        "collapsed": false
+    },
+    "outputs": [],
+    "source": [
+        "benchmark.run(show_plots=True)"
     ]
 }
 ],
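
Aside: conceptually, a `perf_report`-style decorator sweeps the decorated function over `x_vals` for each entry of `y_vals` and plots the three returned values as one line per provider with lower/upper bounds. A rough sketch of that mechanism; `perf_report_sketch` is a hypothetical name, not Triton's actual implementation:

    import matplotlib.pyplot as plt

    def perf_report_sketch(x_name, x_vals, y_name, y_vals, args):
        # Hypothetical stand-in for triton.testing.perf_report + Benchmark.
        def wrap(fn):
            class Report:
                def run(self, show_plots=False):
                    for y in y_vals:
                        # fn returns (value, lower, upper), e.g. GB/s estimates
                        vals = [fn(**{x_name: x, y_name: y}, **args)[0] for x in x_vals]
                        plt.plot(x_vals, vals, label=y)
                    plt.legend()
                    if show_plots:
                        plt.show()
            return Report()
        return wrap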

Binary file not shown.

Before: 22 KiB  |  After: 30 KiB

Binary file not shown.

Before: 14 KiB  |  After: 20 KiB

Binary file not shown.

Before: 29 KiB  |  After: 31 KiB

Binary file not shown.

Before: 20 KiB  |  After: 21 KiB

View File

@@ -200,63 +200,75 @@ Of course, the first thing that we should check is that whether kernel is correc
 Seems like we're good to go!
 
-.. GENERATED FROM PYTHON SOURCE LINES 147-150
+.. GENERATED FROM PYTHON SOURCE LINES 147-152
 
 Benchmarking
 --------------------------
 We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
+To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom op
+for different problem sizes.
 
-.. GENERATED FROM PYTHON SOURCE LINES 150-178
+.. GENERATED FROM PYTHON SOURCE LINES 152-178
 
 .. code-block:: default
 
-    import matplotlib.pyplot as plt
-
-    # There are three tensors of 4N bytes each. So the bandwidth of a given kernel
-    # is 12N / time_ms * 1e-6 GB/s
-    gbps = lambda N, ms: 12 * N / ms * 1e-6
-    # We want to benchmark small and large vector alike
-    sizes = [2**i for i in range(12, 25, 1)]
-    triton_bw = []
-    torch_bw = []
-    for N in sizes:
-        x = torch.rand(N, device='cuda', dtype=torch.float32)
-        y = torch.rand(N, device='cuda', dtype=torch.float32)
-        # Triton provide a do_bench utility function that can be used to benchmark
-        # arbitrary workloads. It supports a `warmup` parameter that is used to stabilize
-        # GPU clock speeds as well as a `rep` parameter that controls the number of times
-        # the benchmark is repeated. Importantly, we set `clear_l2 = True` to make sure
-        # that the L2 cache does not contain any element of x before each kernel call when
-        # N is small.
-        do_bench = lambda fn: gbps(N, triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-        triton_bw += [do_bench(lambda: add(x, y))]
-        torch_bw += [do_bench(lambda: x + y)]
-    # We plot the results as a semi-log
-    plt.semilogx(sizes, triton_bw, label='Triton')
-    plt.semilogx(sizes, torch_bw, label='Torch')
-    plt.legend()
-    plt.show()
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=['size'],  # argument names to use as an x-axis for the plot
+            x_vals=[2**i for i in range(12, 28, 1)],  # different possible values for `x_name`
+            x_log=True,  # x axis is logarithmic
+            y_name='provider',  # argument name whose value corresponds to a different line in the plot
+            y_vals=['torch', 'triton'],  # possible keys for `y_name`
+            y_lines=["Torch", "Triton"],  # label name for the lines
+            ylabel="GB/s",  # label name for the y-axis
+            plot_name="vector-add-performance",  # name for the plot. Used also as a file name for saving the plot.
+            args={}  # values for function arguments not in `x_names` and `y_name`
+        )
+    )
+    def benchmark(size, provider):
+        x = torch.rand(size, device='cuda', dtype=torch.float32)
+        y = torch.rand(size, device='cuda', dtype=torch.float32)
+        if provider == 'torch':
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
+        if provider == 'triton':
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
+        gbps = lambda ms: 12 * size / ms * 1e-6
+        return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+.. GENERATED FROM PYTHON SOURCE LINES 179-181
+
+We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
+`save_path='/path/to/results/'` to save them to disk along with raw CSV data.
+
+.. GENERATED FROM PYTHON SOURCE LINES 181-181
+
+.. code-block:: default
+
+    benchmark.run(show_plots=True)
 
 .. image:: /getting-started/tutorials/images/sphx_glr_01-vector-add_001.png
-    :alt: 01 vector add
+    :alt: vector-add-performance
     :class: sphx-glr-single-img
 
-.. GENERATED FROM PYTHON SOURCE LINES 179-179
-
-Seems like our simple element-wise operation operates at peak bandwidth. While this is a fairly low bar for a custom GPU programming language, this is a good start before we move to more advanced operations.
-
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes 4.784 seconds)
+   **Total running time of the script:** ( 0 minutes 5.768 seconds)
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:

View File

@@ -250,45 +250,50 @@ Benchmarking
 Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
 We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.
 
-.. GENERATED FROM PYTHON SOURCE LINES 181-204
+.. GENERATED FROM PYTHON SOURCE LINES 181-209
 
 .. code-block:: default
 
-    import matplotlib.pyplot as plt
-
-    M = 4096
-    Ns = [256 * i for i in range(2, 50)]
-    tri_bw = []
-    ref_bw = []
-    def_bw = []
-    for N in Ns:
-        x = torch.randn(M, N, device='cuda', dtype=torch.float32)
-        gbps = lambda ms: x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
-        do_bench = lambda fn: gbps(triton.testing.do_bench(fn, warmup=10, rep=100, clear_l2=True))
-        tri_bw += [do_bench(lambda: softmax(x))]
-        ref_bw += [do_bench(lambda: torch.softmax(x, axis=1))]
-        def_bw += [do_bench(lambda: naive_softmax(x))]
-    plt.xlabel('N')
-    plt.ylabel('Bandwidth (GB/s)')
-    plt.plot(Ns, tri_bw, label='Triton')
-    plt.plot(Ns, ref_bw, label='Torch')
-    plt.plot(Ns, def_bw, label='Naive')
-    plt.legend()
-    plt.show()
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=['N'],  # argument names to use as an x-axis for the plot
+            x_vals=[256 * i for i in range(2, 50)],  # different possible values for `x_name`
+            y_name='provider',  # argument name whose value corresponds to a different line in the plot
+            y_vals=['torch', 'triton', 'naive'],  # possible keys for `y_name`
+            y_lines=["Torch", "Triton", 'Naive'],  # label name for the lines
+            ylabel="GB/s",  # label name for the y-axis
+            plot_name="softmax-performance",  # name for the plot. Used also as a file name for saving the plot.
+            args={'M': 4096}  # values for function arguments not in `x_names` and `y_name`
+        )
+    )
+    def benchmark(M, N, provider):
+        x = torch.randn(M, N, device='cuda', dtype=torch.float32)
+        if provider == 'torch':
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
+        if provider == 'triton':
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))
+        if provider == 'naive':
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))
+        gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
+        return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+    benchmark.run(show_plots=True)
 
 .. image:: /getting-started/tutorials/images/sphx_glr_02-fused-softmax_001.png
-    :alt: 02 fused softmax
+    :alt: softmax-performance
     :class: sphx-glr-single-img
 
-.. GENERATED FROM PYTHON SOURCE LINES 205-210
+.. GENERATED FROM PYTHON SOURCE LINES 210-215
 
 In the above plot, we can see that:
 
@@ -300,7 +305,7 @@ In the above plot, we can see that:
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes 33.773 seconds)
+   **Total running time of the script:** ( 0 minutes 21.653 seconds)
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:

View File

@@ -5,10 +5,10 @@
 Computation times
 =================
-**00:33.773** total execution time for **getting-started_tutorials** files:
+**00:27.420** total execution time for **getting-started_tutorials** files:
 
 +-----------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 00:33.773 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 00:21.653 | 0.0 MB |
 +-----------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)       | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)       | 00:05.768 | 0.0 MB |
 +-----------------------------------------------------------------------------------------+-----------+--------+

View File

@@ -322,38 +322,40 @@ The maximum difference between torch and triton is 0.0
(Rendered HTML of the vector-add tutorial's Benchmarking section; Pygments markup omitted. The intro paragraph gains the two sentences about Triton's built-in plotting utilities, the matplotlib timing loop is replaced by the @triton.testing.perf_report benchmark shown above, and a new paragraph plus code cell for benchmark.run(show_plots=True) is added; the rendered page flags the save_path reference as "problematic" due to the unclosed backtick in the source. The plot's alt text changes from "01 vector add" to "vector-add-performance", and the reported runtime changes from 4.784 to 5.768 seconds.)

View File

@@ -349,30 +349,34 @@ This will allow us to verify that our padding mechanism works.</p>
(Rendered HTML of the fused-softmax tutorial's Benchmarking section; Pygments markup omitted. The matplotlib timing loop is replaced by the @triton.testing.perf_report benchmark shown above plus benchmark.run(show_plots=True), and the plot's alt text changes from "02 fused softmax" to "softmax-performance".)

@@ -382,7 +386,7 @@ This means that when temporary data is too large to fit entirely in the GPU
(The surrounding note that the Triton kernel is not only faster than PyTorch's CUDA kernel but also easier to read, understand and maintain is unchanged; the reported runtime changes from 33.773 to 21.653 seconds.)
View File

@@ -160,7 +160,7 @@
(Rendered HTML of the computation-times page: the total execution time changes from 00:33.773 to 00:27.420.)

@@ -169,11 +169,11 @@
(The table rows change to match the reStructuredText table above: Fused Softmax (02-fused-softmax.py) goes from 00:33.773 to 00:21.653 and Vector Addition (01-vector-add.py) from 00:00.000 to 00:05.768.)
File diff suppressed because one or more lines are too long