[GH-PAGES] Updated website

This commit is contained in:
Philippe Tillet
2021-03-19 16:19:37 -04:00
parent 4d1c282eb2
commit 3db1455cda
51 changed files with 3981 additions and 94 deletions

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../_static/css/custom.css" type="text/css" />
@@ -100,6 +101,12 @@
</li>
<li class="toctree-l1"><a class="reference internal" href="tutorials/index.html">Tutorials</a></li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -104,6 +105,12 @@
</ul>
</li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>
@@ -355,7 +362,7 @@ for different problem sizes.</p>
</pre></div>
</div>
<img alt="vector-add-performance" class="sphx-glr-single-img" src="../../_images/sphx_glr_01-vector-add_001.png" />
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 7.521 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 8.442 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -106,6 +107,12 @@
</ul>
</li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>
@@ -397,7 +404,7 @@ This means that when temporary data is too large to fit entirely in the GPU
Note that our Triton kernel is not only faster than PyTorchs CUDA kernel, it is also <strong>easier to read, understand and maintain</strong>.</p></li>
</ul>
</div></blockquote>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 19.896 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 20.299 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -42,6 +43,7 @@
<link rel="index" title="Index" href="../../genindex.html" />
<link rel="search" title="Search" href="../../search.html" />
<link rel="next" title="Introduction" href="../../programming-guide/introduction.html" />
<link rel="prev" title="Fused Softmax" href="02-fused-softmax.html" />
</head>
@@ -117,6 +119,12 @@
</ul>
</li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>
@@ -347,14 +355,46 @@ If <code class="code docutils literal notranslate"><span class="pre">TYPE</span>
<span class="kn">import</span> <span class="nn">triton</span>
<span class="n">autotune_configs</span> <span class="o">=</span> <span class="p">[</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s2">&quot;MB&quot;</span><span class="p">:</span> <span class="s2">&quot;128&quot;</span><span class="p">,</span> <span class="s2">&quot;NB&quot;</span><span class="p">:</span> <span class="s2">&quot;128&quot;</span><span class="p">,</span> <span class="s2">&quot;KB&quot;</span><span class="p">:</span> <span class="s2">&quot;32&quot;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span> <span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span> <span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s2">&quot;MB&quot;</span><span class="p">:</span> <span class="s2">&quot;128&quot;</span><span class="p">,</span>
<span class="s2">&quot;NB&quot;</span><span class="p">:</span> <span class="s2">&quot;128&quot;</span><span class="p">,</span>
<span class="s2">&quot;KB&quot;</span><span class="p">:</span> <span class="s2">&quot;32&quot;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;128&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
<span class="n">triton</span><span class="o">.</span><span class="n">config</span><span class="p">(</span><span class="n">defines</span><span class="o">=</span><span class="p">{</span>
<span class="s1">&#39;MB&#39;</span><span class="p">:</span> <span class="s1">&#39;32&#39;</span><span class="p">,</span>
<span class="s1">&#39;NB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span><span class="p">,</span>
<span class="s1">&#39;KB&#39;</span><span class="p">:</span> <span class="s1">&#39;64&#39;</span>
<span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">)</span>
<span class="p">]</span>
</pre></div>
</div>
@@ -450,21 +490,21 @@ Note that we need to modify the :code`atol` and <code class="code docutils liter
</pre></div>
</div>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[186.7500, 195.3750, 196.1250, ..., 197.0000, 199.1250, 200.1250],
[181.8750, 181.1250, 187.2500, ..., 191.5000, 192.3750, 185.1250],
[183.0000, 192.7500, 194.3750, ..., 200.3750, 195.1250, 193.5000],
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[199.0000, 199.1250, 195.8750, ..., 190.6250, 200.7500, 186.3750],
[196.1250, 201.6250, 197.6250, ..., 189.6250, 197.7500, 190.0000],
[198.0000, 196.6250, 200.1250, ..., 198.6250, 199.7500, 190.8750],
...,
[176.1250, 183.0000, 182.1250, ..., 184.7500, 190.8750, 187.5000],
[182.0000, 181.8750, 183.2500, ..., 187.8750, 190.5000, 186.2500],
[173.0000, 182.3750, 187.2500, ..., 191.2500, 187.6250, 184.5000]],
[190.3750, 192.0000, 190.5000, ..., 187.0000, 191.7500, 180.8750],
[185.2500, 187.6250, 181.2500, ..., 185.1250, 188.2500, 175.5000],
[191.6250, 191.6250, 194.2500, ..., 188.2500, 192.1250, 182.0000]],
device=&#39;cuda:0&#39;, dtype=torch.float16)
tensor([[186.7500, 195.3750, 196.1250, ..., 197.0000, 199.1250, 200.1250],
[181.8750, 181.1250, 187.2500, ..., 191.5000, 192.3750, 185.1250],
[183.0000, 192.7500, 194.3750, ..., 200.3750, 195.1250, 193.5000],
tensor([[199.0000, 199.1250, 195.8750, ..., 190.6250, 200.7500, 186.3750],
[196.1250, 201.6250, 197.6250, ..., 189.6250, 197.7500, 190.0000],
[198.0000, 196.6250, 200.1250, ..., 198.6250, 199.7500, 190.8750],
...,
[176.1250, 183.0000, 182.1250, ..., 184.7500, 190.8750, 187.5000],
[182.0000, 181.8750, 183.2500, ..., 187.8750, 190.5000, 186.2500],
[173.0000, 182.3750, 187.2500, ..., 191.2500, 187.6250, 184.5000]],
[190.3750, 192.0000, 190.5000, ..., 187.0000, 191.7500, 180.8750],
[185.2500, 187.6250, 181.2500, ..., 185.1250, 188.2500, 175.5000],
[191.6250, 191.6250, 194.2500, ..., 188.2500, 192.1250, 182.0000]],
device=&#39;cuda:0&#39;, dtype=torch.float16)
True
</pre></div>
@@ -478,7 +518,7 @@ True
For this reason, we will instead compare the performance of our kernel against <a class="reference external" href="https://github.com/NVIDIA/cutlass/">CUTLASS</a> , a highly optimized CUDA library for matrix multiplication written by NVIDIA themselves._
To install CUTLASS, you need a recent version of cmake:</p>
<blockquote>
<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span> /path/to/cutlass/
<div><div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span> /tmp/
git clone https://github.com/NVIDIA/cutlass.git
<span class="nb">cd</span> cutlass
mkdir build
@@ -506,7 +546,7 @@ make -j8 install
Triton comes with some basic Python bindings for benchmarking CUTLASS. These will be compiled when the environment variables <code class="code docutils literal notranslate"><span class="pre">CUTLASS_INCLUDE_DIR</span></code> and <code class="code docutils literal notranslate"><span class="pre">CUTLASS_LIBRARY_DIR</span></code> are set during the installation process.
To re-install Triton with the updated CUTLASS bindings, run the following command:</p>
<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">export</span> <span class="nv">CUTLASS_INCLUDE_DIR</span><span class="o">=</span>/tmp/cutlass/build/install/include/
<span class="nb">export</span> <span class="nv">CUTLASS_LIBRARY_DIR</span><span class="o">=</span>/tmp/cutlass/build/install/lib/a
<span class="nb">export</span> <span class="nv">CUTLASS_LIBRARY_DIR</span><span class="o">=</span>/tmp/cutlass/build/install/lib/
pip uninstall -y triton
pip install -e <span class="s2">&quot;git+https://github.com/ptillet/triton.git#egg=triton&amp;subdirectory=python&quot;</span>
</pre></div>
@@ -519,13 +559,13 @@ pip install -e <span class="s2">&quot;git+https://github.com/ptillet/triton.git#
</pre></div>
</div>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[186.7500, 195.3750, 196.1250, ..., 197.0000, 199.1250, 200.1250],
[181.8750, 181.1250, 187.2500, ..., 191.5000, 192.3750, 185.1250],
[183.0000, 192.7500, 194.3750, ..., 200.3750, 195.1250, 193.5000],
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[199.0000, 199.1250, 195.8750, ..., 190.6250, 200.7500, 186.3750],
[196.1250, 201.6250, 197.6250, ..., 189.6250, 197.7500, 190.0000],
[198.0000, 196.6250, 200.1250, ..., 198.6250, 199.7500, 190.8750],
...,
[176.1250, 183.0000, 182.1250, ..., 184.7500, 190.8750, 187.5000],
[182.0000, 181.8750, 183.2500, ..., 187.8750, 190.5000, 186.2500],
[173.0000, 182.3750, 187.2500, ..., 191.2500, 187.6250, 184.5000]],
[190.3750, 192.0000, 190.5000, ..., 187.0000, 191.7500, 180.8750],
[185.2500, 187.6250, 181.2500, ..., 185.1250, 188.2500, 175.5000],
[191.6250, 191.6250, 194.2500, ..., 188.2500, 192.1250, 182.0000]],
device=&#39;cuda:0&#39;, dtype=torch.float16)
True
</pre></div>
@@ -565,7 +605,7 @@ True
</div>
<img alt="matmul-performance" class="sphx-glr-single-img" src="../../_images/sphx_glr_03-matrix-multiplication_001.png" />
<p>As we can see, the performance of our kernel is pretty good. It is in fact faster than CUTLASS, and therefore probably comparable to the absolute best CUDA code an expert could write.</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 10.181 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 10.094 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
@@ -585,6 +625,7 @@ True
</div>
<footer>
<div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
<a href="../../programming-guide/introduction.html" class="btn btn-neutral float-right" title="Introduction" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
<a href="02-fused-softmax.html" class="btn btn-neutral float-left" title="Fused Softmax" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
</div>

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -98,6 +99,12 @@
</ul>
</li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>

View File

@@ -17,6 +17,7 @@
<link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
<link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
<link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
@@ -91,6 +92,12 @@
<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
<li class="toctree-l1"><a class="reference internal" href="index.html">Tutorials</a></li>
</ul>
<p class="caption"><span class="caption-text">Programming Guide</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/introduction.html">Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/related-work.html">Related Work</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/triton-c.html">The Triton-C Language</a></li>
</ul>
@@ -159,7 +166,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline"></a></h1>
<p><strong>01:10.181</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
<p><strong>01:10.094</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -168,7 +175,7 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
<td><p>01:10.181</p></td>
<td><p>01:10.094</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>