[GH-PAGES] Updated website
@@ -295,7 +295,7 @@ for different problem sizes.</p>
 </pre></div>
 </div>
 <img alt="01 vector add" class="sphx-glr-single-img" src="../../_images/sphx_glr_01-vector-add_001.png" />
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 5.812 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 7.044 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>
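(The hunk above only refreshes the recorded runtime of the vector-addition tutorial. For readers skimming this diff, the kernel that page documents reduces to a masked elementwise add. The sketch below assumes the current Triton API with `tl.constexpr` meta-parameters, whereas this site was built against an older API that forwarded `BLOCK` through launch kwargs, so treat names and signatures as illustrative only.)

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
        pid = tl.program_id(0)
        # each program instance handles one BLOCK-sized slice of the tensors
        offsets = pid * BLOCK + tl.arange(0, BLOCK)
        # guard: the last slice may run past the end of the tensors
        mask = offsets < n_elements
        x = tl.load(x_ptr + offsets, mask=mask)
        y = tl.load(y_ptr + offsets, mask=mask)
        tl.store(out_ptr + offsets, x + y, mask=mask)

    def add(x, y):
        out = torch.empty_like(x)
        n = out.numel()
        # 1D launch grid: one program per block of 1024 elements
        add_kernel[(triton.cdiv(n, 1024),)](x, y, out, n, BLOCK=1024)
        return out

The mask is what lets the launch grid over-provision: the final program instance simply ignores lanes past `n_elements`.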
@@ -280,7 +280,7 @@ so we need to internally “pad” tiles and guard the memory operations properl
     <span class="c1"># Allocate output</span>
     <span class="n">y</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
     <span class="c1"># Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix</span>
-    <span class="n">_softmax</span><span class="p">[(</span><span class="n">M</span><span class="p">,</span> <span class="p">)](</span><span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">y</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">BLOCK</span><span class="o">=</span><span class="n">BLOCK</span><span class="p">)</span>
+    <span class="n">_softmax</span><span class="p">[(</span><span class="n">M</span><span class="p">,</span> <span class="p">)](</span><span class="n">y</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">y</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="n">num_warps</span><span class="p">,</span> <span class="n">BLOCK</span><span class="o">=</span><span class="n">BLOCK</span><span class="p">)</span>
     <span class="k">return</span> <span class="n">y</span>
 </pre></div>
 </div>
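(The change above threads a `num_warps` argument through the `_softmax` launch. The hunk context mentions internally "padding" tiles and guarding memory operations; the sketch below reconstructs what such a kernel plausibly looks like from the call signature `_softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, ...)` alone, in the modern `tl.constexpr` style. Every detail is an assumption, not the tutorial's actual code.)

    @triton.jit
    def _softmax(Y, X, stride_xm, stride_ym, M, N, BLOCK: tl.constexpr):
        # one program instance per row; BLOCK is a power of two >= N
        m = tl.program_id(0)
        n = tl.arange(0, BLOCK)
        # guard: the tile is "padded" past N, so mask the out-of-bounds lanes
        mask = n < N
        x = tl.load(X + m * stride_xm + n, mask=mask, other=-float('inf'))
        # numerically stable softmax: subtract the row max before exponentiating
        z = x - tl.max(x, axis=0)
        num = tl.exp(z)
        den = tl.sum(num, axis=0)
        tl.store(Y + m * stride_ym + n, num / den, mask=mask)

Forwarding a larger `num_warps` for wide rows gives the compiler more threads to parallelize the row-wise reductions, which is presumably why the launch now passes it through.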
@@ -343,7 +343,7 @@ This means that – when temporary data is too large to fit entirely in the GPU
 Note that our Triton kernel is not only faster than PyTorch’s CUDA kernel, it is also <strong>easier to read, understand and maintain</strong>.</p></li>
 </ul>
 </div></blockquote>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 20.767 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 20.176 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>
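(The fused-softmax page compares against PyTorch's softmax; a minimal way to reproduce such a comparison is `triton.testing.do_bench`, which does exist in `triton.testing`, though its return convention, mean versus median milliseconds, has varied across versions. The shape and the `softmax` wrapper name below are assumptions:)

    x = torch.randn(1823, 781, device='cuda')   # hypothetical irregular shape
    ms_torch = triton.testing.do_bench(lambda: torch.softmax(x, dim=1))
    ms_triton = triton.testing.do_bench(lambda: softmax(x))  # wrapper sketched above
    print(f'torch: {ms_torch:.3f} ms, triton: {ms_triton:.3f} ms')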
@@ -406,32 +406,32 @@ and (1) checks any shape constraint; (2) allocates the output; (3) launches the
 </pre></div>
 </div>
 <p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-         -3.4237e-04, -0.0000e+00],
-        [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-         -0.0000e+00, -0.0000e+00],
-        [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-          1.6578e+01,  2.1031e+01],
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>tensor([[-5.9605e-08,  5.1094e+01, -1.8477e-05,  ...,  2.6547e+01,
+         -7.2598e-05, -4.2510e-04],
+        [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+         -1.8644e-04,  1.3094e+01],
+        [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+          4.3375e+01,  1.6865e+00],
         ...,
-        [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-          3.5469e+01,  1.5602e+01],
-        [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-          1.7412e+00,  1.1422e+01],
-        [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1248e-03,
-         -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
-tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-         -3.4261e-04, -0.0000e+00],
-        [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-         -0.0000e+00, -0.0000e+00],
-        [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-          1.6578e+01,  2.1031e+01],
+        [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+          4.1133e+00, -0.0000e+00],
+        [ 1.6562e+01, -8.1539e-04,  1.3836e+01,  ...,  1.9844e+00,
+         -1.1238e-02,  8.4375e+00],
+        [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6907e-02,
+         -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
+tensor([[-5.9605e-08,  5.1094e+01, -1.8537e-05,  ...,  2.6547e+01,
+         -7.2658e-05, -4.2605e-04],
+        [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+         -1.8632e-04,  1.3094e+01],
+        [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+          4.3375e+01,  1.6875e+00],
         ...,
-        [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-          3.5469e+01,  1.5602e+01],
-        [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-          1.7412e+00,  1.1422e+01],
-        [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1324e-03,
-         -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
+        [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+          4.1133e+00, -0.0000e+00],
+        [ 1.6562e+01, -8.1778e-04,  1.3836e+01,  ...,  1.9844e+00,
+         -1.1238e-02,  8.4375e+00],
+        [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6891e-02,
+         -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
 tensor(True, device='cuda:0')
 </pre></div>
 </div>
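(The hunk header above refers to the host-side wrapper that (1) checks shape constraints, (2) allocates the output, and (3) launches the kernel. A hedged sketch of that three-step structure follows; the kernel name `_matmul`, its parameter order, and the tile sizes are all hypothetical:)

    def matmul(a, b):
        # (1) check shape constraints
        assert a.shape[1] == b.shape[0], "incompatible dimensions"
        M, K = a.shape
        _, N = b.shape
        # (2) allocate the output
        c = torch.empty((M, N), device=a.device, dtype=a.dtype)
        # (3) launch the kernel on a 2D grid of 128x128 output tiles
        grid = (triton.cdiv(M, 128), triton.cdiv(N, 128))
        _matmul[grid](a, b, c, M, N, K,
                      a.stride(0), a.stride(1),
                      b.stride(0), b.stride(1),
                      c.stride(0), c.stride(1),
                      BLOCK_M=128, BLOCK_N=128, BLOCK_K=32)
        return c

Passing the strides explicitly keeps a kernel like this correct for non-contiguous inputs, and the `tensor(True, ...)` line in the output above is the `torch.allclose`-style check against cuBLAS passing.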
@@ -472,39 +472,39 @@ tensor(True, device='cuda:0')
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>         M      cuBLAS      Triton
  0   512.0   20.164923   15.420235
- 1   768.0   58.982401   42.130286
+ 1   768.0   58.982401   40.215272
  2  1024.0   91.180520   72.315584
  3  1280.0  157.538463  117.028568
- 4  1536.0  150.593357  147.455995
- 5  1792.0  212.064605  193.783168
- 6  2048.0  197.379013  151.146088
- 7  2304.0  243.753804  179.608068
- 8  2560.0  237.449270  217.006622
- 9  2816.0  233.231062  200.987140
+ 4  1536.0  153.867127  144.446699
+ 5  1792.0  208.137481  190.498706
+ 6  2048.0  199.728763  152.520144
+ 7  2304.0  246.266731  178.267699
+ 8  2560.0  235.741014  215.578957
+ 9  2816.0  231.990461  198.246398
 10  3072.0  236.916752  221.184001
-11  3328.0  234.499328  210.500857
+11  3328.0  239.173747  210.500857
 12  3584.0  248.385067  230.552287
-13  3840.0  252.493157  223.418188
-14  4096.0  263.689066  244.922869
-15  4352.0  247.295210  231.639115
-16  4608.0  274.573240  254.803966
-17  4864.0  266.298229  245.366501
-18  5120.0  259.548513  238.312729
-19  5376.0  252.676487  237.081606
-20  5632.0  270.685535  249.046163
-21  5888.0  264.382140  242.069377
-22  6144.0  262.447761  240.565495
-23  6400.0  257.028108  235.078047
-24  6656.0  254.386204  232.699140
-25  6912.0  252.040861  232.926171
-26  7168.0  253.193644  231.815375
-27  7424.0  251.789150  232.860938
-28  7680.0  250.988932  231.727608
-29  7936.0  253.622108  232.094986
-30  8192.0  253.121589  231.859598
+13  3840.0  251.917998  222.519114
+14  4096.0  263.172024  244.032234
+15  4352.0  249.595626  232.307632
+16  4608.0  276.560014  254.803966
+17  4864.0  266.614125  245.366501
+18  5120.0  257.003930  238.096276
+19  5376.0  252.676487  236.527241
+20  5632.0  270.057027  248.514009
+21  5888.0  264.206935  242.511113
+22  6144.0  259.441481  241.205983
+23  6400.0  257.157204  235.078047
+24  6656.0  254.161678  232.699140
+25  6912.0  251.844029  233.178785
+26  7168.0  253.282797  231.740709
+27  7424.0  251.868505  230.377264
+28  7680.0  250.988932  231.606284
+29  7936.0  253.293068  229.692102
+30  8192.0  253.002304  231.360005
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 36.230 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 32.933 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
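(The M column above sweeps 512 to 8192 in steps of 256. This is the kind of table `triton.testing.perf_report` prints; the sketch below shows that mechanism with hypothetical details, and it assumes the reported numbers are TFLOP/s, which matches their magnitude on contemporary GPUs but is not stated on this page:)

    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=['M'],                            # the M column of the table
            x_vals=[512 + 256 * i for i in range(31)],
            line_arg='provider',
            line_vals=['cublas', 'triton'],
            line_names=['cuBLAS', 'Triton'],
            plot_name='matmul-performance',
            args={},
        )
    )
    def benchmark(M, provider):
        a = torch.randn((M, M), device='cuda', dtype=torch.float16)
        b = torch.randn((M, M), device='cuda', dtype=torch.float16)
        if provider == 'cublas':
            ms = triton.testing.do_bench(lambda: torch.matmul(a, b))
        else:
            ms = triton.testing.do_bench(lambda: matmul(a, b))  # wrapper sketched earlier
        # a square matmul performs 2*M^3 FLOPs; convert milliseconds to TFLOP/s
        return 2 * M ** 3 * 1e-12 / (ms * 1e-3)

    benchmark.run(print_data=True)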
@@ -169,7 +169,7 @@
 
 <div class="section" id="computation-times">
 <span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>00:36.230</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+<p><strong>01:00.154</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -178,15 +178,15 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
-<td><p>00:36.230</p></td>
+<td><p>00:32.933</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
-<td><p>00:00.000</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
+<td><p>00:20.176</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-odd"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
-<td><p>00:00.000</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
+<td><p>00:07.044</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>