diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index e181fe4e0..993842d5a 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index 4cedcffca..ec2149ddf 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index d0d6ac944..45631f2c0 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index 04ba50ca1..51c8c2f86 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index f29fc72c8..4252df496 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index 03d198f22..1cf3684f3 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index c45098216..64fde06bc 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index d06755853..f4cf96f77 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index 6dd9ed359..6eec75843 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -231,7 +231,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
 
     vector-add-performance:
                size      Triton       Torch
-    0        4096.0    9.540372    9.600000
+    0        4096.0    9.600000    9.600000
     1        8192.0   19.200000   19.200000
     2       16384.0   38.400001   38.400001
     3       32768.0   76.800002   76.800002
@@ -246,7 +246,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
     12   16777216.0  833.084721  833.084721
     13   33554432.0  843.811163  843.811163
     14   67108864.0  848.362445  848.362445
-    15  134217728.0  851.116890  850.656574
+    15  134217728.0  851.577704  850.656574
 
 
 
@@ -254,7 +254,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  10.964 seconds)
+   **Total running time of the script:** ( 0 minutes  11.019 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index ec10c89db..1185996ff 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -306,11 +306,11 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
     3     640.0  682.666684      640.000002   160.000000
     4     768.0  702.171410      664.216187   163.839992
     ..      ...         ...             ...          ...
-    93  12160.0  812.359066      405.755985   199.038365
-    94  12288.0  812.429770      415.661740   199.298541
-    95  12416.0  810.840807      412.149375   198.755369
-    96  12544.0  810.925276      412.971190   199.111113
-    97  12672.0  811.007961      412.097543   199.167004
+    93  12160.0  812.359066      406.179533   198.733401
+    94  12288.0  812.429770      416.101597   199.096718
+    95  12416.0  810.840807      412.149375   198.655991
+    96  12544.0  810.925276      412.546756   198.913776
+    97  12672.0  811.007961      412.097543   198.873965
 
     [98 rows x 4 columns]
 
@@ -328,7 +328,7 @@ In the above plot, we can see that:
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  12.596 seconds)
+   **Total running time of the script:** ( 1 minutes  12.589 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 65942683b..e50b4bf85 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -462,37 +462,37 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
 
     matmul-performance:
              M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
-    0    256.0   2.978909  ...   3.276800              2.978909
-    1    384.0   7.372800  ...   7.899428              7.899428
-    2    512.0  14.563555  ...  16.384000             15.420235
+    0    256.0   2.978909  ...   2.978909              2.978909
+    1    384.0   7.372800  ...   8.507077              8.507077
+    2    512.0  14.563555  ...  16.384000             16.384000
     3    640.0  22.260869  ...  24.380953             24.380953
     4    768.0  32.768000  ...  34.028308             34.028308
-    5    896.0  37.971025  ...  39.025776             39.025776
+    5    896.0  39.025776  ...  39.025776             35.994954
     6   1024.0  49.932191  ...  52.428801             52.428801
     7   1152.0  44.566925  ...  46.656000             46.656000
     8   1280.0  51.200001  ...  56.888887             56.888887
     9   1408.0  64.138541  ...  63.392744             62.664092
-    10  1536.0  79.526831  ...  75.296679             76.106321
-    11  1664.0  62.929456  ...  62.061463             62.061463
+    10  1536.0  80.430545  ...  76.933564             75.296679
+    11  1664.0  63.372618  ...  62.061463             62.061463
     12  1792.0  72.983276  ...  62.441243             62.441243
-    13  1920.0  69.120002  ...  68.776119             68.776119
-    14  2048.0  73.584279  ...  74.235468             74.565406
-    15  2176.0  83.155572  ...  80.173899             81.143743
-    16  2304.0  68.056616  ...  73.275679             73.275679
-    17  2432.0  71.125224  ...  72.037087             82.630777
-    18  2560.0  78.019048  ...  76.560748             76.027843
-    19  2688.0  84.108772  ...  81.752274             81.401408
-    20  2816.0  83.552120  ...  78.726003             79.298560
-    21  2944.0  80.510553  ...  78.112900             75.853930
-    22  3072.0  81.825298  ...  83.146995             81.589488
-    23  3200.0  84.768213  ...  86.253369             90.014065
-    24  3328.0  83.226931  ...  81.622783             86.632127
-    25  3456.0  81.600781  ...  83.980802             84.068369
-    26  3584.0  87.211821  ...  85.797134             85.308722
-    27  3712.0  85.822459  ...  79.472826             78.721311
-    28  3840.0  83.465663  ...  86.197974             86.265212
-    29  3968.0  93.219206  ...  86.849777             87.222259
-    30  4096.0  93.206754  ...  89.958266             89.777746
+    13  1920.0  69.120002  ...  70.530615             70.172588
+    14  2048.0  73.908442  ...  74.898285             74.565406
+    15  2176.0  83.500614  ...  81.143743             78.916269
+    16  2304.0  68.056616  ...  73.501144             72.828879
+    17  2432.0  71.125224  ...  82.388456             81.433227
+    18  2560.0  77.833728  ...  77.283019             75.851852
+    19  2688.0  81.752274  ...  83.369354             80.708630
+    20  2816.0  83.392363  ...  79.587973             78.584162
+    21  2944.0  79.865439  ...  78.605729             77.747321
+    22  3072.0  80.890151  ...  83.638266             83.025078
+    23  3200.0  81.528664  ...  90.140846             86.603520
+    24  3328.0  83.034941  ...  87.262177             82.558825
+    25  3456.0  79.273916  ...  80.460651             79.040756
+    26  3584.0  87.466332  ...  93.080114             85.876512
+    27  3712.0  85.091436  ...  82.698613             81.950243
+    28  3840.0  81.019778  ...  86.063813             86.197974
+    29  3968.0  93.219206  ...  87.850207             86.973584
+    30  4096.0  93.727466  ...  84.894196             90.018600
 
     [31 rows x 5 columns]
 
@@ -502,7 +502,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 2 minutes  5.162 seconds)
+   **Total running time of the script:** ( 1 minutes  59.549 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
index ed4f6bc5d..857686c85 100644
--- a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
+++ b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
@@ -238,7 +238,7 @@ References
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  0.187 seconds)
+   **Total running time of the script:** ( 0 minutes  0.185 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index cd89aec28..496f93d9b 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**03:28.909** total execution time for **getting-started_tutorials** files:
+**03:23.342** total execution time for **getting-started_tutorials** files:
 
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 02:05.162 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 01:59.549 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 01:12.596 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 01:12.589 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 00:10.964 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 00:11.019 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.187 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.185 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 5678da270..931589693 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -320,7 +320,7 @@ for different problem sizes.</p>
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>vector-add-performance:
            size      Triton       Torch
-0        4096.0    9.540372    9.600000
+0        4096.0    9.600000    9.600000
 1        8192.0   19.200000   19.200000
 2       16384.0   38.400001   38.400001
 3       32768.0   76.800002   76.800002
@@ -335,10 +335,10 @@ for different problem sizes.</p>
 12   16777216.0  833.084721  833.084721
 13   33554432.0  843.811163  843.811163
 14   67108864.0  848.362445  848.362445
-15  134217728.0  851.116890  850.656574
+15  134217728.0  851.577704  850.656574
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  10.964 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  11.019 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
index ef8c8073d..33973f34a 100644
--- a/getting-started/tutorials/02-fused-softmax.html
+++ b/getting-started/tutorials/02-fused-softmax.html
@@ -392,11 +392,11 @@ We will then compare its performance against (1) <code class="code docutils lite
 3     640.0  682.666684      640.000002   160.000000
 4     768.0  702.171410      664.216187   163.839992
 ..      ...         ...             ...          ...
-93  12160.0  812.359066      405.755985   199.038365
-94  12288.0  812.429770      415.661740   199.298541
-95  12416.0  810.840807      412.149375   198.755369
-96  12544.0  810.925276      412.971190   199.111113
-97  12672.0  811.007961      412.097543   199.167004
+93  12160.0  812.359066      406.179533   198.733401
+94  12288.0  812.429770      416.101597   199.096718
+95  12416.0  810.840807      412.149375   198.655991
+96  12544.0  810.925276      412.546756   198.913776
+97  12672.0  811.007961      412.097543   198.873965
 
 [98 rows x 4 columns]
 </pre></div>
@@ -409,7 +409,7 @@ We will then compare its performance against (1) <code class="code docutils lite
 Note however that the PyTorch <cite>softmax</cite> operation is more general and will works on tensors of any shape.</p></li>
 </ul>
 </div></blockquote>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  12.596 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  12.589 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>
diff --git a/getting-started/tutorials/03-matrix-multiplication.html b/getting-started/tutorials/03-matrix-multiplication.html
index dcfadf337..a45f35ccd 100644
--- a/getting-started/tutorials/03-matrix-multiplication.html
+++ b/getting-started/tutorials/03-matrix-multiplication.html
@@ -567,42 +567,42 @@ torch_output=tensor([[  1.1045, -36.9688,  31.4688,  ..., -11.3906,  24.4531, -3
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
          M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
-0    256.0   2.978909  ...   3.276800              2.978909
-1    384.0   7.372800  ...   7.899428              7.899428
-2    512.0  14.563555  ...  16.384000             15.420235
+0    256.0   2.978909  ...   2.978909              2.978909
+1    384.0   7.372800  ...   8.507077              8.507077
+2    512.0  14.563555  ...  16.384000             16.384000
 3    640.0  22.260869  ...  24.380953             24.380953
 4    768.0  32.768000  ...  34.028308             34.028308
-5    896.0  37.971025  ...  39.025776             39.025776
+5    896.0  39.025776  ...  39.025776             35.994954
 6   1024.0  49.932191  ...  52.428801             52.428801
 7   1152.0  44.566925  ...  46.656000             46.656000
 8   1280.0  51.200001  ...  56.888887             56.888887
 9   1408.0  64.138541  ...  63.392744             62.664092
-10  1536.0  79.526831  ...  75.296679             76.106321
-11  1664.0  62.929456  ...  62.061463             62.061463
+10  1536.0  80.430545  ...  76.933564             75.296679
+11  1664.0  63.372618  ...  62.061463             62.061463
 12  1792.0  72.983276  ...  62.441243             62.441243
-13  1920.0  69.120002  ...  68.776119             68.776119
-14  2048.0  73.584279  ...  74.235468             74.565406
-15  2176.0  83.155572  ...  80.173899             81.143743
-16  2304.0  68.056616  ...  73.275679             73.275679
-17  2432.0  71.125224  ...  72.037087             82.630777
-18  2560.0  78.019048  ...  76.560748             76.027843
-19  2688.0  84.108772  ...  81.752274             81.401408
-20  2816.0  83.552120  ...  78.726003             79.298560
-21  2944.0  80.510553  ...  78.112900             75.853930
-22  3072.0  81.825298  ...  83.146995             81.589488
-23  3200.0  84.768213  ...  86.253369             90.014065
-24  3328.0  83.226931  ...  81.622783             86.632127
-25  3456.0  81.600781  ...  83.980802             84.068369
-26  3584.0  87.211821  ...  85.797134             85.308722
-27  3712.0  85.822459  ...  79.472826             78.721311
-28  3840.0  83.465663  ...  86.197974             86.265212
-29  3968.0  93.219206  ...  86.849777             87.222259
-30  4096.0  93.206754  ...  89.958266             89.777746
+13  1920.0  69.120002  ...  70.530615             70.172588
+14  2048.0  73.908442  ...  74.898285             74.565406
+15  2176.0  83.500614  ...  81.143743             78.916269
+16  2304.0  68.056616  ...  73.501144             72.828879
+17  2432.0  71.125224  ...  82.388456             81.433227
+18  2560.0  77.833728  ...  77.283019             75.851852
+19  2688.0  81.752274  ...  83.369354             80.708630
+20  2816.0  83.392363  ...  79.587973             78.584162
+21  2944.0  79.865439  ...  78.605729             77.747321
+22  3072.0  80.890151  ...  83.638266             83.025078
+23  3200.0  81.528664  ...  90.140846             86.603520
+24  3328.0  83.034941  ...  87.262177             82.558825
+25  3456.0  79.273916  ...  80.460651             79.040756
+26  3584.0  87.466332  ...  93.080114             85.876512
+27  3712.0  85.091436  ...  82.698613             81.950243
+28  3840.0  81.019778  ...  86.063813             86.197974
+29  3968.0  93.219206  ...  87.850207             86.973584
+30  4096.0  93.727466  ...  84.894196             90.018600
 
 [31 rows x 5 columns]
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes  5.162 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  59.549 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
diff --git a/getting-started/tutorials/04-low-memory-dropout.html b/getting-started/tutorials/04-low-memory-dropout.html
index 9c8854daf..54aea39ec 100644
--- a/getting-started/tutorials/04-low-memory-dropout.html
+++ b/getting-started/tutorials/04-low-memory-dropout.html
@@ -370,7 +370,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
 <dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
 </dd>
 </dl>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.187 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.185 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>
diff --git a/getting-started/tutorials/sg_execution_times.html b/getting-started/tutorials/sg_execution_times.html
index f2c02202d..2d556c6f8 100644
--- a/getting-started/tutorials/sg_execution_times.html
+++ b/getting-started/tutorials/sg_execution_times.html
@@ -174,7 +174,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>03:28.909</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+<p><strong>03:23.342</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -183,19 +183,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
-<td><p>02:05.162</p></td>
+<td><p>01:59.549</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
-<td><p>01:12.596</p></td>
+<td><p>01:12.589</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
-<td><p>00:10.964</p></td>
+<td><p>00:11.019</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
-<td><p>00:00.187</p></td>
+<td><p>00:00.185</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/searchindex.js b/searchindex.js
index 9c9be0ddb..ad6c5e101 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":2,"000002":2,"007961":2,"01":[1,3,6],"014065":3,"019048":3,"02":[2,6],"025776":3,"027843":3,"028308":3,"03":[3,6],"037087":3,"038365":2,"04":[4,6],"05":6,"056616":3,"061463":3,"0625":3,"068369":3,"08199":4,"08452":4,"084721":1,"0938":3,"097543":2,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4,6],"100":[2,48],"1024":[1,3,4,11],"1045":3,"1048576":1,"106321":3,"106434":4,"108772":3,"11":[0,1,3],"111113":2,"112900":3,"1152":3,"116890":1,"12":[1,2,3,6],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"125224":3,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"1408":3,"142862":2,"143743":3,"146995":3,"149375":2,"149397":4,"15":[1,3],"153":2,"1536":3,"153853":2,"154":2,"155572":3,"16":[2,3,9,46],"160":2,"162":[3,6],"163":2,"16384":1,"1664":3,"167004":2,"16777216":1,"17":3,"171410":2,"173899":3,"17879":4,"1792":3,"18":3,"181817":2,"1823":2,"186":2,"187":[4,6],"19":[1,3],"190482":1,"192":1,"1920":3,"197974":3,"198":2,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"206754":3,"2097152":1,"21":3,"211821":3,"212868":4,"2141":1,"214186":4,"216187":2,"2176":3,"219":1,"219206":3,"22":3,"220":3,"222259":3,"226931":3,"23":3,"2304":3,"235468":3,"24":3,"2432":3,"245":3,"25":[3,48],"253369":3,"256":[1,2,3,10],"2560":3,"26":3,"260869":3,"262144":1,"265212":3,"2656":3,"2688":3,"27":3,"275679":3,"276800":3,"28":[1,3,6],"2812":3,"2816":3,"2891":3,"29":3,"293429":4,"2944":3,"296679":3,"298541":2,"298560":3,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"3072":3,"3076":1,"308722":3,"31":3,"3125":3,"32":[3,10],"3200":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"3477":3,"3516":3,"3555":3,"3584":3,"359066":2,"36":3,"362445":1,"37":3,"3712":3,"3713":1,"371721":4,"372800":3,"38":1,"380953":3,"384":[2,3],"3840":3,"384000":3,"39":3,"3906":3,"392744":3,"3968":3,"3984":3,"3986":4,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"401408":3,"4023":3,"403344":4,"403347":4,"405":2,"4062":3,"408716":4,"4096":[1,2,3],"412":2,"415":2,"4194304":1,"420235":3,"42142":4,"428568":1,"428801":3,"429770":[1,2],"431969":4,"44":3,"441243":3,"4492":3,"4531":3,"46":3,"4609":3,"465663":3,"4688":3,"472":1,"472826":3,"49":3,"4940":1,"4m":2,"4x":2,"5":[1,3,4,9],"5000":3,"51":3,"510553":3,"512":[2,3,4],"52":3,"524288":1,"526831":3,"5312":3,"54":3,"540372":1,"541":4,"546":2,"552120":3,"56":3,"560748":3,"563555":3,"565406":3,"566038":2,"566925":3,"568431":4,"584279":3,"585":2,"5859":3,"586858":4,"589488":3,"5898":3,"596":[2,6],"5mn":2,"6":[0,1,3],"600000":1,"600004":2,"600781":3,"606":2,"6094":3,"614":1,"615390":1,"62":3,"622783":3,"63":3,"630":2,"630777":3,"632127":3,"64":[1,3],"640":[2,3],"65536":1,"656000":3,"656574":1,"661740":2,"664":2,"664092":3,"666684":2,"67086":4,"67108864":1,"6724":1,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"702":2,"7031":3,"7070":3,"707878":4,"71":3,"719258":4,"72":3,"721311":3,"722":1,"726003":3,"73":3,"74":3,"743443":4,"75":3,"7500":3,"752274":3,"755369":2,"755985":2,"76":[1,3],"768":[2,3],"768000":3,"768213":3,"776119":3,"777746":3,"78":3,"780":1,"781":2,"79":3,"797134":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"806694":4,"81":3,"810":2,"811":2,"811163":1,"812":[1,2],"814814":2,"8192":1,"82":3,"822459":3,"823517":1,"825298":3,"83":3,"833":1,"838026":4,"8388608":1,"839992":2,"84":3,"840807":2,"84284":4,"843":1,"848":1,"849777":3,"85":3,"850":1,"851":1,"853930":3,"86":3,"863938":4,"87":3,"8828":3,"8867":3,"888887":3,"89":3,"8906":3,"8945":3,"896":3,"899428":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"909":6,"9219":3,"925276":2,"929456":3,"93":[2,3],"932191":3,"9375":3,"94":2,"9492":3,"95":2,"952835":4,"9531":3,"958266":3,"96":2,"964":[1,6],"9688":3,"97":2,"971025":3,"971190":2,"9733":1,"978909":3,"98":2,"9805":3,"980802":3,"983276":3,"98432":1,"9844":3,"999995":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:[2,9],error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:2,ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file
+Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":2,"000002":2,"007961":2,"01":[1,3,6],"018600":3,"019":[1,6],"019778":3,"02":[2,6],"025078":3,"025776":3,"028308":3,"03":[3,6],"034941":3,"04":[4,6],"040756":3,"056616":3,"061463":3,"0625":3,"063813":3,"080114":3,"08199":4,"08452":4,"084721":1,"091436":3,"0938":3,"096718":2,"097543":2,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4],"100":[2,48],"101597":2,"1024":[1,3,4,11],"1045":3,"1048576":1,"106434":4,"11":[0,1,3,6],"1152":3,"12":[1,2,3,6],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"125224":3,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"1408":3,"140846":3,"142862":2,"143743":3,"149375":2,"149397":4,"15":[1,3],"153":2,"1536":3,"153853":2,"154":2,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"1664":3,"16777216":1,"17":3,"171410":2,"172588":3,"17879":4,"1792":3,"179533":2,"18":3,"181817":2,"1823":2,"185":[4,6],"186":2,"19":[1,3],"190482":1,"192":1,"1920":3,"197974":3,"198":2,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"21":3,"212868":4,"2141":1,"214186":4,"216187":2,"2176":3,"219":1,"219206":3,"22":3,"220":3,"23":[3,6],"2304":3,"24":3,"2432":3,"245":3,"25":[3,48],"256":[1,2,3,10],"2560":3,"26":3,"260869":3,"262144":1,"262177":3,"2656":3,"2688":3,"27":3,"273916":3,"28":[1,3],"2812":3,"2816":3,"283019":3,"2891":3,"29":3,"293429":4,"2944":3,"296679":3,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"3072":3,"3076":1,"31":3,"3125":3,"32":[3,10],"3200":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"34":3,"341":1,"34172":4,"342":6,"3438":3,"3456":3,"3477":3,"35":3,"3516":3,"3555":3,"3584":3,"359066":2,"36":3,"362445":1,"369354":3,"3712":3,"3713":1,"371721":4,"372618":3,"372800":3,"38":1,"380953":3,"384":[2,3],"3840":3,"384000":3,"388456":3,"39":3,"3906":3,"392363":3,"392744":3,"3968":3,"3984":3,"3986":4,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"4023":3,"403344":4,"403347":4,"406":2,"4062":3,"408716":4,"4096":[1,2,3],"412":2,"416":2,"4194304":1,"42142":4,"428568":1,"428801":3,"429770":[1,2],"430545":3,"431969":4,"433227":3,"44":3,"441243":3,"4492":3,"4531":3,"46":3,"460651":3,"4609":3,"466332":3,"4688":3,"472":1,"49":3,"4940":1,"4m":2,"4x":2,"5":[1,3,4,9],"5000":3,"500614":3,"501144":3,"507077":3,"51":3,"512":[2,3,4],"52":3,"524288":1,"528664":3,"530615":3,"5312":3,"54":3,"541":4,"546":2,"546756":2,"549":[3,6],"558825":3,"56":3,"563555":3,"565406":3,"566038":2,"566925":3,"568431":4,"577704":1,"584162":3,"585":2,"5859":3,"586858":4,"587973":3,"589":[2,6],"5898":3,"59":[3,6],"5mn":2,"6":[0,1,3],"600000":1,"600004":2,"603520":3,"605729":3,"606":2,"6094":3,"614":1,"615390":1,"62":3,"63":3,"630":2,"638266":3,"64":[1,3],"640":[2,3],"65536":1,"655991":2,"656000":3,"656574":1,"664":2,"664092":3,"666684":2,"67086":4,"67108864":1,"6724":1,"68":3,"682":2,"69":3,"6953":3,"698613":3,"7":[0,1,3,9],"70":3,"702":2,"7031":3,"7070":3,"707878":4,"708630":3,"71":3,"719258":4,"72":3,"722":1,"727466":3,"73":3,"733401":2,"74":3,"743443":4,"747321":3,"75":3,"7500":3,"752274":3,"76":[1,3],"768":[2,3],"768000":3,"77":3,"78":3,"780":1,"781":2,"79":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"806694":4,"81":3,"810":2,"811":2,"811163":1,"812":[1,2],"814814":2,"8192":1,"82":3,"823517":1,"828879":3,"83":3,"833":1,"833728":3,"838026":4,"8388608":1,"839992":2,"84":3,"840807":2,"84284":4,"843":1,"848":1,"85":3,"850":1,"850207":3,"851":1,"851852":3,"86":3,"863938":4,"865439":3,"87":3,"873965":2,"876512":3,"8828":3,"8867":3,"888887":3,"890151":3,"8906":3,"894196":3,"8945":3,"896":3,"898285":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"908442":3,"913776":2,"916269":3,"9219":3,"925276":2,"93":[2,3],"932191":3,"933564":3,"9375":3,"94":2,"9492":3,"95":2,"950243":3,"952835":4,"9531":3,"96":2,"9688":3,"97":2,"9733":1,"973584":3,"978909":3,"98":2,"9805":3,"983276":3,"98432":1,"9844":3,"994954":3,"999995":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:[2,9],error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:2,ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file