diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index 82e42c143..aab318caa 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index f3dc2e234..a4dd8413e 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index 67b83135a..5723bca13 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index 318ba9d77..96771572d 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index 6f4033674..5f3cc3772 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index deaa8e1d8..35ee4dd0a 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index 04c7e5d6d..e8bee9f60 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index 3eea650d9..b1e6e6da9 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index bdfbd6a6f..a165f60b2 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -233,8 +233,8 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
                size      Triton       Torch
     0        4096.0    9.600000    9.600000
     1        8192.0   19.200000   19.200000
-    2       16384.0   38.400001   38.400001
-    3       32768.0   76.800002   76.800002
+    2       16384.0   38.400001   31.999999
+    3       32768.0   63.999998   76.800002
     4       65536.0  127.999995  127.999995
     5      131072.0  219.428568  219.428568
     6      262144.0  341.333321  384.000001
@@ -244,7 +244,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
     10    4194304.0  780.190482  780.190482
     11    8388608.0  812.429770  812.429770
     12   16777216.0  833.084721  833.084721
-    13   33554432.0  842.004273  842.004273
+    13   33554432.0  842.004273  843.811163
     14   67108864.0  847.448255  848.362445
     15  134217728.0  849.737435  850.656574
 
@@ -254,7 +254,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  53.211 seconds)
+   **Total running time of the script:** ( 1 minutes  52.287 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index 9e1e07d3e..4c7b124f7 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -286,17 +286,17 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
 
     softmax-performance:
               N      Triton  Torch (native)  Torch (jit)
-    0     256.0  512.000001      546.133347   190.511628
-    1     384.0  585.142862      585.142862   151.703707
-    2     512.0  655.360017      606.814814   156.038096
+    0     256.0  512.000001      546.133347   186.181817
+    1     384.0  585.142862      585.142862   153.600004
+    2     512.0  630.153853      606.814814   154.566038
     3     640.0  682.666684      640.000002   160.000000
     4     768.0  702.171410      664.216187   163.839992
     ..      ...         ...             ...          ...
-    93  12160.0  810.666687      405.755985   198.936606
-    94  12288.0  810.754644      415.661740   199.298541
-    95  12416.0  809.189387      411.722274   198.904612
+    93  12160.0  810.666687      406.179533   199.140227
+    94  12288.0  810.754644      415.661740   199.399583
+    95  12416.0  809.189387      412.149375   198.954424
     96  12544.0  807.661970      412.971190   199.209928
-    97  12672.0  807.776923      411.679167   199.264875
+    97  12672.0  807.776923      412.097543   199.264875
 
     [98 rows x 4 columns]
 
@@ -314,7 +314,7 @@ In the above plot, we can see that:
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  31.670 seconds)
+   **Total running time of the script:** ( 3 minutes  29.511 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 5b08d800d..9acb6c6a3 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -463,36 +463,36 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
     matmul-performance:
              M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
     0    256.0   2.730667  ...   2.978909              2.978909
-    1    384.0   7.372800  ...   8.507077              8.507077
+    1    384.0   7.372800  ...   8.507077              7.899428
     2    512.0  14.563555  ...  16.384000             16.384000
     3    640.0  22.260869  ...  24.380953             24.380953
     4    768.0  32.768000  ...  34.028308             34.028308
-    5    896.0  39.025776  ...  39.025776             39.025776
-    6   1024.0  49.932191  ...  53.773130             52.428801
-    7   1152.0  44.566925  ...  46.656000             46.656000
+    5    896.0  39.025776  ...  39.025776             37.971025
+    6   1024.0  49.932191  ...  52.428801             52.428801
+    7   1152.0  44.566925  ...  46.656000             45.938215
     8   1280.0  51.200001  ...  56.109587             56.109587
     9   1408.0  64.138541  ...  66.485074             66.485074
-    10  1536.0  80.430545  ...  78.643199             78.643199
-    11  1664.0  62.929456  ...  62.061463             62.061463
-    12  1792.0  72.983276  ...  72.047592             72.047592
-    13  1920.0  69.120002  ...  70.172588             70.172588
-    14  2048.0  73.908442  ...  76.608294             76.260072
-    15  2176.0  82.813365  ...  85.998493             85.269692
-    16  2304.0  68.446623  ...  76.563695             76.563695
-    17  2432.0  71.305746  ...  74.521127             85.393507
-    18  2560.0  77.833728  ...  81.310171             81.310171
-    19  2688.0  83.552988  ...  89.464755             89.044730
-    20  2816.0  83.712490  ...  83.552120             82.916747
-    21  2944.0  82.373605  ...  83.060049             82.102191
-    22  3072.0  81.121923  ...  89.170242             88.197981
-    23  3200.0  84.880639  ...  95.238096             94.534716
-    24  3328.0  83.130825  ...  83.905938             84.200347
-    25  3456.0  82.015834  ...  90.994998             85.585527
-    26  3584.0  85.797134  ...  91.750399             94.947616
-    27  3712.0  80.823095  ...  87.475786             88.015279
-    28  3840.0  83.718392  ...  91.097196             86.602979
-    29  3968.0  87.818595  ...  90.994735             86.973584
-    30  4096.0  93.142072  ...  88.417474             85.325956
+    10  1536.0  79.526831  ...  79.526831             78.643199
+    11  1664.0  62.929456  ...  62.492442             62.061463
+    12  1792.0  72.983276  ...  72.512412             72.047592
+    13  1920.0  69.120002  ...  70.530615             70.172588
+    14  2048.0  73.908442  ...  76.959706             76.608294
+    15  2176.0  83.500614  ...  85.632545             85.269692
+    16  2304.0  68.446623  ...  76.809875             76.809875
+    17  2432.0  71.305746  ...  84.621881             84.877538
+    18  2560.0  77.649287  ...  80.709358             81.310171
+    19  2688.0  83.552988  ...  89.464755             89.464755
+    20  2816.0  83.392363  ...  83.392363             82.916747
+    21  2944.0  82.509987  ...  82.373605             82.237674
+    22  3072.0  82.301023  ...  88.750943             88.335577
+    23  3200.0  84.993363  ...  95.380032             94.955488
+    24  3328.0  83.130825  ...  84.200347             84.895397
+    25  3456.0  81.108217  ...  85.858966             85.494768
+    26  3584.0  86.707226  ...  97.947050             98.429551
+    27  3712.0  81.548851  ...  88.015279             88.326564
+    28  3840.0  80.139129  ...  89.187096             91.097196
+    29  3968.0  85.992909  ...  87.347124             88.103928
+    30  4096.0  91.805174  ...  92.948562             85.216761
 
     [31 rows x 5 columns]
 
@@ -502,7 +502,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 6 minutes  36.541 seconds)
+   **Total running time of the script:** ( 6 minutes  28.511 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
index 34799280f..0ae825e4e 100644
--- a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
+++ b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
@@ -238,7 +238,7 @@ References
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  0.195 seconds)
+   **Total running time of the script:** ( 0 minutes  0.354 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index 9c0dcc5c8..02e8216a4 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**12:01.617** total execution time for **getting-started_tutorials** files:
+**11:50.662** total execution time for **getting-started_tutorials** files:
 
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:36.541 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:28.511 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:31.670 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:29.511 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:53.211 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:52.287 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.195 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.354 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 70cd3bbc9..143b47e5f 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -322,8 +322,8 @@ for different problem sizes.</p>
            size      Triton       Torch
 0        4096.0    9.600000    9.600000
 1        8192.0   19.200000   19.200000
-2       16384.0   38.400001   38.400001
-3       32768.0   76.800002   76.800002
+2       16384.0   38.400001   31.999999
+3       32768.0   63.999998   76.800002
 4       65536.0  127.999995  127.999995
 5      131072.0  219.428568  219.428568
 6      262144.0  341.333321  384.000001
@@ -333,12 +333,12 @@ for different problem sizes.</p>
 10    4194304.0  780.190482  780.190482
 11    8388608.0  812.429770  812.429770
 12   16777216.0  833.084721  833.084721
-13   33554432.0  842.004273  842.004273
+13   33554432.0  842.004273  843.811163
 14   67108864.0  847.448255  848.362445
 15  134217728.0  849.737435  850.656574
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  53.211 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  52.287 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
index 0e376ea09..65f4e5c1f 100644
--- a/getting-started/tutorials/02-fused-softmax.html
+++ b/getting-started/tutorials/02-fused-softmax.html
@@ -373,17 +373,17 @@ We will then compare its performance against (1) <code class="code docutils lite
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>softmax-performance:
           N      Triton  Torch (native)  Torch (jit)
-0     256.0  512.000001      546.133347   190.511628
-1     384.0  585.142862      585.142862   151.703707
-2     512.0  655.360017      606.814814   156.038096
+0     256.0  512.000001      546.133347   186.181817
+1     384.0  585.142862      585.142862   153.600004
+2     512.0  630.153853      606.814814   154.566038
 3     640.0  682.666684      640.000002   160.000000
 4     768.0  702.171410      664.216187   163.839992
 ..      ...         ...             ...          ...
-93  12160.0  810.666687      405.755985   198.936606
-94  12288.0  810.754644      415.661740   199.298541
-95  12416.0  809.189387      411.722274   198.904612
+93  12160.0  810.666687      406.179533   199.140227
+94  12288.0  810.754644      415.661740   199.399583
+95  12416.0  809.189387      412.149375   198.954424
 96  12544.0  807.661970      412.971190   199.209928
-97  12672.0  807.776923      411.679167   199.264875
+97  12672.0  807.776923      412.097543   199.264875
 
 [98 rows x 4 columns]
 </pre></div>
@@ -396,7 +396,7 @@ We will then compare its performance against (1) <code class="code docutils lite
 Note however that the PyTorch <cite>softmax</cite> operation is more general and will works on tensors of any shape.</p></li>
 </ul>
 </div></blockquote>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  31.670 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  29.511 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>
diff --git a/getting-started/tutorials/03-matrix-multiplication.html b/getting-started/tutorials/03-matrix-multiplication.html
index ffb6b551f..bffd255d0 100644
--- a/getting-started/tutorials/03-matrix-multiplication.html
+++ b/getting-started/tutorials/03-matrix-multiplication.html
@@ -568,41 +568,41 @@ torch_output=tensor([[  1.1045, -36.9688,  31.4688,  ..., -11.3906,  24.4531, -3
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
          M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
 0    256.0   2.730667  ...   2.978909              2.978909
-1    384.0   7.372800  ...   8.507077              8.507077
+1    384.0   7.372800  ...   8.507077              7.899428
 2    512.0  14.563555  ...  16.384000             16.384000
 3    640.0  22.260869  ...  24.380953             24.380953
 4    768.0  32.768000  ...  34.028308             34.028308
-5    896.0  39.025776  ...  39.025776             39.025776
-6   1024.0  49.932191  ...  53.773130             52.428801
-7   1152.0  44.566925  ...  46.656000             46.656000
+5    896.0  39.025776  ...  39.025776             37.971025
+6   1024.0  49.932191  ...  52.428801             52.428801
+7   1152.0  44.566925  ...  46.656000             45.938215
 8   1280.0  51.200001  ...  56.109587             56.109587
 9   1408.0  64.138541  ...  66.485074             66.485074
-10  1536.0  80.430545  ...  78.643199             78.643199
-11  1664.0  62.929456  ...  62.061463             62.061463
-12  1792.0  72.983276  ...  72.047592             72.047592
-13  1920.0  69.120002  ...  70.172588             70.172588
-14  2048.0  73.908442  ...  76.608294             76.260072
-15  2176.0  82.813365  ...  85.998493             85.269692
-16  2304.0  68.446623  ...  76.563695             76.563695
-17  2432.0  71.305746  ...  74.521127             85.393507
-18  2560.0  77.833728  ...  81.310171             81.310171
-19  2688.0  83.552988  ...  89.464755             89.044730
-20  2816.0  83.712490  ...  83.552120             82.916747
-21  2944.0  82.373605  ...  83.060049             82.102191
-22  3072.0  81.121923  ...  89.170242             88.197981
-23  3200.0  84.880639  ...  95.238096             94.534716
-24  3328.0  83.130825  ...  83.905938             84.200347
-25  3456.0  82.015834  ...  90.994998             85.585527
-26  3584.0  85.797134  ...  91.750399             94.947616
-27  3712.0  80.823095  ...  87.475786             88.015279
-28  3840.0  83.718392  ...  91.097196             86.602979
-29  3968.0  87.818595  ...  90.994735             86.973584
-30  4096.0  93.142072  ...  88.417474             85.325956
+10  1536.0  79.526831  ...  79.526831             78.643199
+11  1664.0  62.929456  ...  62.492442             62.061463
+12  1792.0  72.983276  ...  72.512412             72.047592
+13  1920.0  69.120002  ...  70.530615             70.172588
+14  2048.0  73.908442  ...  76.959706             76.608294
+15  2176.0  83.500614  ...  85.632545             85.269692
+16  2304.0  68.446623  ...  76.809875             76.809875
+17  2432.0  71.305746  ...  84.621881             84.877538
+18  2560.0  77.649287  ...  80.709358             81.310171
+19  2688.0  83.552988  ...  89.464755             89.464755
+20  2816.0  83.392363  ...  83.392363             82.916747
+21  2944.0  82.509987  ...  82.373605             82.237674
+22  3072.0  82.301023  ...  88.750943             88.335577
+23  3200.0  84.993363  ...  95.380032             94.955488
+24  3328.0  83.130825  ...  84.200347             84.895397
+25  3456.0  81.108217  ...  85.858966             85.494768
+26  3584.0  86.707226  ...  97.947050             98.429551
+27  3712.0  81.548851  ...  88.015279             88.326564
+28  3840.0  80.139129  ...  89.187096             91.097196
+29  3968.0  85.992909  ...  87.347124             88.103928
+30  4096.0  91.805174  ...  92.948562             85.216761
 
 [31 rows x 5 columns]
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes  36.541 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes  28.511 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
diff --git a/getting-started/tutorials/04-low-memory-dropout.html b/getting-started/tutorials/04-low-memory-dropout.html
index b04f63145..486f47cef 100644
--- a/getting-started/tutorials/04-low-memory-dropout.html
+++ b/getting-started/tutorials/04-low-memory-dropout.html
@@ -370,7 +370,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
 <dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
 </dd>
 </dl>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.195 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.354 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>
diff --git a/getting-started/tutorials/sg_execution_times.html b/getting-started/tutorials/sg_execution_times.html
index 0d5c9e339..702203b3c 100644
--- a/getting-started/tutorials/sg_execution_times.html
+++ b/getting-started/tutorials/sg_execution_times.html
@@ -174,7 +174,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>12:01.617</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+<p><strong>11:50.662</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -183,19 +183,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
-<td><p>06:36.541</p></td>
+<td><p>06:28.511</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
-<td><p>03:31.670</p></td>
+<td><p>03:29.511</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
-<td><p>01:53.211</p></td>
+<td><p>01:52.287</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
-<td><p>00:00.195</p></td>
+<td><p>00:00.354</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/searchindex.js b/searchindex.js
index 0e745fb83..7027f2499 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":[1,2],"000002":2,"004273":1,"01":[1,3,6],"015279":3,"015834":3,"02":[2,6],"025776":3,"028308":3,"03":[3,6],"038096":2,"04":[4,6],"044730":3,"047592":3,"06":6,"060049":3,"061463":3,"0625":3,"08199":4,"08452":4,"084721":1,"0938":3,"097196":3,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4],"100":[2,48],"102191":3,"1024":[1,3,4,11],"1045":3,"1048576":1,"106434":4,"109587":3,"11":[0,1,3],"1152":3,"12":[1,3,6],"120002":3,"12160":2,"121923":3,"12288":2,"123":4,"12416":2,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"130825":3,"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"1408":3,"142072":3,"142862":2,"149397":4,"15":[1,3],"151":2,"1536":3,"156":2,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"1664":3,"16777216":1,"17":3,"170242":3,"171410":2,"172588":3,"17879":4,"1792":3,"18":3,"1823":2,"189387":2,"19":[1,3],"190":2,"190482":1,"192":1,"1920":3,"195":[4,6],"197981":3,"198":2,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"200347":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"209928":2,"21":3,"211":[1,6],"212868":4,"2141":1,"214186":4,"216187":2,"2176":3,"219":1,"22":3,"220":3,"23":3,"2304":3,"238096":3,"24":3,"2432":3,"245":3,"25":[3,48],"256":[1,2,3,10],"2560":3,"26":3,"260072":3,"260869":3,"262144":1,"264875":2,"2656":3,"2688":3,"269692":3,"27":3,"28":[1,3],"2812":3,"2816":3,"2891":3,"29":3,"293429":4,"2944":3,"298541":2,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"305746":3,"3072":3,"3076":1,"31":[2,3,6],"310171":3,"3125":3,"32":[3,10],"3200":3,"325956":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"3477":3,"3516":3,"3555":3,"3584":3,"36":[3,6],"360017":2,"362445":1,"3712":3,"3713":1,"371721":4,"372800":3,"373605":3,"38":1,"380953":3,"384":[1,2,3],"3840":3,"384000":3,"39":3,"3906":3,"393507":3,"3968":3,"3984":3,"3986":4,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"4023":3,"403344":4,"403347":4,"405":2,"4062":3,"408716":4,"4096":[1,2,3],"411":2,"412":2,"415":2,"417474":3,"4194304":1,"42142":4,"428568":1,"428801":3,"429770":1,"430545":3,"431969":4,"44":3,"446623":3,"448255":1,"4492":3,"4531":3,"46":3,"4609":3,"464755":3,"4688":3,"472":1,"475786":3,"485074":3,"49":3,"4940":1,"4m":2,"4x":2,"5":[1,3,4,9,48],"5000":3,"507077":3,"51":3,"511628":2,"512":[2,3,4],"52":3,"521127":3,"524288":1,"53":[1,3,6],"5312":3,"534716":3,"54":3,"541":[3,4,6],"546":2,"552120":3,"552988":3,"56":3,"563555":3,"563695":3,"566925":3,"568431":4,"585":2,"585527":3,"5859":3,"586858":4,"5898":3,"5mn":2,"6":[0,1,3],"600000":1,"602979":3,"606":2,"608294":3,"6094":3,"614":1,"615390":1,"617":6,"62":3,"64":[1,3],"640":[2,3],"643199":3,"655":2,"65536":1,"656000":3,"656574":1,"66":3,"661740":2,"661970":2,"664":2,"666684":2,"666687":2,"670":[2,6],"67086":4,"67108864":1,"6724":1,"679167":2,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"70":3,"702":2,"7031":3,"703707":2,"7070":3,"707878":4,"71":3,"712490":3,"718392":3,"719258":4,"72":3,"722":1,"722274":2,"73":3,"730667":3,"737435":1,"74":3,"743443":4,"7500":3,"750399":3,"754644":2,"755985":2,"76":[1,3],"768":[2,3],"768000":3,"77":3,"773130":3,"776923":2,"78":3,"780":1,"781":2,"797134":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"806694":4,"807":2,"809":2,"81":3,"810":2,"812":1,"813365":3,"814814":2,"818595":3,"8192":1,"82":3,"823095":3,"823517":1,"83":3,"833":1,"833728":3,"838026":4,"8388608":1,"839992":2,"84":3,"842":1,"84284":4,"847":1,"848":1,"849":1,"85":3,"850":1,"86":3,"863938":4,"87":3,"88":3,"880639":3,"8828":3,"8867":3,"89":3,"8906":3,"8945":3,"896":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"904612":2,"90567":4,"905938":3,"908442":3,"91":3,"916747":3,"9219":3,"929456":3,"93":[2,3],"932191":3,"936606":2,"9375":3,"94":[2,3],"947616":3,"9492":3,"95":[2,3],"952835":4,"9531":3,"96":2,"9688":3,"97":2,"971190":2,"9733":1,"973584":3,"978909":3,"98":2,"9805":3,"983276":3,"98432":1,"9844":3,"994735":3,"994998":3,"998493":3,"999995":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:9,error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47,48],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,pgm:1,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:[0,2],ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],record_clock:48,rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[0,2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],txt:0,type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file
+Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":[1,2],"000002":2,"004273":1,"01":[1,3,6],"015279":3,"02":[2,6],"025776":3,"028308":3,"03":[3,6],"04":[4,6],"047592":3,"06":6,"061463":3,"0625":3,"08199":4,"08452":4,"084721":1,"0938":3,"097196":3,"097543":2,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4],"100":[2,48],"1024":[1,3,4,11],"103928":3,"1045":3,"1048576":1,"106434":4,"108217":3,"109587":3,"11":[0,1,3,6],"1152":3,"12":[1,3],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"130825":3,"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"139129":3,"14":[1,3],"140227":2,"1408":3,"142862":2,"149375":2,"149397":4,"15":[1,3],"153":2,"1536":3,"153853":2,"154":2,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"1664":3,"16777216":1,"17":3,"171410":2,"172588":3,"17879":4,"1792":3,"179533":2,"18":3,"181817":2,"1823":2,"186":2,"187096":3,"189387":2,"19":[1,3],"190482":1,"192":1,"1920":3,"198":2,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"200347":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"209928":2,"21":3,"212868":4,"2141":1,"214186":4,"216187":2,"216761":3,"2176":3,"219":1,"22":3,"220":3,"23":3,"2304":3,"237674":3,"24":3,"2432":3,"245":3,"25":[3,48],"256":[1,2,3,10],"2560":3,"26":3,"260869":3,"262144":1,"264875":2,"2656":3,"2688":3,"269692":3,"27":3,"28":[1,3,6],"2812":3,"2816":3,"287":[1,6],"2891":3,"29":[2,3,6],"293429":4,"2944":3,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"301023":3,"305746":3,"3072":3,"3076":1,"31":[1,3],"310171":3,"3125":3,"32":[3,10],"3200":3,"326564":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"335577":3,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"347124":3,"3477":3,"3516":3,"354":[4,6],"3555":3,"3584":3,"36":3,"362445":1,"37":3,"3712":3,"3713":1,"371721":4,"372800":3,"373605":3,"38":1,"380032":3,"380953":3,"384":[1,2,3],"3840":3,"384000":3,"39":3,"3906":3,"392363":3,"3968":3,"3984":3,"3986":4,"399583":2,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"4023":3,"403344":4,"403347":4,"406":2,"4062":3,"408716":4,"4096":[1,2,3],"412":2,"415":2,"4194304":1,"42142":4,"428568":1,"428801":3,"429551":3,"429770":1,"431969":4,"44":3,"446623":3,"448255":1,"4492":3,"45":3,"4531":3,"46":3,"4609":3,"464755":3,"4688":3,"472":1,"485074":3,"49":3,"492442":3,"4940":1,"494768":3,"4m":2,"4x":2,"5":[1,3,4,9,48],"50":6,"5000":3,"500614":3,"507077":3,"509987":3,"51":3,"511":[2,3,6],"512":[2,3,4],"512412":3,"52":[1,3,6],"524288":1,"526831":3,"530615":3,"5312":3,"54":3,"541":4,"546":2,"548851":3,"552988":3,"56":3,"563555":3,"566038":2,"566925":3,"568431":4,"585":2,"5859":3,"586858":4,"5898":3,"5mn":2,"6":[0,1,3],"600000":1,"600004":2,"606":2,"608294":3,"6094":3,"614":1,"615390":1,"62":3,"621881":3,"63":1,"630":2,"632545":3,"64":[1,3],"640":[2,3],"643199":3,"649287":3,"65536":1,"656000":3,"656574":1,"66":3,"661740":2,"661970":2,"662":6,"664":2,"666684":2,"666687":2,"67086":4,"67108864":1,"6724":1,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"70":3,"702":2,"7031":3,"7070":3,"707226":3,"707878":4,"709358":3,"71":3,"719258":4,"72":3,"722":1,"73":3,"730667":3,"737435":1,"743443":4,"7500":3,"750943":3,"754644":2,"76":[1,3],"768":[2,3],"768000":3,"77":3,"776923":2,"78":3,"780":1,"781":2,"79":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"805174":3,"806694":4,"807":2,"809":2,"809875":3,"81":3,"810":2,"811163":1,"812":1,"814814":2,"8192":1,"82":3,"823517":1,"83":3,"833":1,"838026":4,"8388608":1,"839992":2,"84":3,"842":1,"84284":4,"843":1,"847":1,"848":1,"849":1,"85":3,"850":1,"858966":3,"86":3,"863938":4,"87":3,"877538":3,"88":3,"8828":3,"8867":3,"89":3,"8906":3,"8945":3,"895397":3,"896":3,"899428":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"908442":3,"91":3,"916747":3,"92":3,"9219":3,"929456":3,"93":2,"932191":3,"9375":3,"938215":3,"94":[2,3],"947050":3,"948562":3,"9492":3,"95":[2,3],"952835":4,"9531":3,"954424":2,"955488":3,"959706":3,"96":2,"9688":3,"97":[2,3],"971025":3,"971190":2,"9733":1,"978909":3,"98":[2,3],"9805":3,"983276":3,"98432":1,"9844":3,"992909":3,"993363":3,"999995":1,"999998":1,"999999":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:9,error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47,48],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,pgm:1,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:[0,2],ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],record_clock:48,rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[0,2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],txt:0,type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file