diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index 1488600ad..a3ebafcb8 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index c0dccebcb..55c0a9f38 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index d103ad29b..423b366d6 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index d2558f2c4..feac82077 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index 4adad176a..d1386546d 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index 95427a2a6..b2c2e08b6 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index bb8bcb7dc..5366ee1a6 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index 78282fd2c..72225d786 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index 46c7371a3..d38c65ca4 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -234,7 +234,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
     0        4096.0    9.600000    9.600000
     1        8192.0   19.200000   19.200000
     2       16384.0   38.400001   38.400001
-    3       32768.0   63.999998   76.800002
+    3       32768.0   76.800002   76.800002
     4       65536.0  127.999995  127.999995
     5      131072.0  219.428568  219.428568
     6      262144.0  341.333321  341.333321
@@ -254,7 +254,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  44.504 seconds)
+   **Total running time of the script:** ( 1 minutes  47.617 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index 73cc4b463..15c82cecb 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -286,16 +286,16 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
 
     softmax-performance:
               N      Triton  Torch (native)  Torch (jit)
-    0     256.0  512.000001      546.133347   184.089886
-    1     384.0  585.142862      558.545450   151.703707
+    0     256.0  512.000001      546.133347   186.181817
+    1     384.0  585.142862      585.142862   153.600004
     2     512.0  630.153853      606.814814   154.566038
     3     640.0  682.666684      640.000002   160.000000
     4     768.0  702.171410      664.216187   163.839992
     ..      ...         ...             ...          ...
-    93  12160.0  810.666687      406.179533   199.038365
-    94  12288.0  810.754644      415.222812   199.298541
-    95  12416.0  809.189387      412.577363   198.854847
-    96  12544.0  807.661970      412.971190   199.061730
+    93  12160.0  810.666687      406.179533   199.140227
+    94  12288.0  810.754644      416.101597   199.399583
+    95  12416.0  809.189387      412.149375   199.054102
+    96  12544.0  807.661970      412.546756   199.308841
     97  12672.0  807.776923      412.097543   199.264875
 
     [98 rows x 4 columns]
@@ -314,7 +314,7 @@ In the above plot, we can see that:
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  26.029 seconds)
+   **Total running time of the script:** ( 3 minutes  26.788 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 04727690b..c0c53695e 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -463,36 +463,36 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
     matmul-performance:
              M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
     0    256.0   2.978909  ...   2.978909              2.978909
-    1    384.0   7.372800  ...   8.507077              8.507077
+    1    384.0   7.372800  ...   7.899428              7.899428
     2    512.0  14.563555  ...  16.384000             16.384000
     3    640.0  22.260869  ...  24.380953             24.380953
     4    768.0  32.768000  ...  34.028308             34.028308
-    5    896.0  37.971025  ...  39.025776             39.025776
+    5    896.0  39.025776  ...  40.140799             39.025776
     6   1024.0  49.932191  ...  53.773130             52.428801
-    7   1152.0  44.566925  ...  46.656000             45.938215
-    8   1280.0  51.200001  ...  56.109587             56.109587
-    9   1408.0  64.138541  ...  66.485074             66.485074
+    7   1152.0  44.566925  ...  46.656000             46.656000
+    8   1280.0  51.200001  ...  56.888887             56.109587
+    9   1408.0  64.138541  ...  67.305878             66.485074
     10  1536.0  79.526831  ...  78.643199             78.643199
-    11  1664.0  62.929456  ...  62.492442             62.492442
-    12  1792.0  72.983276  ...  72.047592             72.047592
+    11  1664.0  62.929456  ...  62.061463             62.061463
+    12  1792.0  72.983276  ...  71.588687             72.047592
     13  1920.0  69.120002  ...  70.530615             70.172588
     14  2048.0  73.908442  ...  76.959706             76.608294
-    15  2176.0  83.155572  ...  85.998493             85.998493
-    16  2304.0  68.446623  ...  76.809875             76.809875
-    17  2432.0  71.305746  ...  74.918570             85.393507
-    18  2560.0  77.833728  ...  81.310171             80.709358
-    19  2688.0  83.552988  ...  89.676257             89.254248
-    20  2816.0  82.759409  ...  83.074685             83.392363
-    21  2944.0  82.784108  ...  81.832567             82.237674
-    22  3072.0  81.943708  ...  87.924073             89.030036
-    23  3200.0  82.368085  ...  89.012517             95.025983
-    24  3328.0  83.613586  ...  81.346098             83.905938
-    25  3456.0  81.766291  ...  90.943675             91.097818
-    26  3584.0  86.540320  ...  91.655413             87.381330
-    27  3712.0  85.163978  ...  84.088676             88.561477
-    28  3840.0  80.960466  ...  86.400002             91.701494
-    29  3968.0  86.083907  ...  91.198760             84.154440
-    30  4096.0  93.498941  ...  93.336389             89.181212
+    15  2176.0  83.500614  ...  86.367588             85.632545
+    16  2304.0  68.446623  ...  76.809875             77.057651
+    17  2432.0  71.305746  ...  74.521127             85.393507
+    18  2560.0  78.019048  ...  81.512437             81.108913
+    19  2688.0  83.922689  ...  89.464755             89.676257
+    20  2816.0  84.035084  ...  83.233226             83.552120
+    21  2944.0  82.373605  ...  82.784108             81.967162
+    22  3072.0  81.707223  ...  89.170242             87.924073
+    23  3200.0  84.656085  ...  95.451158             95.096582
+    24  3328.0  84.003845  ...  82.181847             83.710812
+    25  3456.0  81.849303  ...  91.097818             91.097818
+    26  3584.0  86.790921  ...  93.661869             95.047985
+    27  3712.0  84.159518  ...  86.044224             89.273764
+    28  3840.0  83.528704  ...  90.649182             87.011801
+    29  3968.0  89.003603  ...  86.480463             86.911637
+    30  4096.0  91.741443  ...  91.428970             86.258181
 
     [31 rows x 5 columns]
 
@@ -502,7 +502,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 6 minutes  4.767 seconds)
+   **Total running time of the script:** ( 5 minutes  38.718 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
index f7149d658..fe5ecad7d 100644
--- a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
+++ b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
@@ -238,7 +238,7 @@ References
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  0.271 seconds)
+   **Total running time of the script:** ( 0 minutes  0.010 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index a6feaad6c..6f36d0542 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,14 +5,14 @@
 
 Computation times
 =================
-**11:15.571** total execution time for **getting-started_tutorials** files:
+**10:53.132** total execution time for **getting-started_tutorials** files:
 
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:04.767 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 05:38.718 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:26.029 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:26.788 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:44.504 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:47.617 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.271 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.010 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 5a8e9df35..32bac86fb 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -323,7 +323,7 @@ for different problem sizes.</p>
 0        4096.0    9.600000    9.600000
 1        8192.0   19.200000   19.200000
 2       16384.0   38.400001   38.400001
-3       32768.0   63.999998   76.800002
+3       32768.0   76.800002   76.800002
 4       65536.0  127.999995  127.999995
 5      131072.0  219.428568  219.428568
 6      262144.0  341.333321  341.333321
@@ -338,7 +338,7 @@ for different problem sizes.</p>
 15  134217728.0  849.737435  850.656574
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  44.504 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  47.617 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
index ba73b9fc5..75b24dfc3 100644
--- a/getting-started/tutorials/02-fused-softmax.html
+++ b/getting-started/tutorials/02-fused-softmax.html
@@ -373,16 +373,16 @@ We will then compare its performance against (1) <code class="code docutils lite
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>softmax-performance:
           N      Triton  Torch (native)  Torch (jit)
-0     256.0  512.000001      546.133347   184.089886
-1     384.0  585.142862      558.545450   151.703707
+0     256.0  512.000001      546.133347   186.181817
+1     384.0  585.142862      585.142862   153.600004
 2     512.0  630.153853      606.814814   154.566038
 3     640.0  682.666684      640.000002   160.000000
 4     768.0  702.171410      664.216187   163.839992
 ..      ...         ...             ...          ...
-93  12160.0  810.666687      406.179533   199.038365
-94  12288.0  810.754644      415.222812   199.298541
-95  12416.0  809.189387      412.577363   198.854847
-96  12544.0  807.661970      412.971190   199.061730
+93  12160.0  810.666687      406.179533   199.140227
+94  12288.0  810.754644      416.101597   199.399583
+95  12416.0  809.189387      412.149375   199.054102
+96  12544.0  807.661970      412.546756   199.308841
 97  12672.0  807.776923      412.097543   199.264875
 
 [98 rows x 4 columns]
@@ -396,7 +396,7 @@ We will then compare its performance against (1) <code class="code docutils lite
 Note however that the PyTorch <cite>softmax</cite> operation is more general and will works on tensors of any shape.</p></li>
 </ul>
 </div></blockquote>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  26.029 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  26.788 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>
diff --git a/getting-started/tutorials/03-matrix-multiplication.html b/getting-started/tutorials/03-matrix-multiplication.html
index 1af06876c..f6f41dd25 100644
--- a/getting-started/tutorials/03-matrix-multiplication.html
+++ b/getting-started/tutorials/03-matrix-multiplication.html
@@ -568,41 +568,41 @@ torch_output=tensor([[  1.1045, -36.9688,  31.4688,  ..., -11.3906,  24.4531, -3
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
          M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
 0    256.0   2.978909  ...   2.978909              2.978909
-1    384.0   7.372800  ...   8.507077              8.507077
+1    384.0   7.372800  ...   7.899428              7.899428
 2    512.0  14.563555  ...  16.384000             16.384000
 3    640.0  22.260869  ...  24.380953             24.380953
 4    768.0  32.768000  ...  34.028308             34.028308
-5    896.0  37.971025  ...  39.025776             39.025776
+5    896.0  39.025776  ...  40.140799             39.025776
 6   1024.0  49.932191  ...  53.773130             52.428801
-7   1152.0  44.566925  ...  46.656000             45.938215
-8   1280.0  51.200001  ...  56.109587             56.109587
-9   1408.0  64.138541  ...  66.485074             66.485074
+7   1152.0  44.566925  ...  46.656000             46.656000
+8   1280.0  51.200001  ...  56.888887             56.109587
+9   1408.0  64.138541  ...  67.305878             66.485074
 10  1536.0  79.526831  ...  78.643199             78.643199
-11  1664.0  62.929456  ...  62.492442             62.492442
-12  1792.0  72.983276  ...  72.047592             72.047592
+11  1664.0  62.929456  ...  62.061463             62.061463
+12  1792.0  72.983276  ...  71.588687             72.047592
 13  1920.0  69.120002  ...  70.530615             70.172588
 14  2048.0  73.908442  ...  76.959706             76.608294
-15  2176.0  83.155572  ...  85.998493             85.998493
-16  2304.0  68.446623  ...  76.809875             76.809875
-17  2432.0  71.305746  ...  74.918570             85.393507
-18  2560.0  77.833728  ...  81.310171             80.709358
-19  2688.0  83.552988  ...  89.676257             89.254248
-20  2816.0  82.759409  ...  83.074685             83.392363
-21  2944.0  82.784108  ...  81.832567             82.237674
-22  3072.0  81.943708  ...  87.924073             89.030036
-23  3200.0  82.368085  ...  89.012517             95.025983
-24  3328.0  83.613586  ...  81.346098             83.905938
-25  3456.0  81.766291  ...  90.943675             91.097818
-26  3584.0  86.540320  ...  91.655413             87.381330
-27  3712.0  85.163978  ...  84.088676             88.561477
-28  3840.0  80.960466  ...  86.400002             91.701494
-29  3968.0  86.083907  ...  91.198760             84.154440
-30  4096.0  93.498941  ...  93.336389             89.181212
+15  2176.0  83.500614  ...  86.367588             85.632545
+16  2304.0  68.446623  ...  76.809875             77.057651
+17  2432.0  71.305746  ...  74.521127             85.393507
+18  2560.0  78.019048  ...  81.512437             81.108913
+19  2688.0  83.922689  ...  89.464755             89.676257
+20  2816.0  84.035084  ...  83.233226             83.552120
+21  2944.0  82.373605  ...  82.784108             81.967162
+22  3072.0  81.707223  ...  89.170242             87.924073
+23  3200.0  84.656085  ...  95.451158             95.096582
+24  3328.0  84.003845  ...  82.181847             83.710812
+25  3456.0  81.849303  ...  91.097818             91.097818
+26  3584.0  86.790921  ...  93.661869             95.047985
+27  3712.0  84.159518  ...  86.044224             89.273764
+28  3840.0  83.528704  ...  90.649182             87.011801
+29  3968.0  89.003603  ...  86.480463             86.911637
+30  4096.0  91.741443  ...  91.428970             86.258181
 
 [31 rows x 5 columns]
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes  4.767 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  38.718 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
diff --git a/getting-started/tutorials/04-low-memory-dropout.html b/getting-started/tutorials/04-low-memory-dropout.html
index 6dcca5d5e..8223753fa 100644
--- a/getting-started/tutorials/04-low-memory-dropout.html
+++ b/getting-started/tutorials/04-low-memory-dropout.html
@@ -370,7 +370,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
 <dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
 </dd>
 </dl>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.271 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.010 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>
diff --git a/getting-started/tutorials/sg_execution_times.html b/getting-started/tutorials/sg_execution_times.html
index e35c4d02a..4e71d06c4 100644
--- a/getting-started/tutorials/sg_execution_times.html
+++ b/getting-started/tutorials/sg_execution_times.html
@@ -174,7 +174,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>11:15.571</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+<p><strong>10:53.132</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -183,19 +183,19 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
-<td><p>06:04.767</p></td>
+<td><p>05:38.718</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
-<td><p>03:26.029</p></td>
+<td><p>03:26.788</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
-<td><p>01:44.504</p></td>
+<td><p>01:47.617</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
-<td><p>00:00.271</p></td>
+<td><p>00:00.010</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/searchindex.js b/searchindex.js
index 902330338..68ebbb120 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":2,"000002":2,"004273":1,"01":[1,3,6],"012517":3,"02":[2,6],"025776":3,"025983":3,"028308":3,"029":[2,6],"03":[3,6],"030036":3,"038365":2,"04":[4,6],"047592":3,"06":6,"061730":2,"0625":3,"074685":3,"08199":4,"083907":3,"08452":4,"084721":1,"088676":3,"089886":2,"0938":3,"097543":2,"097818":3,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4],"100":[2,48],"1024":[1,3,4,11],"1045":3,"1048576":1,"106434":4,"109587":3,"11":[0,1,3,6],"1152":3,"12":[1,3],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"1408":3,"142862":2,"149397":4,"15":[1,3,6],"151":2,"1536":3,"153853":2,"154":2,"154440":3,"155572":3,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"163978":3,"1664":3,"16777216":1,"17":3,"171410":2,"172588":3,"17879":4,"1792":3,"179533":2,"18":3,"181212":3,"1823":2,"184":2,"189387":2,"19":[1,3],"190482":1,"192":1,"1920":3,"198":2,"1982":9,"1983":8,"1984":9,"198760":3,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"21":3,"212868":4,"2141":1,"214186":4,"216187":2,"2176":3,"219":1,"22":3,"220":3,"222812":2,"23":3,"2304":3,"237674":3,"24":3,"2432":3,"245":3,"25":[3,48],"254248":3,"256":[1,2,3,10],"2560":3,"26":[2,3,6],"260869":3,"262144":1,"264875":2,"2656":3,"2688":3,"27":3,"271":[4,6],"28":[1,3],"2812":3,"2816":3,"2891":3,"29":3,"293429":4,"2944":3,"298541":2,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"305746":3,"3072":3,"3076":1,"31":3,"310171":3,"3125":3,"32":[3,10],"3200":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"336389":3,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"346098":3,"3477":3,"3516":3,"3555":3,"3584":3,"36":3,"362445":1,"368085":3,"37":3,"3712":3,"3713":1,"371721":4,"372800":3,"38":1,"380953":3,"381330":3,"384":[2,3],"3840":3,"384000":3,"39":3,"3906":3,"392363":3,"393507":3,"3968":3,"3984":3,"3986":4,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400002":3,"400016":1,"4023":3,"403344":4,"403347":4,"406":2,"4062":3,"408716":4,"4096":[1,2,3],"412":2,"415":2,"4194304":1,"42142":4,"428568":1,"428801":3,"429770":1,"431969":4,"44":[1,3,6],"446623":3,"448255":1,"4492":3,"45":3,"4531":3,"46":3,"4609":3,"4688":3,"472":1,"485074":3,"49":3,"492442":3,"4940":1,"498941":3,"4m":2,"4x":2,"5":[1,3,4,9,48],"5000":3,"504":[1,6],"507077":3,"51":3,"512":[2,3,4],"52":3,"524288":1,"526831":3,"53":3,"530615":3,"5312":3,"54":3,"540320":3,"541":4,"545450":2,"546":2,"552988":3,"558":2,"56":3,"561477":3,"563555":3,"566038":2,"566925":3,"568431":4,"571":6,"577363":2,"585":2,"5859":3,"586858":4,"5898":3,"5mn":2,"6":[0,1,3],"600000":1,"606":2,"608294":3,"6094":3,"613586":3,"614":1,"615390":1,"62":3,"63":1,"630":2,"64":[1,3],"640":[2,3],"643199":3,"65536":1,"655413":3,"656000":3,"656574":1,"66":3,"661970":2,"664":2,"666684":2,"666687":2,"67086":4,"67108864":1,"6724":1,"676257":3,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"70":3,"701494":3,"702":2,"7031":3,"703707":2,"7070":3,"707878":4,"709358":3,"71":3,"719258":4,"72":3,"722":1,"73":3,"737435":1,"74":3,"743443":4,"7500":3,"754644":2,"759409":3,"76":[1,3],"766291":3,"767":[3,6],"768":[2,3],"768000":3,"77":3,"773130":3,"776923":2,"78":3,"780":1,"781":2,"784108":3,"79":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"806694":4,"807":2,"809":2,"809875":3,"81":3,"810":2,"811163":1,"812":1,"814814":2,"8192":1,"82":3,"823517":1,"83":3,"832567":3,"833":1,"833728":3,"838026":4,"8388608":1,"839992":2,"84":3,"842":1,"84284":4,"843":1,"847":1,"848":1,"849":1,"85":3,"850":1,"854847":2,"86":3,"863938":4,"87":3,"88":3,"8828":3,"8867":3,"89":3,"8906":3,"8945":3,"896":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"905938":3,"908442":3,"91":3,"918570":3,"9219":3,"924073":3,"929456":3,"93":[2,3],"932191":3,"9375":3,"938215":3,"94":2,"943675":3,"943708":3,"9492":3,"95":[2,3],"952835":4,"9531":3,"959706":3,"96":2,"960466":3,"9688":3,"97":2,"971025":3,"971190":2,"9733":1,"978909":3,"98":2,"9805":3,"983276":3,"98432":1,"9844":3,"998493":3,"999995":1,"999998":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,13,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:9,error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47,48],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,kwarg:13,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,pgm:1,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:[0,2],ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],record_clock:48,rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[0,2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4,46],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],txt:0,type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file
+Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":2,"000002":2,"003603":3,"003845":3,"004273":1,"01":[1,3,6],"010":[4,6],"011801":3,"019048":3,"02":[2,6],"025776":3,"028308":3,"03":[3,6],"035084":3,"04":[4,6],"044224":3,"047592":3,"047985":3,"05":6,"054102":2,"057651":3,"061463":3,"0625":3,"08199":4,"08452":4,"084721":1,"0938":3,"096582":3,"097543":2,"097818":3,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4,6],"100":[2,48],"101597":2,"1024":[1,3,4,11],"1045":3,"1048576":1,"106434":4,"108913":3,"109587":3,"11":[0,1,3],"1152":3,"12":[1,3],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"131072":1,"132":6,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"140227":2,"140799":3,"1408":3,"142862":2,"149375":2,"149397":4,"15":[1,3],"153":2,"1536":3,"153853":2,"154":2,"159518":3,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"1664":3,"16777216":1,"17":3,"170242":3,"171410":2,"172588":3,"17879":4,"1792":3,"179533":2,"18":3,"181817":2,"181847":3,"1823":2,"186":2,"189387":2,"19":[1,3],"190482":1,"192":1,"1920":3,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"2004":9,"2006":9,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"21":3,"212868":4,"2141":1,"214186":4,"216187":2,"2176":3,"219":1,"22":3,"220":3,"23":3,"2304":3,"233226":3,"24":3,"2432":3,"245":3,"25":[3,48],"256":[1,2,3,10],"2560":3,"258181":3,"26":[2,3,6],"260869":3,"262144":1,"264875":2,"2656":3,"2688":3,"27":3,"273764":3,"28":[1,3],"2812":3,"2816":3,"2891":3,"29":3,"293429":4,"2944":3,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"305746":3,"305878":3,"3072":3,"3076":1,"308841":2,"31":3,"3125":3,"32":[3,10],"3200":3,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"3477":3,"3516":3,"3555":3,"3584":3,"36":3,"362445":1,"367588":3,"3712":3,"3713":1,"371721":4,"372800":3,"373605":3,"38":[1,3,6],"380953":3,"384":[2,3],"3840":3,"384000":3,"39":3,"3906":3,"393507":3,"3968":3,"3984":3,"3986":4,"399583":2,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"4023":3,"403344":4,"403347":4,"406":2,"4062":3,"408716":4,"4096":[1,2,3],"412":2,"416":2,"4194304":1,"42142":4,"428568":1,"428801":3,"428970":3,"429770":1,"431969":4,"44":3,"446623":3,"448255":1,"4492":3,"451158":3,"4531":3,"46":3,"4609":3,"464755":3,"4688":3,"47":[1,6],"472":1,"480463":3,"485074":3,"49":3,"4940":1,"4m":2,"4x":2,"5":[1,3,4,9,48],"5000":3,"500614":3,"51":3,"512":[2,3,4],"512437":3,"52":3,"521127":3,"524288":1,"526831":3,"528704":3,"53":[3,6],"530615":3,"5312":3,"54":3,"541":4,"546":2,"546756":2,"552120":3,"56":3,"563555":3,"566038":2,"566925":3,"568431":4,"585":2,"5859":3,"586858":4,"588687":3,"5898":3,"5mn":2,"6":[0,1,3],"600000":1,"600004":2,"606":2,"608294":3,"6094":3,"614":1,"615390":1,"617":[1,6],"62":3,"630":2,"632545":3,"64":[1,3],"640":[2,3],"643199":3,"649182":3,"65536":1,"656000":3,"656085":3,"656574":1,"66":3,"661869":3,"661970":2,"664":2,"666684":2,"666687":2,"67":3,"67086":4,"67108864":1,"6724":1,"676257":3,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"70":3,"702":2,"7031":3,"7070":3,"707223":3,"707878":4,"71":3,"710812":3,"718":[3,6],"719258":4,"72":3,"722":1,"73":3,"737435":1,"74":3,"741443":3,"743443":4,"7500":3,"754644":2,"76":[1,3],"768":[2,3],"768000":3,"77":3,"773130":3,"776923":2,"78":3,"780":1,"781":2,"784108":3,"788":[2,6],"79":3,"790921":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":48,"800002":1,"806694":4,"807":2,"809":2,"809875":3,"81":3,"810":2,"811163":1,"812":1,"814814":2,"8192":1,"82":3,"823517":1,"83":3,"833":1,"838026":4,"8388608":1,"839992":2,"84":3,"842":1,"84284":4,"843":1,"847":1,"848":1,"849":1,"849303":3,"85":3,"850":1,"86":3,"863938":4,"87":3,"8828":3,"8867":3,"888887":3,"89":3,"8906":3,"8945":3,"896":3,"899428":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"908442":3,"91":3,"911637":3,"9219":3,"922689":3,"924073":3,"929456":3,"93":[2,3],"932191":3,"9375":3,"94":2,"9492":3,"95":[2,3],"952835":4,"9531":3,"959706":3,"96":2,"967162":3,"9688":3,"97":2,"9733":1,"978909":3,"98":2,"9805":3,"983276":3,"98432":1,"9844":3,"999995":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,13,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:9,error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47,48],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,kwarg:13,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,pgm:1,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:[0,2],ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],record_clock:48,rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[0,2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4,46],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],txt:0,type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file