diff --git a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
index 8c90a0be8..91722fcae 100644
--- a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
+++ b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
@@ -76,7 +76,7 @@
},
"outputs": [],
"source": [
- "def next_power_of_2(n):\n n -= 1\n n |= n >> 1\n n |= n >> 2\n n |= n >> 4\n n |= n >> 8\n n |= n >> 16\n n += 1\n return n\n\n\ndef softmax(x):\n M, N = x.shape\n # The block size is the smallest power of two greater than the number of columns in `x`\n BLOCK = next_power_of_2(N)\n # Another trick we can use is to ask the compiler to parallelize each\n # row-normalization more aggressively -- i.e., with more warps -- vectors\n # that are longer\n # You will see in the next tutorial how to auto-tune this value in a more natural\n # way so you don't have to come up with manual heuristics yourself\n num_warps = 4\n if BLOCK >= 2048: num_warps = 8\n if BLOCK >= 4096: num_warps = 16\n # Allocate output\n y = torch.empty_like(x)\n # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix\n _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)\n return y"
+ "def next_power_of_2(n):\n n -= 1\n n |= n >> 1\n n |= n >> 2\n n |= n >> 4\n n |= n >> 8\n n |= n >> 16\n n += 1\n return n\n\n\ndef softmax(x):\n M, N = x.shape\n # The block size is the smallest power of two greater than the number of columns in `x`\n BLOCK = next_power_of_2(N)\n # Another trick we can use is to ask the compiler to parallelize each\n # row-normalization more aggressively -- i.e., with more warps -- vectors\n # that are longer\n # You will see in the next tutorial how to auto-tune this value in a more natural\n # way so you don't have to come up with manual heuristics yourself\n num_warps = 4\n if BLOCK >= 2048: num_warps = 8\n if BLOCK >= 4096: num_warps = 16\n # Allocate output\n y = torch.empty_like(x)\n # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix\n _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)\n return y"
]
},
{
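For reference, the heuristic exercised by the changed cell can be checked in isolation. The sketch below is not part of the diff: `pick_num_warps` is a hypothetical helper name, and the rounding trick and warp thresholds are copied from the notebook cell above so the chosen `BLOCK`/`num_warps` pairs can be inspected without a GPU.

# Standalone sketch (not part of the diff): reproduces the bit-twiddling
# rounding and the warp heuristic from the notebook cell above.

def next_power_of_2(n):
    # Round n up to the nearest power of two (n itself if already one).
    n -= 1
    n |= n >> 1
    n |= n >> 2
    n |= n >> 4
    n |= n >> 8
    n |= n >> 16
    n += 1
    return n

def pick_num_warps(n_cols):
    # Same heuristic as the tutorial: wider rows get more warps.
    block = next_power_of_2(n_cols)
    num_warps = 4
    if block >= 2048:
        num_warps = 8
    if block >= 4096:
        num_warps = 16
    return block, num_warps

for n_cols in (781, 1024, 3000, 8192):
    print(n_cols, pick_num_warps(n_cols))
# e.g. 781 -> (1024, 4), 3000 -> (4096, 16), 8192 -> (8192, 16)

For a row width of 781 columns, for example, the row is padded to a 1024-wide block and handled with the default 4 warps; only rows of 2048 columns or more trigger the wider launches.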
diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index f46a615e7..164c82123 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index e6a7b00f2..97b5a7ca0 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
index 3c5d674c2..f9b1b5103 100644
--- a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
+++ b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
@@ -100,7 +100,7 @@ def softmax(x):
# Allocate output
y = torch.empty_like(x)
# Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix
- _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)
+ _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)
return y
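The substance of this change is the same in the notebook, the .py download, and the .rst source: `num_warps` was already computed by the heuristic above, but it was never forwarded to the launch, so every row ran with Triton's default of 4 warps regardless of width; the patched call passes it alongside `BLOCK` as a launch meta-parameter. A minimal sanity check of the patched wrapper might look like the following sketch, assuming the tutorial's `_softmax` kernel and `softmax` wrapper are already defined and a CUDA device is available.

# Sketch only: assumes the `_softmax` Triton kernel and the softmax()
# wrapper from this tutorial are already defined in the session.
import torch

# Odd shape on purpose: 781 columns exercises the masking and a 1024-wide block
# (num_warps stays at 4); inputs with >= 2048 columns now launch with 8 or 16 warps.
x = torch.randn(1823, 781, device='cuda')
y_triton = softmax(x)
y_torch = torch.softmax(x, dim=1)
print(torch.allclose(y_triton, y_torch))  # should print True if the kernel is correct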
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index cdb766d83..05123e4a0 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index f567402c9..2bbadbd65 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index 9889c70ef..f61347b43 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index 175214ab1..fa278f626 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index 80cc917e5..32cef6cd0 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index b1f955701..3514aa13b 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index 654b91a35..20f67f6ef 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -212,7 +212,7 @@ We can now run the decorated function above. Pass `show_plots=True` to see the p
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 0 minutes 5.812 seconds)
+ **Total running time of the script:** ( 0 minutes 7.044 seconds)
.. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index b6d8d0bf2..adba5d155 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -151,7 +151,7 @@ We can create a helper function that enqueues the kernel and its (meta-)argument
# Allocate output
y = torch.empty_like(x)
# Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix
- _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)
+ _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)
return y
@@ -264,7 +264,7 @@ In the above plot, we can see that:
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 0 minutes 20.767 seconds)
+ **Total running time of the script:** ( 0 minutes 20.176 seconds)
.. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 1d6f977e6..2a476a4bd 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -287,32 +287,32 @@ We can test our custom matrix multiplication operation against a native torch im
.. code-block:: none
- tensor([[-0.0000e+00, 2.9438e+01, -1.3113e-06, ..., 9.7266e+00,
- -3.4237e-04, -0.0000e+00],
- [-1.7615e-01, -0.0000e+00, 6.1914e+00, ..., 3.7562e+01,
- -0.0000e+00, -0.0000e+00],
- [ 9.9531e+00, 1.9078e+01, -0.0000e+00, ..., 3.6934e+00,
- 1.6578e+01, 2.1031e+01],
+ tensor([[-5.9605e-08, 5.1094e+01, -1.8477e-05, ..., 2.6547e+01,
+ -7.2598e-05, -4.2510e-04],
+ [-2.7100e-01, -3.0220e-05, 5.9414e+00, ..., 2.8340e+00,
+ -1.8644e-04, 1.3094e+01],
+ [-1.5332e-01, 4.8125e+00, 8.4277e-01, ..., 3.6387e+00,
+ 4.3375e+01, 1.6865e+00],
...,
- [ 2.6547e+01, -1.1802e-05, 7.7852e+00, ..., 5.2156e+01,
- 3.5469e+01, 1.5602e+01],
- [-0.0000e+00, -0.0000e+00, 1.6531e+01, ..., 2.1211e+00,
- 1.7412e+00, 1.1422e+01],
- [-2.6550e-02, -1.1325e-05, 3.0344e+01, ..., -9.1248e-03,
- -1.5199e-05, 3.8164e+00]], device='cuda:0', dtype=torch.float16)
- tensor([[-0.0000e+00, 2.9438e+01, -1.3113e-06, ..., 9.7266e+00,
- -3.4261e-04, -0.0000e+00],
- [-1.7615e-01, -0.0000e+00, 6.1914e+00, ..., 3.7562e+01,
- -0.0000e+00, -0.0000e+00],
- [ 9.9531e+00, 1.9078e+01, -0.0000e+00, ..., 3.6934e+00,
- 1.6578e+01, 2.1031e+01],
+ [-0.0000e+00, 2.9453e+01, -4.7684e-07, ..., 6.2617e+00,
+ 4.1133e+00, -0.0000e+00],
+ [ 1.6562e+01, -8.1539e-04, 1.3836e+01, ..., 1.9844e+00,
+ -1.1238e-02, 8.4375e+00],
+ [-1.0876e-01, -2.7295e-01, 3.2156e+01, ..., -1.6907e-02,
+ -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
+ tensor([[-5.9605e-08, 5.1094e+01, -1.8537e-05, ..., 2.6547e+01,
+ -7.2658e-05, -4.2605e-04],
+ [-2.7100e-01, -3.0220e-05, 5.9414e+00, ..., 2.8340e+00,
+ -1.8632e-04, 1.3094e+01],
+ [-1.5332e-01, 4.8125e+00, 8.4277e-01, ..., 3.6387e+00,
+ 4.3375e+01, 1.6875e+00],
...,
- [ 2.6547e+01, -1.1802e-05, 7.7852e+00, ..., 5.2156e+01,
- 3.5469e+01, 1.5602e+01],
- [-0.0000e+00, -0.0000e+00, 1.6531e+01, ..., 2.1211e+00,
- 1.7412e+00, 1.1422e+01],
- [-2.6550e-02, -1.1325e-05, 3.0344e+01, ..., -9.1324e-03,
- -1.5199e-05, 3.8164e+00]], device='cuda:0', dtype=torch.float16)
+ [-0.0000e+00, 2.9453e+01, -4.7684e-07, ..., 6.2617e+00,
+ 4.1133e+00, -0.0000e+00],
+ [ 1.6562e+01, -8.1778e-04, 1.3836e+01, ..., 1.9844e+00,
+ -1.1238e-02, 8.4375e+00],
+ [-1.0876e-01, -2.7295e-01, 3.2156e+01, ..., -1.6891e-02,
+ -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
tensor(True, device='cuda:0')
@@ -373,36 +373,36 @@ We can now compare the performance of our kernel against CUTLASS. Here we focus
M cuBLAS Triton
0 512.0 20.164923 15.420235
- 1 768.0 58.982401 42.130286
+ 1 768.0 58.982401 40.215272
2 1024.0 91.180520 72.315584
3 1280.0 157.538463 117.028568
- 4 1536.0 150.593357 147.455995
- 5 1792.0 212.064605 193.783168
- 6 2048.0 197.379013 151.146088
- 7 2304.0 243.753804 179.608068
- 8 2560.0 237.449270 217.006622
- 9 2816.0 233.231062 200.987140
+ 4 1536.0 153.867127 144.446699
+ 5 1792.0 208.137481 190.498706
+ 6 2048.0 199.728763 152.520144
+ 7 2304.0 246.266731 178.267699
+ 8 2560.0 235.741014 215.578957
+ 9 2816.0 231.990461 198.246398
10 3072.0 236.916752 221.184001
- 11 3328.0 234.499328 210.500857
+ 11 3328.0 239.173747 210.500857
12 3584.0 248.385067 230.552287
- 13 3840.0 252.493157 223.418188
- 14 4096.0 263.689066 244.922869
- 15 4352.0 247.295210 231.639115
- 16 4608.0 274.573240 254.803966
- 17 4864.0 266.298229 245.366501
- 18 5120.0 259.548513 238.312729
- 19 5376.0 252.676487 237.081606
- 20 5632.0 270.685535 249.046163
- 21 5888.0 264.382140 242.069377
- 22 6144.0 262.447761 240.565495
- 23 6400.0 257.028108 235.078047
- 24 6656.0 254.386204 232.699140
- 25 6912.0 252.040861 232.926171
- 26 7168.0 253.193644 231.815375
- 27 7424.0 251.789150 232.860938
- 28 7680.0 250.988932 231.727608
- 29 7936.0 253.622108 232.094986
- 30 8192.0 253.121589 231.859598
+ 13 3840.0 251.917998 222.519114
+ 14 4096.0 263.172024 244.032234
+ 15 4352.0 249.595626 232.307632
+ 16 4608.0 276.560014 254.803966
+ 17 4864.0 266.614125 245.366501
+ 18 5120.0 257.003930 238.096276
+ 19 5376.0 252.676487 236.527241
+ 20 5632.0 270.057027 248.514009
+ 21 5888.0 264.206935 242.511113
+ 22 6144.0 259.441481 241.205983
+ 23 6400.0 257.157204 235.078047
+ 24 6656.0 254.161678 232.699140
+ 25 6912.0 251.844029 233.178785
+ 26 7168.0 253.282797 231.740709
+ 27 7424.0 251.868505 230.377264
+ 28 7680.0 250.988932 231.606284
+ 29 7936.0 253.293068 229.692102
+ 30 8192.0 253.002304 231.360005
@@ -410,7 +410,7 @@ We can now compare the performance of our kernel against CUTLASS. Here we focus
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 0 minutes 36.230 seconds)
+ **Total running time of the script:** ( 0 minutes 32.933 seconds)
.. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index aef0a0aff..85f0bd13f 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
Computation times
=================
-**00:36.230** total execution time for **getting-started_tutorials** files:
+**01:00.154** total execution time for **getting-started_tutorials** files:
+---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 00:36.230 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 00:32.933 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 00:20.176 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 00:07.044 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 0fd411698..dd5e6f78f 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -295,7 +295,7 @@ for different problem sizes.
-Total running time of the script: ( 0 minutes 5.812 seconds)
+Total running time of the script: ( 0 minutes 7.044 seconds)
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
--- a/getting-started/tutorials/02-fused-softmax.html
+++ b/getting-started/tutorials/02-fused-softmax.html
@@ -343,7 +343,7 @@ This means that – when temporary data is too large to fit entirely in the GPU
Note that our Triton kernel is not only faster than PyTorch’s CUDA kernel, it is also easier to read, understand and maintain.
-Total running time of the script: ( 0 minutes 20.767 seconds)
+Total running time of the script: ( 0 minutes 20.176 seconds)