diff --git a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
index 8c90a0be8..91722fcae 100644
--- a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
+++ b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
@@ -76,7 +76,7 @@
   },
   "outputs": [],
   "source": [
-    "def next_power_of_2(n):\n    n -= 1\n    n |= n >> 1\n    n |= n >> 2\n    n |= n >> 4\n    n |= n >> 8\n    n |= n >> 16\n    n += 1\n    return n\n\n\ndef softmax(x):\n    M, N = x.shape\n    # The block size is the smallest power of two greater than the number of columns in `x`\n    BLOCK = next_power_of_2(N)\n    # Another trick we can use is to ask the compiler to parallelize each\n    # row-normalization more aggressively -- i.e., with more warps -- vectors\n    # that are longer\n    # You will see in the next tutorial how to auto-tune this value in a more natural\n    # way so you don't have to come up with manual heuristics yourself\n    num_warps = 4\n    if BLOCK >= 2048: num_warps = 8\n    if BLOCK >= 4096: num_warps = 16\n    # Allocate output\n    y = torch.empty_like(x)\n    # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix\n    _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)\n    return y"
+    "def next_power_of_2(n):\n    n -= 1\n    n |= n >> 1\n    n |= n >> 2\n    n |= n >> 4\n    n |= n >> 8\n    n |= n >> 16\n    n += 1\n    return n\n\n\ndef softmax(x):\n    M, N = x.shape\n    # The block size is the smallest power of two greater than the number of columns in `x`\n    BLOCK = next_power_of_2(N)\n    # Another trick we can use is to ask the compiler to parallelize each\n    # row-normalization more aggressively -- i.e., with more warps -- vectors\n    # that are longer\n    # You will see in the next tutorial how to auto-tune this value in a more natural\n    # way so you don't have to come up with manual heuristics yourself\n    num_warps = 4\n    if BLOCK >= 2048: num_warps = 8\n    if BLOCK >= 4096: num_warps = 16\n    # Allocate output\n    y = torch.empty_like(x)\n    # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix\n    _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)\n    return y"
   ]
  },
  {
diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index f46a615e7..164c82123 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index e6a7b00f2..97b5a7ca0 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
index 3c5d674c2..f9b1b5103 100644
--- a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
+++ b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
@@ -100,7 +100,7 @@ def softmax(x):
     # Allocate output
     y = torch.empty_like(x)
     # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix
-    _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)
+    _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)
     return y
 
 
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index cdb766d83..05123e4a0 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index f567402c9..2bbadbd65 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index 9889c70ef..f61347b43 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index 175214ab1..fa278f626 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index 80cc917e5..32cef6cd0 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index b1f955701..3514aa13b 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index 654b91a35..20f67f6ef 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -212,7 +212,7 @@ We can now run the decorated function above. Pass `show_plots=True` to see the p
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes 5.812 seconds)
+   **Total running time of the script:** ( 0 minutes 7.044 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index b6d8d0bf2..adba5d155 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -151,7 +151,7 @@ We can create a helper function that enqueues the kernel and its (meta-)argument
         # Allocate output
         y = torch.empty_like(x)
         # Enqueue kernel. The launch grid is simple: we have one kernel instance per row of the input matrix
-        _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, BLOCK=BLOCK)
+        _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N, num_warps=num_warps, BLOCK=BLOCK)
         return y
 
 
@@ -264,7 +264,7 @@ In the above plot, we can see that:
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes 20.767 seconds)
+   **Total running time of the script:** ( 0 minutes 20.176 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
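The functional change in the hunks above is the same one repeated for the notebook and for the plain .py download: the wrapper computes a num_warps heuristic but previously never forwarded it to the launch, so every row was normalized with the default of 4 warps regardless of BLOCK. A minimal sketch of the corrected helper, assuming the _softmax Triton kernel, torch, and next_power_of_2 defined in the surrounding tutorial:

    def softmax(x):
        M, N = x.shape
        # Block size: the smallest power of two that covers a full row (>= N)
        BLOCK = next_power_of_2(N)
        # Heuristic: give longer rows more warps per program instance
        num_warps = 4
        if BLOCK >= 2048:
            num_warps = 8
        if BLOCK >= 4096:
            num_warps = 16
        y = torch.empty_like(x)
        # One kernel instance per row; num_warps is now actually passed to the launch
        _softmax[(M, )](y, x, x.stride(0), y.stride(0), M, N,
                        num_warps=num_warps, BLOCK=BLOCK)
        return y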
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 1d6f977e6..2a476a4bd 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -287,32 +287,32 @@ We can test our custom matrix multiplication operation against a native torch im
 
 .. code-block:: none
 
-    tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-             -3.4237e-04, -0.0000e+00],
-            [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-             -0.0000e+00, -0.0000e+00],
-            [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-              1.6578e+01,  2.1031e+01],
+    tensor([[-5.9605e-08,  5.1094e+01, -1.8477e-05,  ...,  2.6547e+01,
+             -7.2598e-05, -4.2510e-04],
+            [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+             -1.8644e-04,  1.3094e+01],
+            [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+              4.3375e+01,  1.6865e+00],
             ...,
-            [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-              3.5469e+01,  1.5602e+01],
-            [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-              1.7412e+00,  1.1422e+01],
-            [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1248e-03,
-             -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
-    tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-             -3.4261e-04, -0.0000e+00],
-            [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-             -0.0000e+00, -0.0000e+00],
-            [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-              1.6578e+01,  2.1031e+01],
+            [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+              4.1133e+00, -0.0000e+00],
+            [ 1.6562e+01, -8.1539e-04,  1.3836e+01,  ...,  1.9844e+00,
+             -1.1238e-02,  8.4375e+00],
+            [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6907e-02,
+             -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
+    tensor([[-5.9605e-08,  5.1094e+01, -1.8537e-05,  ...,  2.6547e+01,
+             -7.2658e-05, -4.2605e-04],
+            [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+             -1.8632e-04,  1.3094e+01],
+            [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+              4.3375e+01,  1.6875e+00],
             ...,
-            [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-              3.5469e+01,  1.5602e+01],
-            [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-              1.7412e+00,  1.1422e+01],
-            [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1324e-03,
-             -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
+            [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+              4.1133e+00, -0.0000e+00],
+            [ 1.6562e+01, -8.1778e-04,  1.3836e+01,  ...,  1.9844e+00,
+             -1.1238e-02,  8.4375e+00],
+            [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6891e-02,
+             -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
     tensor(True, device='cuda:0')
 
 
@@ -373,36 +373,36 @@ We can now compare the performance of our kernel against CUTLASS. Here we focus
             M      cuBLAS      Triton
     0    512.0   20.164923   15.420235
-    1    768.0   58.982401   42.130286
+    1    768.0   58.982401   40.215272
     2   1024.0   91.180520   72.315584
     3   1280.0  157.538463  117.028568
-    4   1536.0  150.593357  147.455995
-    5   1792.0  212.064605  193.783168
-    6   2048.0  197.379013  151.146088
-    7   2304.0  243.753804  179.608068
-    8   2560.0  237.449270  217.006622
-    9   2816.0  233.231062  200.987140
+    4   1536.0  153.867127  144.446699
+    5   1792.0  208.137481  190.498706
+    6   2048.0  199.728763  152.520144
+    7   2304.0  246.266731  178.267699
+    8   2560.0  235.741014  215.578957
+    9   2816.0  231.990461  198.246398
     10  3072.0  236.916752  221.184001
-    11  3328.0  234.499328  210.500857
+    11  3328.0  239.173747  210.500857
     12  3584.0  248.385067  230.552287
-    13  3840.0  252.493157  223.418188
-    14  4096.0  263.689066  244.922869
-    15  4352.0  247.295210  231.639115
-    16  4608.0  274.573240  254.803966
-    17  4864.0  266.298229  245.366501
-    18  5120.0  259.548513  238.312729
-    19  5376.0  252.676487  237.081606
-    20  5632.0  270.685535  249.046163
-    21  5888.0  264.382140  242.069377
-    22  6144.0  262.447761  240.565495
-    23  6400.0  257.028108  235.078047
-    24  6656.0  254.386204  232.699140
-    25  6912.0  252.040861  232.926171
-    26  7168.0  253.193644  231.815375
-    27  7424.0  251.789150  232.860938
-    28  7680.0  250.988932  231.727608
-    29  7936.0  253.622108  232.094986
-    30  8192.0  253.121589  231.859598
+    13  3840.0  251.917998  222.519114
+    14  4096.0  263.172024  244.032234
+    15  4352.0  249.595626  232.307632
+    16  4608.0  276.560014  254.803966
+    17  4864.0  266.614125  245.366501
+    18  5120.0  257.003930  238.096276
+    19  5376.0  252.676487  236.527241
+    20  5632.0  270.057027  248.514009
+    21  5888.0  264.206935  242.511113
+    22  6144.0  259.441481  241.205983
+    23  6400.0  257.157204  235.078047
+    24  6656.0  254.161678  232.699140
+    25  6912.0  251.844029  233.178785
+    26  7168.0  253.282797  231.740709
+    27  7424.0  251.868505  230.377264
+    28  7680.0  250.988932  231.606284
+    29  7936.0  253.293068  229.692102
+    30  8192.0  253.002304  231.360005
 
 
@@ -410,7 +410,7 @@ We can now compare the performance of our kernel against CUTLASS. Here we focus
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes 36.230 seconds)
+   **Total running time of the script:** ( 0 minutes 32.933 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index aef0a0aff..85f0bd13f 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,12 +5,12 @@
 Computation times
 =================
 
-**00:36.230** total execution time for **getting-started_tutorials** files:
+**01:00.154** total execution time for **getting-started_tutorials** files:
 
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 00:36.230 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 00:32.933 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                        | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                  | 00:20.176 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                  | 00:00.000 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                        | 00:07.044 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 0fd411698..dd5e6f78f 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -295,7 +295,7 @@ for different problem sizes.

 01 vector add
-Total running time of the script: ( 0 minutes 5.812 seconds)
+Total running time of the script: ( 0 minutes 7.044 seconds)
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
@@ -343,7 +343,7 @@ This means that – when temporary data is too large to fit entirely in the GPU
 Note that our Triton kernel is not only faster than PyTorch’s CUDA kernel, it is also easier to read, understand and maintain.
-Total running time of the script: ( 0 minutes 20.767 seconds)
+Total running time of the script: ( 0 minutes 20.176 seconds)
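The hunk above is about why the fused kernel wins: each unfused PyTorch op in a row softmax launches its own kernel and round-trips its intermediate result through DRAM, so once a row no longer fits in SRAM the memory traffic dominates. A hedged sketch of that kind of unfused baseline (illustrative only; the function name and the traffic estimates in the comments are not taken from this diff):

    import torch

    def naive_softmax(x):
        # x: (M, N) on the GPU. Every op below is a separate CUDA kernel that
        # reads its input from and writes its output to global memory.
        x_max = x.max(dim=1)[0]                   # read M*N, write M
        z = x - x_max[:, None]                    # read M*N + M, write M*N
        numerator = torch.exp(z)                  # read M*N, write M*N
        denominator = numerator.sum(dim=1)        # read M*N, write M
        return numerator / denominator[:, None]   # read M*N + M, write M*N

A fused kernel instead reads each row once, keeps the max/exp/sum intermediates on chip, and writes each row once.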
diff --git a/getting-started/tutorials/03-matrix-multiplication.html b/getting-started/tutorials/03-matrix-multiplication.html
 Out:
-tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-         -3.4237e-04, -0.0000e+00],
-        [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-         -0.0000e+00, -0.0000e+00],
-        [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-          1.6578e+01,  2.1031e+01],
+tensor([[-5.9605e-08,  5.1094e+01, -1.8477e-05,  ...,  2.6547e+01,
+         -7.2598e-05, -4.2510e-04],
+        [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+         -1.8644e-04,  1.3094e+01],
+        [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+          4.3375e+01,  1.6865e+00],
         ...,
-        [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-          3.5469e+01,  1.5602e+01],
-        [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-          1.7412e+00,  1.1422e+01],
-        [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1248e-03,
-         -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
-tensor([[-0.0000e+00,  2.9438e+01, -1.3113e-06,  ...,  9.7266e+00,
-         -3.4261e-04, -0.0000e+00],
-        [-1.7615e-01, -0.0000e+00,  6.1914e+00,  ...,  3.7562e+01,
-         -0.0000e+00, -0.0000e+00],
-        [ 9.9531e+00,  1.9078e+01, -0.0000e+00,  ...,  3.6934e+00,
-          1.6578e+01,  2.1031e+01],
+        [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+          4.1133e+00, -0.0000e+00],
+        [ 1.6562e+01, -8.1539e-04,  1.3836e+01,  ...,  1.9844e+00,
+         -1.1238e-02,  8.4375e+00],
+        [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6907e-02,
+         -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
+tensor([[-5.9605e-08,  5.1094e+01, -1.8537e-05,  ...,  2.6547e+01,
+         -7.2658e-05, -4.2605e-04],
+        [-2.7100e-01, -3.0220e-05,  5.9414e+00,  ...,  2.8340e+00,
+         -1.8632e-04,  1.3094e+01],
+        [-1.5332e-01,  4.8125e+00,  8.4277e-01,  ...,  3.6387e+00,
+          4.3375e+01,  1.6875e+00],
         ...,
-        [ 2.6547e+01, -1.1802e-05,  7.7852e+00,  ...,  5.2156e+01,
-          3.5469e+01,  1.5602e+01],
-        [-0.0000e+00, -0.0000e+00,  1.6531e+01,  ...,  2.1211e+00,
-          1.7412e+00,  1.1422e+01],
-        [-2.6550e-02, -1.1325e-05,  3.0344e+01,  ..., -9.1324e-03,
-         -1.5199e-05,  3.8164e+00]], device='cuda:0', dtype=torch.float16)
+        [-0.0000e+00,  2.9453e+01, -4.7684e-07,  ...,  6.2617e+00,
+          4.1133e+00, -0.0000e+00],
+        [ 1.6562e+01, -8.1778e-04,  1.3836e+01,  ...,  1.9844e+00,
+         -1.1238e-02,  8.4375e+00],
+        [-1.0876e-01, -2.7295e-01,  3.2156e+01,  ..., -1.6891e-02,
+         -0.0000e+00, -0.0000e+00]], device='cuda:0', dtype=torch.float16)
 tensor(True, device='cuda:0')
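The tensor(True, device='cuda:0') line closing this hunk is, per the surrounding tutorial text, the result of comparing the custom kernel's fp16 output elementwise against a native torch matmul. A hedged sketch of what such a check can look like; torch.matmul stands in here for the tutorial's custom op, and the tolerances are illustrative:

    import torch

    torch.manual_seed(0)
    a = torch.randn(512, 512, device='cuda', dtype=torch.float16)
    b = torch.randn(512, 512, device='cuda', dtype=torch.float16)

    c_test = torch.matmul(a, b)                         # stand-in for the custom kernel's output
    c_ref = torch.matmul(a.float(), b.float()).half()   # reference accumulated in fp32, cast back

    print(c_test)
    print(c_ref)
    # .all() keeps the result on the GPU, which is why the docs print `tensor(True, device='cuda:0')`
    print(torch.isclose(c_test, c_ref, rtol=1e-2, atol=1e-2).all())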
 
@@ -472,39 +472,39 @@
 Out:
         M      cuBLAS      Triton
 0    512.0   20.164923   15.420235
-1    768.0   58.982401   42.130286
+1    768.0   58.982401   40.215272
 2   1024.0   91.180520   72.315584
 3   1280.0  157.538463  117.028568
-4   1536.0  150.593357  147.455995
-5   1792.0  212.064605  193.783168
-6   2048.0  197.379013  151.146088
-7   2304.0  243.753804  179.608068
-8   2560.0  237.449270  217.006622
-9   2816.0  233.231062  200.987140
+4   1536.0  153.867127  144.446699
+5   1792.0  208.137481  190.498706
+6   2048.0  199.728763  152.520144
+7   2304.0  246.266731  178.267699
+8   2560.0  235.741014  215.578957
+9   2816.0  231.990461  198.246398
 10  3072.0  236.916752  221.184001
-11  3328.0  234.499328  210.500857
+11  3328.0  239.173747  210.500857
 12  3584.0  248.385067  230.552287
-13  3840.0  252.493157  223.418188
-14  4096.0  263.689066  244.922869
-15  4352.0  247.295210  231.639115
-16  4608.0  274.573240  254.803966
-17  4864.0  266.298229  245.366501
-18  5120.0  259.548513  238.312729
-19  5376.0  252.676487  237.081606
-20  5632.0  270.685535  249.046163
-21  5888.0  264.382140  242.069377
-22  6144.0  262.447761  240.565495
-23  6400.0  257.028108  235.078047
-24  6656.0  254.386204  232.699140
-25  6912.0  252.040861  232.926171
-26  7168.0  253.193644  231.815375
-27  7424.0  251.789150  232.860938
-28  7680.0  250.988932  231.727608
-29  7936.0  253.622108  232.094986
-30  8192.0  253.121589  231.859598
+13  3840.0  251.917998  222.519114
+14  4096.0  263.172024  244.032234
+15  4352.0  249.595626  232.307632
+16  4608.0  276.560014  254.803966
+17  4864.0  266.614125  245.366501
+18  5120.0  257.003930  238.096276
+19  5376.0  252.676487  236.527241
+20  5632.0  270.057027  248.514009
+21  5888.0  264.206935  242.511113
+22  6144.0  259.441481  241.205983
+23  6400.0  257.157204  235.078047
+24  6656.0  254.161678  232.699140
+25  6912.0  251.844029  233.178785
+26  7168.0  253.282797  231.740709
+27  7424.0  251.868505  230.377264
+28  7680.0  250.988932  231.606284
+29  7936.0  253.293068  229.692102
+30  8192.0  253.002304  231.360005
 
-Total running time of the script: ( 0 minutes 36.230 seconds)
+Total running time of the script: ( 0 minutes 32.933 seconds)
[The remainder of the diff edits an API-reference HTML page whose markup did not survive extraction. The recoverable hunk headers (@@ -288,9 +224,6 @@ through @@ -413,7 +301,7 @@) show lines being deleted around the summary-table entries for dot, load, store, atomic_cas, atomic_xchg, where, exp, log, sigmoid, softmax, max, min, sum, minimum, maximum and multiple_of; the surviving fragments are the entry names and a few one-line descriptions such as "Returns the matrix product of two blocks.", "Return a block of data whose values are, elementwise, loaded from memory at location defined by pointer.", "Stores value block of elements in memory, element-wise, at the memory locations specified by pointer." and "Returns a block of elements from either x or y, depending on condition."]