[GH-PAGES] Updated website

Author: Philippe Tillet
Date:   2022-04-08 00:44:05 +00:00
Parent: 80b92a0d2d
Commit: 0c570c178d
173 changed files with 401 additions and 386 deletions

View File

@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 456fb2bf2c82c803cfdaa7bcb3e778a9
config: aa051e1ca67e6f2658629dbe2c30cba0
tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.

Binary file not shown.

View File

@@ -24,9 +24,11 @@ def add_kernel(
y_ptr, # *Pointer* to second input vector
output_ptr, # *Pointer* to output vector
n_elements, # Size of the vector
time_start_ptr, time_end_ptr,
BLOCK_SIZE: tl.constexpr, # Number of elements each program should process
# NOTE: `constexpr` so it can be used as a shape value
):
tl.atomic_min(time_start_ptr, tl.clock())
# There are multiple 'program's processing different data. We identify which program
# we are here
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0
@@ -45,6 +47,7 @@ def add_kernel(
output = x + y
# Write x + y back to DRAM
tl.store(output_ptr + offsets, output, mask=mask)
tl.atomic_max(time_end_ptr, tl.clock())
# %%
@@ -53,6 +56,8 @@ def add_kernel(
def add(x: torch.Tensor, y: torch.Tensor):
time_start = torch.zeros(1, dtype=torch.int64, device='cuda')
time_end = torch.zeros(1, dtype=torch.int64, device='cuda')
# We need to preallocate the output
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
@@ -65,7 +70,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
# - each torch.tensor object is implicitly converted into a pointer to its first element.
# - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
# - don't forget to pass meta-parameters as keyword arguments
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
# running asynchronously at this point.
return output
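
A minimal usage sketch (not part of this commit) of the instrumented helper above; reading the clock counters back assumes `add` were further changed to return `time_start` and `time_end`, which the hunk above does not do:

import torch

torch.manual_seed(0)
size = 98432  # arbitrary test size, deliberately not a multiple of BLOCK_SIZE
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
output = add(x, y)                    # enqueues add_kernel asynchronously
torch.cuda.synchronize()              # wait for the kernel (and its atomics) to complete
assert torch.allclose(output, x + y)  # correctness check
# Hypothetical readback, assuming `add` also returned its two timing tensors:
#   cycles = (time_end - time_start).item()  # raw GPU clock ticks, not wall-clock time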

View File

@@ -33,7 +33,7 @@
},
"outputs": [],
"source": [
"import torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n x_ptr, # *Pointer* to first input vector\n y_ptr, # *Pointer* to second input vector\n output_ptr, # *Pointer* to output vector\n n_elements, # Size of the vector\n BLOCK_SIZE: tl.constexpr, # Number of elements each program should process\n # NOTE: `constexpr` so it can be used as a shape value\n):\n # There are multiple 'program's processing different data. We identify which program\n # we are here\n pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0\n # This program will process inputs that are offset from the initial data.\n # for instance, if you had a vector of length 256 and block_size of 64, the programs\n # would each access the elements [0:64, 64:128, 128:192, 192:256].\n # Note that offsets is a list of pointers\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # Create a mask to guard memory operations against out-of-bounds accesses\n mask = offsets < n_elements\n # Load x and y from DRAM, masking out any extra elements in case the input is not a\n # multiple of the block size\n x = tl.load(x_ptr + offsets, mask=mask)\n y = tl.load(y_ptr + offsets, mask=mask)\n output = x + y\n # Write x + y back to DRAM\n tl.store(output_ptr + offsets, output, mask=mask)"
"import torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n x_ptr, # *Pointer* to first input vector\n y_ptr, # *Pointer* to second input vector\n output_ptr, # *Pointer* to output vector\n n_elements, # Size of the vector\n time_start_ptr, time_end_ptr,\n BLOCK_SIZE: tl.constexpr, # Number of elements each program should process\n # NOTE: `constexpr` so it can be used as a shape value\n):\n tl.atomic_min(time_start_ptr, tl.clock())\n # There are multiple 'program's processing different data. We identify which program\n # we are here\n pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0\n # This program will process inputs that are offset from the initial data.\n # for instance, if you had a vector of length 256 and block_size of 64, the programs\n # would each access the elements [0:64, 64:128, 128:192, 192:256].\n # Note that offsets is a list of pointers\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # Create a mask to guard memory operations against out-of-bounds accesses\n mask = offsets < n_elements\n # Load x and y from DRAM, masking out any extra elements in case the input is not a\n # multiple of the block size\n x = tl.load(x_ptr + offsets, mask=mask)\n y = tl.load(y_ptr + offsets, mask=mask)\n output = x + y\n # Write x + y back to DRAM\n tl.store(output_ptr + offsets, output, mask=mask)\n tl.atomic_max(time_end_ptr, tl.clock())"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
"def add(x: torch.Tensor, y: torch.Tensor):\n # We need to preallocate the output\n output = torch.empty_like(x)\n assert x.is_cuda and y.is_cuda and output.is_cuda\n n_elements = output.numel()\n # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n # In this case, we use a 1D grid where the size is the number of blocks\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n # NOTE:\n # - each torch.tensor object is implicitly converted into a pointer to its first element.\n # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n # - don't forget to pass meta-parameters as keywords arguments\n add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)\n # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n # running asynchronously at this point.\n return output"
"def add(x: torch.Tensor, y: torch.Tensor):\n time_start = torch.zeros(1, dtype=torch.int64, device='cuda')\n time_end = torch.zeros(1, dtype=torch.int64, device='cuda')\n # We need to preallocate the output\n output = torch.empty_like(x)\n assert x.is_cuda and y.is_cuda and output.is_cuda\n n_elements = output.numel()\n # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n # In this case, we use a 1D grid where the size is the number of blocks\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n # NOTE:\n # - each torch.tensor object is implicitly converted into a pointer to its first element.\n # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n # - don't forget to pass meta-parameters as keywords arguments\n add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)\n # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n # running asynchronously at this point.\n return output"
]
},
{

Binary image file changed (before: 24 KiB, after: 29 KiB).

Binary image file changed (before: 16 KiB, after: 18 KiB).

Binary image file changed (before: 37 KiB, after: 36 KiB).

Binary image file changed (before: 24 KiB, after: 23 KiB).

Binary image file changed (before: 57 KiB, after: 59 KiB).

Binary image file changed (before: 33 KiB, after: 34 KiB).

Binary image file changed (before: 32 KiB, after: 33 KiB).

Binary image file changed (before: 20 KiB, after: 21 KiB).

View File

@@ -31,7 +31,7 @@ In this tutorial, you will write a simple vector addition using Triton and learn
Compute Kernel
--------------------------
.. GENERATED FROM PYTHON SOURCE LINES 14-50
.. GENERATED FROM PYTHON SOURCE LINES 14-53
.. code-block:: default
@@ -48,9 +48,11 @@ Compute Kernel
y_ptr, # *Pointer* to second input vector
output_ptr, # *Pointer* to output vector
n_elements, # Size of the vector
time_start_ptr, time_end_ptr,
BLOCK_SIZE: tl.constexpr, # Number of elements each program should process
# NOTE: `constexpr` so it can be used as a shape value
):
tl.atomic_min(time_start_ptr, tl.clock())
# There are multiple 'program's processing different data. We identify which program
# we are here
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0
@@ -69,6 +71,7 @@ Compute Kernel
output = x + y
# Write x + y back to DRAM
tl.store(output_ptr + offsets, output, mask=mask)
tl.atomic_max(time_end_ptr, tl.clock())
@@ -78,18 +81,20 @@ Compute Kernel
.. GENERATED FROM PYTHON SOURCE LINES 51-53
.. GENERATED FROM PYTHON SOURCE LINES 54-56
Let's also declare a helper function to (1) allocate the `z` tensor
and (2) enqueue the above kernel with appropriate grid/block sizes.
.. GENERATED FROM PYTHON SOURCE LINES 53-74
.. GENERATED FROM PYTHON SOURCE LINES 56-79
.. code-block:: default
def add(x: torch.Tensor, y: torch.Tensor):
time_start = torch.zeros(1, dtype=torch.int64, device='cuda')
time_end = torch.zeros(1, dtype=torch.int64, device='cuda')
# We need to preallocate the output
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
@@ -102,7 +107,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
# - each torch.tensor object is implicitly converted into a pointer to its first element.
# - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
# - don't forget to pass meta-parameters as keyword arguments
add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
# running asynchronously at this point.
return output
@@ -115,11 +120,11 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
.. GENERATED FROM PYTHON SOURCE LINES 75-76
.. GENERATED FROM PYTHON SOURCE LINES 80-81
We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
.. GENERATED FROM PYTHON SOURCE LINES 76-90
.. GENERATED FROM PYTHON SOURCE LINES 81-95
.. code-block:: default
@@ -154,11 +159,11 @@ We can now use the above function to compute the element-wise sum of two `torch.
.. GENERATED FROM PYTHON SOURCE LINES 91-92
.. GENERATED FROM PYTHON SOURCE LINES 96-97
Seems like we're good to go!
.. GENERATED FROM PYTHON SOURCE LINES 94-99
.. GENERATED FROM PYTHON SOURCE LINES 99-104
Benchmark
-----------
@@ -166,7 +171,7 @@ We can now benchmark our custom op on vectors of increasing sizes to get a sense
To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
for different problem sizes.
.. GENERATED FROM PYTHON SOURCE LINES 99-128
.. GENERATED FROM PYTHON SOURCE LINES 104-133
.. code-block:: default
@@ -206,12 +211,12 @@ for different problem sizes.
.. GENERATED FROM PYTHON SOURCE LINES 129-131
.. GENERATED FROM PYTHON SOURCE LINES 134-136
We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
`save_path='/path/to/results/'` to save them to disk along with the raw CSV data.
.. GENERATED FROM PYTHON SOURCE LINES 131-132
.. GENERATED FROM PYTHON SOURCE LINES 136-137
.. code-block:: default
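
As a hedged aside, the decorated benchmark might be run as below; the name `benchmark` for the `@triton.testing.perf_report`-decorated function is an assumption, since its body is elided from this diff. The keyword arguments are the options named in the paragraph above.

# Assumed name for the decorated function; options are those described above.
benchmark.run(print_data=True, show_plots=True, save_path='/path/to/results/')
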
@@ -232,22 +237,22 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
vector-add-performance:
size Triton Torch
0 4096.0 9.600000 9.600000
1 8192.0 19.200000 19.200000
2 16384.0 38.400001 38.400001
3 32768.0 76.800002 76.800002
4 65536.0 127.999995 127.999995
5 131072.0 219.428568 219.428568
6 262144.0 341.333321 341.333321
7 524288.0 472.615390 472.615390
8 1048576.0 614.400016 614.400016
9 2097152.0 722.823517 722.823517
10 4194304.0 780.190482 780.190482
11 8388608.0 812.429770 812.429770
12 16777216.0 833.084721 833.084721
13 33554432.0 842.004273 843.811163
14 67108864.0 847.448255 848.362445
15 134217728.0 849.737435 850.656574
0 4096.0 4.800000 9.600000
1 8192.0 8.727273 19.200000
2 16384.0 17.454545 38.400001
3 32768.0 38.400001 76.800002
4 65536.0 69.818181 127.999995
5 131072.0 139.636363 219.428568
6 262144.0 219.428568 341.333321
7 524288.0 341.333321 472.615390
8 1048576.0 472.615390 614.400016
9 2097152.0 614.400016 702.171410
10 4194304.0 712.347810 780.190482
11 8388608.0 774.047204 812.429770
12 16777216.0 809.086412 833.084721
13 33554432.0 829.569620 842.004273
14 67108864.0 840.205105 848.362445
15 134217728.0 845.625825 850.656574
@@ -255,7 +260,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 1 minutes 42.600 seconds)
**Total running time of the script:** ( 1 minutes 42.917 seconds)
.. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:

View File

@@ -278,17 +278,17 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
softmax-performance:
N Triton Torch (native) Torch (jit)
0 256.0 512.000001 546.133347 186.181817
1 384.0 614.400016 585.142862 153.600004
2 512.0 655.360017 606.814814 154.566038
3 640.0 706.206879 640.000002 158.759699
4 768.0 722.823517 664.216187 162.754967
0 256.0 512.000001 546.133347 190.511628
1 384.0 585.142862 585.142862 151.703707
2 512.0 655.360017 585.142849 154.566038
3 640.0 682.666684 640.000002 160.000000
4 768.0 722.823517 646.736871 163.839992
.. ... ... ... ...
93 12160.0 814.058574 406.179533 198.834951
94 12288.0 814.111783 415.661740 199.096718
95 12416.0 814.163950 412.149375 198.655991
96 12544.0 814.214963 412.971190 198.913776
97 12672.0 814.265046 411.679167 198.971549
93 12160.0 814.058574 405.755985 198.834951
94 12288.0 814.111783 415.222812 199.197579
95 12416.0 814.163950 412.149375 198.854847
96 12544.0 814.214963 412.971190 199.012395
97 12672.0 814.265046 412.097543 199.167004
[98 rows x 4 columns]
@@ -306,7 +306,7 @@ In the above plot, we can see that:
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 3 minutes 19.345 seconds)
**Total running time of the script:** ( 3 minutes 27.571 seconds)
.. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:

View File

@@ -459,36 +459,36 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
matmul-performance:
M cuBLAS ... Triton Triton (+ LeakyReLU)
0 256.0 2.730667 ... 3.276800 2.978909
1 384.0 7.372800 ... 8.507077 7.899428
2 512.0 14.563555 ... 16.384000 16.384000
1 384.0 7.372800 ... 7.899428 8.507077
2 512.0 14.563555 ... 15.420235 15.420235
3 640.0 22.260869 ... 24.380953 24.380953
4 768.0 32.768000 ... 34.028308 34.028308
5 896.0 39.025776 ... 40.140799 39.025776
6 1024.0 49.932191 ... 52.428801 52.428801
7 1152.0 45.242181 ... 46.656000 46.656000
8 1280.0 51.200001 ... 56.888887 56.888887
9 1408.0 64.138541 ... 67.305878 66.485074
10 1536.0 80.430545 ... 79.526831 78.643199
11 1664.0 62.929456 ... 62.492442 61.636381
12 1792.0 72.512412 ... 72.512412 72.047592
13 1920.0 69.467336 ... 70.172588 69.818184
14 2048.0 73.262953 ... 76.608294 76.608294
15 2176.0 83.500614 ... 85.998493 85.632545
16 2304.0 68.643310 ... 77.057651 76.319081
17 2432.0 71.305746 ... 85.393507 85.134737
18 2560.0 78.019048 ... 80.709358 81.108913
19 2688.0 83.004501 ... 89.464755 89.254248
20 2816.0 80.767055 ... 83.552120 82.602666
21 2944.0 81.298583 ... 82.237674 81.967162
22 3072.0 81.707223 ... 88.473602 87.516392
23 3200.0 84.377059 ... 94.955488 94.674553
24 3328.0 83.034941 ... 84.695641 83.905938
25 3456.0 81.108217 ... 85.133652 81.029251
26 3584.0 86.623693 ... 98.483450 98.160909
27 3712.0 81.682211 ... 88.404730 84.017953
28 3840.0 82.592983 ... 92.006659 85.169042
29 3968.0 90.791620 ... 86.236000 90.522206
30 4096.0 86.369197 ... 85.001726 91.304576
4 768.0 32.768000 ... 35.389441 34.028308
5 896.0 37.971025 ... 40.140799 39.025776
6 1024.0 49.932191 ... 53.773130 53.773130
7 1152.0 45.242181 ... 48.161033 47.396572
8 1280.0 51.200001 ... 57.690139 57.690139
9 1408.0 64.138541 ... 69.009825 67.305878
10 1536.0 79.526831 ... 80.430545 79.526831
11 1664.0 63.372618 ... 63.372618 62.929456
12 1792.0 72.983276 ... 63.142831 63.142831
13 1920.0 69.120002 ... 71.626943 70.892307
14 2048.0 73.584279 ... 78.033565 77.672296
15 2176.0 83.500614 ... 87.115360 86.739860
16 2304.0 68.446623 ... 77.810656 77.307030
17 2432.0 71.125224 ... 75.522751 75.320281
18 2560.0 77.833728 ... 81.715711 81.512437
19 2688.0 83.737433 ... 90.966561 90.532356
20 2816.0 79.443003 ... 82.916747 84.035084
21 2944.0 81.832567 ... 83.198715 82.102191
22 3072.0 81.121923 ... 88.197981 87.381335
23 3200.0 83.116885 ... 96.385543 96.096095
24 3328.0 82.939284 ... 84.895397 85.096096
25 3456.0 77.745004 ... 86.596744 84.332184
26 3584.0 86.540320 ... 91.380335 97.416461
27 3712.0 85.748791 ... 88.404730 86.341700
28 3840.0 84.874902 ... 93.247896 85.300426
29 3968.0 92.372393 ... 80.015697 78.220472
30 4096.0 93.990003 ... 93.206754 86.036145
[31 rows x 5 columns]
@@ -498,7 +498,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 6 minutes 0.645 seconds)
**Total running time of the script:** ( 6 minutes 48.521 seconds)
.. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:

View File

@@ -240,7 +240,7 @@ References
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 0 minutes 0.482 seconds)
**Total running time of the script:** ( 0 minutes 0.327 seconds)
.. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:

View File

@@ -38,36 +38,36 @@ Layer Normalization
layer-norm-backward:
N Triton Torch Apex
0 1024.0 311.088617 98.303995 303.407414
1 1536.0 347.773587 134.050910 341.333333
2 2048.0 420.102553 161.684218 325.509933
3 2560.0 455.111129 181.238943 326.808501
4 3072.0 511.999982 191.999993 317.793096
5 3584.0 547.872604 207.768111 310.527060
6 4096.0 564.965515 219.919464 295.207204
7 4608.0 504.986315 232.825259 291.799469
8 5120.0 527.381977 242.845844 288.450695
9 5632.0 542.843364 243.107920 288.820505
10 6144.0 546.133354 248.661056 286.879370
11 6656.0 532.479975 256.000009 285.767438
12 7168.0 505.976473 260.654538 286.242939
13 7680.0 481.253256 262.190612 278.429013
14 8192.0 463.698115 267.130429 284.939124
15 8704.0 417.791980 267.815384 284.987724
16 9216.0 432.845409 272.394084 288.751954
17 9728.0 439.683593 280.278512 289.667485
18 10240.0 448.467168 286.433562 290.153487
19 10752.0 425.821771 247.172406 290.922209
20 11264.0 427.071098 245.760001 286.676558
21 11776.0 423.724129 249.667843 288.981596
22 12288.0 419.504980 254.673582 294.323369
23 12800.0 413.458944 253.674644 288.180121
24 13312.0 412.242569 252.859526 289.916513
25 13824.0 405.594132 257.190689 292.056329
26 14336.0 394.568805 254.297107 286.719986
27 14848.0 386.498925 257.665934 289.246765
28 15360.0 373.117425 257.970599 286.211174
29 15872.0 371.274849 261.806182 289.899545
0 1024.0 356.173905 99.497980 315.076934
1 1536.0 405.098894 134.050910 344.523365
2 2048.0 486.653476 159.067963 321.254900
3 2560.0 458.507457 182.314537 326.808501
4 3072.0 515.580429 191.501303 319.168834
5 3584.0 551.384634 207.768111 307.199992
6 4096.0 568.231237 220.907859 293.444785
7 4608.0 502.690905 232.336141 290.267724
8 5120.0 527.381977 243.326731 287.102804
9 5632.0 540.671974 244.426754 291.310338
10 6144.0 548.163546 251.202731 288.000001
11 6656.0 532.479975 255.590406 286.279570
12 7168.0 510.480705 253.734520 277.919225
13 7680.0 487.619051 266.743841 284.884090
14 8192.0 468.114289 258.694729 278.481578
15 8704.0 415.300208 267.472468 284.987724
16 9216.0 429.483477 272.394084 290.077383
17 9728.0 438.033784 280.278512 288.950501
18 10240.0 443.610086 287.102804 290.153487
19 10752.0 426.525614 246.699797 290.267711
20 11264.0 427.071098 245.536784 286.069848
21 11776.0 418.702211 249.447482 288.981596
22 12288.0 414.784810 254.673582 294.323369
23 12800.0 410.146863 254.094291 288.180121
24 13312.0 409.599999 252.161013 289.129403
25 13824.0 404.112047 257.190689 291.799461
26 14336.0 395.930964 256.000002 289.129416
27 14848.0 385.662341 257.479779 288.777966
28 15360.0 380.433442 258.332158 286.656296
29 15872.0 372.363640 261.806182 290.562936
@@ -339,7 +339,7 @@ Layer Normalization
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 2 minutes 12.550 seconds)
**Total running time of the script:** ( 2 minutes 14.583 seconds)
.. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py:

View File

@@ -5,16 +5,16 @@
Computation times
=================
**13:15.622** total execution time for **getting-started_tutorials** files:
**14:13.919** total execution time for **getting-started_tutorials** files:
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:00.645 | 0.0 MB |
| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:48.521 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 03:19.345 | 0.0 MB |
| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 03:27.571 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) | 02:12.550 | 0.0 MB |
| :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) | 02:14.583 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 01:42.600 | 0.0 MB |
| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 01:42.917 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) | 00:00.482 | 0.0 MB |
| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) | 00:00.327 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+

View File

@@ -214,9 +214,11 @@ to download the full example code</p>
<span class="n">y_ptr</span><span class="p">,</span> <span class="c1"># *Pointer* to second input vector</span>
<span class="n">output_ptr</span><span class="p">,</span> <span class="c1"># *Pointer* to output vector</span>
<span class="n">n_elements</span><span class="p">,</span> <span class="c1"># Size of the vector</span>
<span class="n">time_start_ptr</span><span class="p">,</span> <span class="n">time_end_ptr</span><span class="p">,</span>
<span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="c1"># Number of elements each program should process</span>
<span class="c1"># NOTE: `constexpr` so it can be used as a shape value</span>
<span class="p">):</span>
<span class="n">tl</span><span class="o">.</span><span class="n">atomic_min</span><span class="p">(</span><span class="n">time_start_ptr</span><span class="p">,</span> <span class="n">tl</span><span class="o">.</span><span class="n">clock</span><span class="p">())</span>
<span class="c1"># There are multiple &#39;program&#39;s processing different data. We identify which program</span>
<span class="c1"># we are here</span>
<span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="c1"># We use a 1D launch grid so axis is 0</span>
@@ -235,11 +237,14 @@ to download the full example code</p>
<span class="n">output</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="n">y</span>
<span class="c1"># Write x + y back to DRAM</span>
<span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">output_ptr</span> <span class="o">+</span> <span class="n">offsets</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
<span class="n">tl</span><span class="o">.</span><span class="n">atomic_max</span><span class="p">(</span><span class="n">time_end_ptr</span><span class="p">,</span> <span class="n">tl</span><span class="o">.</span><span class="n">clock</span><span class="p">())</span>
</pre></div>
</div>
<p>Let's also declare a helper function to (1) allocate the <cite>z</cite> tensor
and (2) enqueue the above kernel with appropriate grid/block sizes.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span>
<span class="n">time_start</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
<span class="n">time_end</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
<span class="c1"># We need to preallocate the output</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">x</span><span class="o">.</span><span class="n">is_cuda</span> <span class="ow">and</span> <span class="n">y</span><span class="o">.</span><span class="n">is_cuda</span> <span class="ow">and</span> <span class="n">output</span><span class="o">.</span><span class="n">is_cuda</span>
@@ -252,7 +257,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.</p>
<span class="c1"># - each torch.tensor object is implicitly converted into a pointer to its first element.</span>
<span class="c1"># - `triton.jit`&#39;ed functions can be index with a launch grid to obtain a callable GPU kernel</span>
<span class="c1"># - don&#39;t forget to pass meta-parameters as keywords arguments</span>
<span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
<span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">time_start</span><span class="p">,</span> <span class="n">time_end</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
<span class="c1"># We return a handle to z but, since `torch.cuda.synchronize()` hasn&#39;t been called, the kernel is still</span>
<span class="c1"># running asynchronously at this point.</span>
<span class="k">return</span> <span class="n">output</span>
@@ -322,25 +327,25 @@ for different problem sizes.</p>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>vector-add-performance:
size Triton Torch
0 4096.0 9.600000 9.600000
1 8192.0 19.200000 19.200000
2 16384.0 38.400001 38.400001
3 32768.0 76.800002 76.800002
4 65536.0 127.999995 127.999995
5 131072.0 219.428568 219.428568
6 262144.0 341.333321 341.333321
7 524288.0 472.615390 472.615390
8 1048576.0 614.400016 614.400016
9 2097152.0 722.823517 722.823517
10 4194304.0 780.190482 780.190482
11 8388608.0 812.429770 812.429770
12 16777216.0 833.084721 833.084721
13 33554432.0 842.004273 843.811163
14 67108864.0 847.448255 848.362445
15 134217728.0 849.737435 850.656574
0 4096.0 4.800000 9.600000
1 8192.0 8.727273 19.200000
2 16384.0 17.454545 38.400001
3 32768.0 38.400001 76.800002
4 65536.0 69.818181 127.999995
5 131072.0 139.636363 219.428568
6 262144.0 219.428568 341.333321
7 524288.0 341.333321 472.615390
8 1048576.0 472.615390 614.400016
9 2097152.0 614.400016 702.171410
10 4194304.0 712.347810 780.190482
11 8388608.0 774.047204 812.429770
12 16777216.0 809.086412 833.084721
13 33554432.0 829.569620 842.004273
14 67108864.0 840.205105 848.362445
15 134217728.0 845.625825 850.656574
</pre></div>
</div>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 42.600 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 42.917 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>

View File

@@ -369,17 +369,17 @@ We will then compare its performance against (1) <code class="code docutils lite
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>softmax-performance:
N Triton Torch (native) Torch (jit)
0 256.0 512.000001 546.133347 186.181817
1 384.0 614.400016 585.142862 153.600004
2 512.0 655.360017 606.814814 154.566038
3 640.0 706.206879 640.000002 158.759699
4 768.0 722.823517 664.216187 162.754967
0 256.0 512.000001 546.133347 190.511628
1 384.0 585.142862 585.142862 151.703707
2 512.0 655.360017 585.142849 154.566038
3 640.0 682.666684 640.000002 160.000000
4 768.0 722.823517 646.736871 163.839992
.. ... ... ... ...
93 12160.0 814.058574 406.179533 198.834951
94 12288.0 814.111783 415.661740 199.096718
95 12416.0 814.163950 412.149375 198.655991
96 12544.0 814.214963 412.971190 198.913776
97 12672.0 814.265046 411.679167 198.971549
93 12160.0 814.058574 405.755985 198.834951
94 12288.0 814.111783 415.222812 199.197579
95 12416.0 814.163950 412.149375 198.854847
96 12544.0 814.214963 412.971190 199.012395
97 12672.0 814.265046 412.097543 199.167004
[98 rows x 4 columns]
</pre></div>
@@ -392,7 +392,7 @@ We will then compare its performance against (1) <code class="code docutils lite
Note however that the PyTorch <cite>softmax</cite> operation is more general and will work on tensors of any shape.</p></li>
</ul>
</div></blockquote>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 19.345 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 27.571 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>

View File

@@ -565,41 +565,41 @@ torch_output=tensor([[ 1.1045, -36.9688, 31.4688, ..., -11.3906, 24.4531, -3
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
M cuBLAS ... Triton Triton (+ LeakyReLU)
0 256.0 2.730667 ... 3.276800 2.978909
1 384.0 7.372800 ... 8.507077 7.899428
2 512.0 14.563555 ... 16.384000 16.384000
1 384.0 7.372800 ... 7.899428 8.507077
2 512.0 14.563555 ... 15.420235 15.420235
3 640.0 22.260869 ... 24.380953 24.380953
4 768.0 32.768000 ... 34.028308 34.028308
5 896.0 39.025776 ... 40.140799 39.025776
6 1024.0 49.932191 ... 52.428801 52.428801
7 1152.0 45.242181 ... 46.656000 46.656000
8 1280.0 51.200001 ... 56.888887 56.888887
9 1408.0 64.138541 ... 67.305878 66.485074
10 1536.0 80.430545 ... 79.526831 78.643199
11 1664.0 62.929456 ... 62.492442 61.636381
12 1792.0 72.512412 ... 72.512412 72.047592
13 1920.0 69.467336 ... 70.172588 69.818184
14 2048.0 73.262953 ... 76.608294 76.608294
15 2176.0 83.500614 ... 85.998493 85.632545
16 2304.0 68.643310 ... 77.057651 76.319081
17 2432.0 71.305746 ... 85.393507 85.134737
18 2560.0 78.019048 ... 80.709358 81.108913
19 2688.0 83.004501 ... 89.464755 89.254248
20 2816.0 80.767055 ... 83.552120 82.602666
21 2944.0 81.298583 ... 82.237674 81.967162
22 3072.0 81.707223 ... 88.473602 87.516392
23 3200.0 84.377059 ... 94.955488 94.674553
24 3328.0 83.034941 ... 84.695641 83.905938
25 3456.0 81.108217 ... 85.133652 81.029251
26 3584.0 86.623693 ... 98.483450 98.160909
27 3712.0 81.682211 ... 88.404730 84.017953
28 3840.0 82.592983 ... 92.006659 85.169042
29 3968.0 90.791620 ... 86.236000 90.522206
30 4096.0 86.369197 ... 85.001726 91.304576
4 768.0 32.768000 ... 35.389441 34.028308
5 896.0 37.971025 ... 40.140799 39.025776
6 1024.0 49.932191 ... 53.773130 53.773130
7 1152.0 45.242181 ... 48.161033 47.396572
8 1280.0 51.200001 ... 57.690139 57.690139
9 1408.0 64.138541 ... 69.009825 67.305878
10 1536.0 79.526831 ... 80.430545 79.526831
11 1664.0 63.372618 ... 63.372618 62.929456
12 1792.0 72.983276 ... 63.142831 63.142831
13 1920.0 69.120002 ... 71.626943 70.892307
14 2048.0 73.584279 ... 78.033565 77.672296
15 2176.0 83.500614 ... 87.115360 86.739860
16 2304.0 68.446623 ... 77.810656 77.307030
17 2432.0 71.125224 ... 75.522751 75.320281
18 2560.0 77.833728 ... 81.715711 81.512437
19 2688.0 83.737433 ... 90.966561 90.532356
20 2816.0 79.443003 ... 82.916747 84.035084
21 2944.0 81.832567 ... 83.198715 82.102191
22 3072.0 81.121923 ... 88.197981 87.381335
23 3200.0 83.116885 ... 96.385543 96.096095
24 3328.0 82.939284 ... 84.895397 85.096096
25 3456.0 77.745004 ... 86.596744 84.332184
26 3584.0 86.540320 ... 91.380335 97.416461
27 3712.0 85.748791 ... 88.404730 86.341700
28 3840.0 84.874902 ... 93.247896 85.300426
29 3968.0 92.372393 ... 80.015697 78.220472
30 4096.0 93.990003 ... 93.206754 86.036145
[31 rows x 5 columns]
</pre></div>
</div>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes 0.645 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes 48.521 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>

View File

@@ -372,7 +372,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
<dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
</dd>
</dl>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 0.482 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 0.327 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>

View File

@@ -194,36 +194,36 @@ to download the full example code</p>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>layer-norm-backward:
N Triton Torch Apex
0 1024.0 311.088617 98.303995 303.407414
1 1536.0 347.773587 134.050910 341.333333
2 2048.0 420.102553 161.684218 325.509933
3 2560.0 455.111129 181.238943 326.808501
4 3072.0 511.999982 191.999993 317.793096
5 3584.0 547.872604 207.768111 310.527060
6 4096.0 564.965515 219.919464 295.207204
7 4608.0 504.986315 232.825259 291.799469
8 5120.0 527.381977 242.845844 288.450695
9 5632.0 542.843364 243.107920 288.820505
10 6144.0 546.133354 248.661056 286.879370
11 6656.0 532.479975 256.000009 285.767438
12 7168.0 505.976473 260.654538 286.242939
13 7680.0 481.253256 262.190612 278.429013
14 8192.0 463.698115 267.130429 284.939124
15 8704.0 417.791980 267.815384 284.987724
16 9216.0 432.845409 272.394084 288.751954
17 9728.0 439.683593 280.278512 289.667485
18 10240.0 448.467168 286.433562 290.153487
19 10752.0 425.821771 247.172406 290.922209
20 11264.0 427.071098 245.760001 286.676558
21 11776.0 423.724129 249.667843 288.981596
22 12288.0 419.504980 254.673582 294.323369
23 12800.0 413.458944 253.674644 288.180121
24 13312.0 412.242569 252.859526 289.916513
25 13824.0 405.594132 257.190689 292.056329
26 14336.0 394.568805 254.297107 286.719986
27 14848.0 386.498925 257.665934 289.246765
28 15360.0 373.117425 257.970599 286.211174
29 15872.0 371.274849 261.806182 289.899545
0 1024.0 356.173905 99.497980 315.076934
1 1536.0 405.098894 134.050910 344.523365
2 2048.0 486.653476 159.067963 321.254900
3 2560.0 458.507457 182.314537 326.808501
4 3072.0 515.580429 191.501303 319.168834
5 3584.0 551.384634 207.768111 307.199992
6 4096.0 568.231237 220.907859 293.444785
7 4608.0 502.690905 232.336141 290.267724
8 5120.0 527.381977 243.326731 287.102804
9 5632.0 540.671974 244.426754 291.310338
10 6144.0 548.163546 251.202731 288.000001
11 6656.0 532.479975 255.590406 286.279570
12 7168.0 510.480705 253.734520 277.919225
13 7680.0 487.619051 266.743841 284.884090
14 8192.0 468.114289 258.694729 278.481578
15 8704.0 415.300208 267.472468 284.987724
16 9216.0 429.483477 272.394084 290.077383
17 9728.0 438.033784 280.278512 288.950501
18 10240.0 443.610086 287.102804 290.153487
19 10752.0 426.525614 246.699797 290.267711
20 11264.0 427.071098 245.536784 286.069848
21 11776.0 418.702211 249.447482 288.981596
22 12288.0 414.784810 254.673582 294.323369
23 12800.0 410.146863 254.094291 288.180121
24 13312.0 409.599999 252.161013 289.129403
25 13824.0 404.112047 257.190689 291.799461
26 14336.0 395.930964 256.000002 289.129416
27 14848.0 385.662341 257.479779 288.777966
28 15360.0 380.433442 258.332158 286.656296
29 15872.0 372.363640 261.806182 290.562936
</pre></div>
</div>
<div class="line-block">
@@ -487,7 +487,7 @@ to download the full example code</p>
<span class="n">bench_layer_norm</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">save_path</span><span class="o">=</span><span class="s1">&#39;.&#39;</span><span class="p">,</span> <span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</pre></div>
</div>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 12.550 seconds)</p>
<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 14.583 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-05-layer-norm-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">05-layer-norm.py</span></code></a></p>

View File

@@ -174,7 +174,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline"></a></h1>
<p><strong>13:15.622</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
<p><strong>14:13.919</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -183,23 +183,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
<td><p>06:00.645</p></td>
<td><p>06:48.521</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
<td><p>03:19.345</p></td>
<td><p>03:27.571</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py"><span class="std std-ref">Layer Normalization</span></a> (<code class="docutils literal notranslate"><span class="pre">05-layer-norm.py</span></code>)</p></td>
<td><p>02:12.550</p></td>
<td><p>02:14.583</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
<td><p>01:42.600</p></td>
<td><p>01:42.917</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
<td><p>00:00.482</p></td>
<td><p>00:00.327</p></td>
<td><p>0.0 MB</p></td>
</tr>
</tbody>

View File

@@ -200,11 +200,11 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.broadcast_to">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">broadcast_to</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.broadcast_to" title="Permalink to this definition"></a></dt>
<dd><p>Tries to broadcast the given block to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p>
<dd><p>Tries to broadcast the given tensor to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) The input block.</p></li>
<li><p><strong>input</strong> (<em>Block</em>) The input tensor.</p></li>
<li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li>
</ul>
</dd>
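
A short, hedged sketch of `tl.broadcast_to` inside a `@triton.jit` kernel; the kernel name, launch configuration, and output layout are illustrative assumptions, not part of this commit.

import triton
import triton.language as tl

@triton.jit
def tile_of_col_ids(out_ptr, BLOCK: tl.constexpr):
    # Broadcast a 1D range of column indices along a new leading axis
    # to fill a BLOCK x BLOCK tile.
    cols = tl.arange(0, BLOCK)                              # shape [BLOCK]
    tile = tl.broadcast_to(cols[None, :], (BLOCK, BLOCK))   # shape [BLOCK, BLOCK]
    offs = tl.arange(0, BLOCK)[:, None] * BLOCK + cols[None, :]
    tl.store(out_ptr + offs, tile)

# Assumed launch: out = torch.empty((64, 64), dtype=torch.int32, device='cuda')
#                 tile_of_col_ids[(1,)](out, BLOCK=64)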

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The first block to be multiplied.</p></li>
<li><p><strong>other</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The second block to be multiplied.</p></li>
<li><p><strong>input</strong> (2D tensor of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The first tensor to be multiplied.</p></li>
<li><p><strong>other</strong> (2D tensor of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The second tensor to be multiplied.</p></li>
</ul>
</dd>
</dl>
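
A hedged single-tile sketch of `tl.dot` (names and launch parameters are assumptions, not from this commit); both operands are 2D `float16` tiles, matching the parameter description above, and the result accumulates in `float32`.

import triton
import triton.language as tl

@triton.jit
def one_tile_matmul(a_ptr, b_ptr, c_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    idx = offs[:, None] * BLOCK + offs[None, :]
    a = tl.load(a_ptr + idx)   # [BLOCK, BLOCK] float16 tile
    b = tl.load(b_ptr + idx)   # [BLOCK, BLOCK] float16 tile
    c = tl.dot(a, b)           # [BLOCK, BLOCK], accumulated in float32
    tl.store(c_ptr + idx, c)

# Assumed launch: a, b contiguous (BLOCK, BLOCK) float16 CUDA tensors, c float32,
# BLOCK a power of two >= 16:  one_tile_matmul[(1,)](a, b, c, BLOCK=32)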

View File

@@ -201,7 +201,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.load">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cache_modifier</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eviction_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">volatile</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.load" title="Permalink to this definition"></a></dt>
<dd><p>Return a block of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<dd><p>Return a tensor of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">mask</span></code> and <code class="code docutils literal notranslate"><span class="pre">other</span></code> are implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">other</span></code> is implicitly typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p>
<dl class="field-list simple">
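
A hedged sketch of a masked `tl.load` paired with a masked `tl.store`, using only the documented `pointer`, `mask`, and `other` arguments; the kernel name and launch line are illustrative assumptions.

import triton
import triton.language as tl

@triton.jit
def copy_kernel(src_ptr, dst_ptr, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(axis=0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    # Out-of-bounds lanes are masked off and read `other=0.0` instead of memory.
    vals = tl.load(src_ptr + offs, mask=mask, other=0.0)
    tl.store(dst_ptr + offs, vals, mask=mask)

# Assumed launch: copy_kernel[(triton.cdiv(n, 1024),)](src, dst, n, BLOCK=1024)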

View File

@@ -200,7 +200,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.max">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">max</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.max" title="Permalink to this definition"></a></dt>
<dd><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dd><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
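
A hedged reduction sketch for `tl.max(input, axis)`; the kernel name and data layout are assumptions. `tl.min` and `tl.sum` take the same `(input, axis)` arguments.

import triton
import triton.language as tl

@triton.jit
def rowwise_max(x_ptr, out_ptr, N: tl.constexpr):
    # Each program loads one contiguous row of N elements and reduces it to its maximum.
    row = tl.program_id(axis=0)
    vals = tl.load(x_ptr + row * N + tl.arange(0, N))
    tl.store(out_ptr + row, tl.max(vals, axis=0))

# Assumed launch: x is a contiguous (M, N) CUDA tensor, N a power of two,
# out has M elements:  rowwise_max[(M,)](x, out, N=N)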

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) the first input block</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input block</p></li>
<li><p><strong>input</strong> (<em>Block</em>) the first input tensor</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input tensor</p></li>
</ul>
</dd>
</dl>

View File

@@ -200,7 +200,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.min">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">min</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.min" title="Permalink to this definition"></a></dt>
<dd><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dd><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) the first input block</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input block</p></li>
<li><p><strong>input</strong> (<em>Block</em>) the first input tensor</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input tensor</p></li>
</ul>
</dd>
</dl>

View File

@@ -203,7 +203,7 @@
<dd><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>x</strong> (<em>Block</em>) the input block</p>
<dd class="field-odd"><p><strong>x</strong> (<em>Block</em>) the input tensor</p>
</dd>
</dl>
</dd></dl>

View File

@@ -200,11 +200,11 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.reshape">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">reshape</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.reshape" title="Permalink to this definition"></a></dt>
<dd><p>Tries to reshape the given block to a new shape.</p>
<dd><p>Tries to reshape the given tensor to a new shape.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> The input block.</p></li>
<li><p><strong>input</strong> The input tensor.</p></li>
<li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li>
</ul>
</dd>

View File

@@ -203,7 +203,7 @@
<h1>triton.language.softmax<a class="headerlink" href="#triton-language-softmax" title="Permalink to this headline"></a></h1>
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.softmax">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ieee_rounding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition"></a></dt>
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ieee_rounding</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition"></a></dt>
<dd><p>Computes the element-wise softmax of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>

View File

@@ -201,13 +201,13 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.store">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">store</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">value</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.store" title="Permalink to this definition"></a></dt>
<dd><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> block of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<dd><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> tensor of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">value</span></code> is implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code> and typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">
<li><p><strong>pointer</strong> (<em>Block of dtype=triton.PointerDType</em>) – The memory locations where the elements of <code class="code docutils literal notranslate"><span class="pre">value</span></code> are stored.</p></li>
<li><p><strong>value</strong> (<em>Block</em>) – The block of elements to be stored.</p></li>
<li><p><strong>value</strong> (<em>Block</em>) – The tensor of elements to be stored.</p></li>
<li><p><strong>mask</strong> (<em>Block of triton.int1</em><em>, </em><em>optional</em>) – If mask[idx] is false, do not store <code class="code docutils literal notranslate"><span class="pre">value[idx]</span></code> at <code class="code docutils literal notranslate"><span class="pre">pointer[idx]</span></code>.</p></li>
</ul>
</dd>
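
A minimal sketch of the broadcasting behaviour described above: a scalar value is stored through a block of pointers, with a mask guarding the tail. The fill kernel, its name, and the sizes are illustrative, not taken from the documentation.

import torch

import triton
import triton.language as tl


@triton.jit
def fill_kernel(out_ptr, value, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    # The mask keeps the out-of-range lanes of the last program from writing
    mask = offsets < n_elements
    # `value` is a scalar here; per the docstring it is broadcast to the
    # pointer block's shape and cast to its element type before the store
    tl.store(out_ptr + offsets, value, mask=mask)


out = torch.empty(3000, device='cuda')
grid = (triton.cdiv(out.numel(), 1024),)
fill_kernel[grid](out, 42.0, out.numel(), BLOCK_SIZE=1024)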

View File

@@ -200,7 +200,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.sum">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">sum</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.sum" title="Permalink to this definition"></a></dt>
<dd><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dd><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">

View File

@@ -198,7 +198,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.where">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">where</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">condition</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.where" title="Permalink to this definition"></a></dt>
<dd><p>Returns a block of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p>
<dd><p>Returns a tensor of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p>
<p>Note that <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are always evaluated regardless of the value of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p>
<p>If you want to avoid unintended memory operations, use the <code class="code docutils literal notranslate"><span class="pre">mask</span></code> arguments in <cite>triton.load</cite> and <cite>triton.store</cite> instead.</p>
<p>The shapes of <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are both broadcast to the shape of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.
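
A minimal sketch of triton.language.where implementing a ReLU-style selection; as noted above, both branches are evaluated, so memory accesses you want to skip belong behind load/store masks. The kernel and sizes are illustrative.

import torch

import triton
import triton.language as tl


@triton.jit
def relu_kernel(x_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    # tl.where only selects between already-computed values
    y = tl.where(x > 0, x, 0.0)
    tl.store(out_ptr + offsets, y, mask=mask)


x = torch.randn(5000, device='cuda')
out = torch.empty_like(x)
relu_kernel[(triton.cdiv(x.numel(), 1024),)](x, out, x.numel(), BLOCK_SIZE=1024)
print(torch.allclose(out, torch.relu(x)))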

View File

@@ -199,7 +199,7 @@
<dl class="py function">
<dt class="sig sig-object py" id="triton.language.zeros">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">zeros</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">shape</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.zeros" title="Permalink to this definition"></a></dt>
<dd><p>Returns a block filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p>
<dd><p>Returns a tensor filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p>
<dl class="field-list simple">
<dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple">

View File

@@ -278,7 +278,7 @@
<td><p>Returns contiguous values within the half-open interval [<code class="code docutils literal notranslate"><span class="pre">start</span></code>, <code class="code docutils literal notranslate"><span class="pre">end</span></code>).</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.zeros.html#triton.language.zeros" title="triton.language.zeros"><code class="xref py py-obj docutils literal notranslate"><span class="pre">zeros</span></code></a></p></td>
<td><p>Returns a block filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p></td>
<td><p>Returns a tensor filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p></td>
</tr>
</tbody>
</table>
@@ -292,10 +292,10 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.broadcast_to.html#triton.language.broadcast_to" title="triton.language.broadcast_to"><code class="xref py py-obj docutils literal notranslate"><span class="pre">broadcast_to</span></code></a></p></td>
<td><p>Tries to broadcast the given block to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p></td>
<td><p>Tries to broadcast the given tensor to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.reshape.html#triton.language.reshape" title="triton.language.reshape"><code class="xref py py-obj docutils literal notranslate"><span class="pre">reshape</span></code></a></p></td>
<td><p>Tries to reshape the given block to a new shape.</p></td>
<td><p>Tries to reshape the given tensor to a new shape.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.ravel.html#triton.language.ravel" title="triton.language.ravel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ravel</span></code></a></p></td>
<td><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p></td>
@@ -326,10 +326,10 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.load.html#triton.language.load" title="triton.language.load"><code class="xref py py-obj docutils literal notranslate"><span class="pre">load</span></code></a></p></td>
<td><p>Return a block of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
<td><p>Return a tensor of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.store.html#triton.language.store" title="triton.language.store"><code class="xref py py-obj docutils literal notranslate"><span class="pre">store</span></code></a></p></td>
<td><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> block of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
<td><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> tensor of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.atomic_cas.html#triton.language.atomic_cas" title="triton.language.atomic_cas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">atomic_cas</span></code></a></p></td>
<td><p>Performs an atomic compare-and-swap at the memory location specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
@@ -349,7 +349,7 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.where.html#triton.language.where" title="triton.language.where"><code class="xref py py-obj docutils literal notranslate"><span class="pre">where</span></code></a></p></td>
<td><p>Returns a block of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p></td>
<td><p>Returns a tensor of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p></td>
</tr>
</tbody>
</table>
@@ -395,13 +395,13 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.max.html#triton.language.max" title="triton.language.max"><code class="xref py py-obj docutils literal notranslate"><span class="pre">max</span></code></a></p></td>
<td><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
<td><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.min.html#triton.language.min" title="triton.language.min"><code class="xref py py-obj docutils literal notranslate"><span class="pre">min</span></code></a></p></td>
<td><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
<td><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.sum.html#triton.language.sum" title="triton.language.sum"><code class="xref py py-obj docutils literal notranslate"><span class="pre">sum</span></code></a></p></td>
<td><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
<td><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr>
</tbody>
</table>
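
The tables above group the core primitives by role; the sketch below chains several of them (tl.arange for indexing, a masked tl.load, a tl.max reduction, and tl.store for the result). The row-max kernel, its name, and the sizes are illustrative, not drawn from the documentation being diffed.

import torch

import triton
import triton.language as tl


@triton.jit
def row_max_kernel(x_ptr, out_ptr, n_cols, BLOCK_SIZE: tl.constexpr):
    # One program reduces one row of a contiguous (n_rows, n_cols) matrix
    row = tl.program_id(axis=0)
    cols = tl.arange(0, BLOCK_SIZE)          # indexing: lane offsets within the row
    mask = cols < n_cols
    # memory: masked load; padding lanes read -inf so they cannot win the max
    x = tl.load(x_ptr + row * n_cols + cols, mask=mask, other=-float('inf'))
    row_max = tl.max(x, axis=0)              # reduction: block -> scalar
    tl.store(out_ptr + row, row_max)         # memory: one result per program


x = torch.randn(256, 1000, device='cuda')
out = torch.empty(256, device='cuda')
row_max_kernel[(x.shape[0],)](x, out, x.shape[1], BLOCK_SIZE=1024)
print(torch.allclose(out, x.max(dim=1).values))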

File diff suppressed because one or more lines are too long

View File

@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 27fe8ebc933e4ad0f9352faba2dbdccd
config: cad12a755c359cf8dbd8410700935254
tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.

Some files were not shown because too many files have changed in this diff