[GH-PAGES] Updated website

Author: Philippe Tillet
Date: 2022-04-08 00:44:05 +00:00
Parent: 80b92a0d2d
Commit: 0c570c178d
173 changed files with 401 additions and 386 deletions

View File

@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
- config: 456fb2bf2c82c803cfdaa7bcb3e778a9
+ config: aa051e1ca67e6f2658629dbe2c30cba0
tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.

Binary file not shown.

View File

@@ -24,9 +24,11 @@ def add_kernel(
    y_ptr,  # *Pointer* to second input vector
    output_ptr,  # *Pointer* to output vector
    n_elements,  # Size of the vector
+   time_start_ptr, time_end_ptr,
    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
    # NOTE: `constexpr` so it can be used as a shape value
):
+   tl.atomic_min(time_start_ptr, tl.clock())
    # There are multiple 'program's processing different data. We identify which program
    # we are here
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
@@ -45,6 +47,7 @@ def add_kernel(
    output = x + y
    # Write x + y back to DRAM
    tl.store(output_ptr + offsets, output, mask=mask)
+   tl.atomic_max(time_end_ptr, tl.clock())
# %%
@@ -53,6 +56,8 @@ def add_kernel(
def add(x: torch.Tensor, y: torch.Tensor):
+   time_start = torch.zeros(1, dtype=torch.int64, device='cuda')
+   time_end = torch.zeros(1, dtype=torch.int64, device='cuda')
    # We need to preallocate the output
    output = torch.empty_like(x)
    assert x.is_cuda and y.is_cuda and output.is_cuda
@@ -65,7 +70,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
    # - each torch.tensor object is implicitly converted into a pointer to its first element.
    # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
    # - don't forget to pass meta-parameters as keywords arguments
-   add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+   add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)
    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
    # running asynchronously at this point.
    return output
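
The two extra arguments turn the tutorial kernel into a crude on-device timer: every program folds tl.clock() into the start buffer with tl.atomic_min and into the end buffer with tl.atomic_max. A minimal host-side sketch of reading those clocks back is shown below; it is illustrative rather than part of this commit. Two caveats: the start buffer is seeded differently from the diff (a zero-initialized start buffer can never be lowered by tl.atomic_min, so the sketch seeds it with the int64 maximum), and GPU clock counters are per-SM, so the difference is only a rough indication of the kernel's span.

    import torch
    import triton

    # Illustrative sketch: assumes `add_kernel` from the diff above is defined in scope.
    size = 98432
    x = torch.rand(size, device='cuda')
    y = torch.rand(size, device='cuda')
    output = torch.empty_like(x)

    # One int64 cell for the earliest start clock and one for the latest end clock.
    # Seed the start cell with the int64 maximum so tl.atomic_min can lower it.
    time_start = torch.full((1,), torch.iinfo(torch.int64).max, dtype=torch.int64, device='cuda')
    time_end = torch.zeros(1, dtype=torch.int64, device='cuda')

    grid = lambda meta: (triton.cdiv(size, meta['BLOCK_SIZE']),)
    add_kernel[grid](x, y, output, size, time_start, time_end, BLOCK_SIZE=1024)

    torch.cuda.synchronize()  # the launch above returns before the kernel finishes
    cycles = (time_end - time_start).item()
    print(f"approximate kernel span: {cycles} GPU clock cycles")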

View File

@@ -33,7 +33,7 @@
},
"outputs": [],
"source": [
- "import torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n x_ptr, # *Pointer* to first input vector\n y_ptr, # *Pointer* to second input vector\n output_ptr, # *Pointer* to output vector\n n_elements, # Size of the vector\n BLOCK_SIZE: tl.constexpr, # Number of elements each program should process\n # NOTE: `constexpr` so it can be used as a shape value\n):\n # There are multiple 'program's processing different data. We identify which program\n # we are here\n pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0\n # This program will process inputs that are offset from the initial data.\n # for instance, if you had a vector of length 256 and block_size of 64, the programs\n # would each access the elements [0:64, 64:128, 128:192, 192:256].\n # Note that offsets is a list of pointers\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # Create a mask to guard memory operations against out-of-bounds accesses\n mask = offsets < n_elements\n # Load x and y from DRAM, masking out any extra elements in case the input is not a\n # multiple of the block size\n x = tl.load(x_ptr + offsets, mask=mask)\n y = tl.load(y_ptr + offsets, mask=mask)\n output = x + y\n # Write x + y back to DRAM\n tl.store(output_ptr + offsets, output, mask=mask)"
+ "import torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n x_ptr, # *Pointer* to first input vector\n y_ptr, # *Pointer* to second input vector\n output_ptr, # *Pointer* to output vector\n n_elements, # Size of the vector\n time_start_ptr, time_end_ptr,\n BLOCK_SIZE: tl.constexpr, # Number of elements each program should process\n # NOTE: `constexpr` so it can be used as a shape value\n):\n tl.atomic_min(time_start_ptr, tl.clock())\n # There are multiple 'program's processing different data. We identify which program\n # we are here\n pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0\n # This program will process inputs that are offset from the initial data.\n # for instance, if you had a vector of length 256 and block_size of 64, the programs\n # would each access the elements [0:64, 64:128, 128:192, 192:256].\n # Note that offsets is a list of pointers\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # Create a mask to guard memory operations against out-of-bounds accesses\n mask = offsets < n_elements\n # Load x and y from DRAM, masking out any extra elements in case the input is not a\n # multiple of the block size\n x = tl.load(x_ptr + offsets, mask=mask)\n y = tl.load(y_ptr + offsets, mask=mask)\n output = x + y\n # Write x + y back to DRAM\n tl.store(output_ptr + offsets, output, mask=mask)\n tl.atomic_max(time_end_ptr, tl.clock())"
]
},
{
@@ -51,7 +51,7 @@
},
"outputs": [],
"source": [
- "def add(x: torch.Tensor, y: torch.Tensor):\n # We need to preallocate the output\n output = torch.empty_like(x)\n assert x.is_cuda and y.is_cuda and output.is_cuda\n n_elements = output.numel()\n # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n # In this case, we use a 1D grid where the size is the number of blocks\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n # NOTE:\n # - each torch.tensor object is implicitly converted into a pointer to its first element.\n # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n # - don't forget to pass meta-parameters as keywords arguments\n add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)\n # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n # running asynchronously at this point.\n return output"
+ "def add(x: torch.Tensor, y: torch.Tensor):\n time_start = torch.zeros(1, dtype=torch.int64, device='cuda')\n time_end = torch.zeros(1, dtype=torch.int64, device='cuda')\n # We need to preallocate the output\n output = torch.empty_like(x)\n assert x.is_cuda and y.is_cuda and output.is_cuda\n n_elements = output.numel()\n # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n # In this case, we use a 1D grid where the size is the number of blocks\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n # NOTE:\n # - each torch.tensor object is implicitly converted into a pointer to its first element.\n # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n # - don't forget to pass meta-parameters as keywords arguments\n add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)\n # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n # running asynchronously at this point.\n return output"
]
},
{

8 binary image files changed (not shown). Sizes: 24 KiB → 29 KiB, 16 KiB → 18 KiB, 37 KiB → 36 KiB, 24 KiB → 23 KiB, 57 KiB → 59 KiB, 33 KiB → 34 KiB, 32 KiB → 33 KiB, 20 KiB → 21 KiB.

View File

@@ -31,7 +31,7 @@ In this tutorial, you will write a simple vector addition using Triton and learn
Compute Kernel
--------------------------
- .. GENERATED FROM PYTHON SOURCE LINES 14-50
+ .. GENERATED FROM PYTHON SOURCE LINES 14-53
.. code-block:: default
@@ -48,9 +48,11 @@ Compute Kernel
y_ptr, # *Pointer* to second input vector
output_ptr, # *Pointer* to output vector
n_elements, # Size of the vector
+ time_start_ptr, time_end_ptr,
BLOCK_SIZE: tl.constexpr, # Number of elements each program should process
# NOTE: `constexpr` so it can be used as a shape value
):
+ tl.atomic_min(time_start_ptr, tl.clock())
# There are multiple 'program's processing different data. We identify which program
# we are here
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0
@@ -69,6 +71,7 @@ Compute Kernel
output = x + y
# Write x + y back to DRAM
tl.store(output_ptr + offsets, output, mask=mask)
+ tl.atomic_max(time_end_ptr, tl.clock())
@@ -78,18 +81,20 @@ Compute Kernel
- .. GENERATED FROM PYTHON SOURCE LINES 51-53
+ .. GENERATED FROM PYTHON SOURCE LINES 54-56
Let's also declare a helper function to (1) allocate the `z` tensor
and (2) enqueue the above kernel with appropriate grid/block sizes.
- .. GENERATED FROM PYTHON SOURCE LINES 53-74
+ .. GENERATED FROM PYTHON SOURCE LINES 56-79
.. code-block:: default
def add(x: torch.Tensor, y: torch.Tensor):
+ time_start = torch.zeros(1, dtype=torch.int64, device='cuda')
+ time_end = torch.zeros(1, dtype=torch.int64, device='cuda')
# We need to preallocate the output
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
@@ -102,7 +107,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
# - each torch.tensor object is implicitly converted into a pointer to its first element.
# - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
# - don't forget to pass meta-parameters as keywords arguments
- add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+ add_kernel[grid](x, y, output, n_elements, time_start, time_end, BLOCK_SIZE=1024)
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
# running asynchronously at this point.
return output
@@ -115,11 +120,11 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
- .. GENERATED FROM PYTHON SOURCE LINES 75-76
+ .. GENERATED FROM PYTHON SOURCE LINES 80-81
We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
- .. GENERATED FROM PYTHON SOURCE LINES 76-90
+ .. GENERATED FROM PYTHON SOURCE LINES 81-95
.. code-block:: default
@@ -154,11 +159,11 @@ We can now use the above function to compute the element-wise sum of two `torch.
- .. GENERATED FROM PYTHON SOURCE LINES 91-92
+ .. GENERATED FROM PYTHON SOURCE LINES 96-97
Seems like we're good to go!
- .. GENERATED FROM PYTHON SOURCE LINES 94-99
+ .. GENERATED FROM PYTHON SOURCE LINES 99-104
Benchmark
-----------
@@ -166,7 +171,7 @@ We can now benchmark our custom op on vectors of increasing sizes to get a sense
To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
for different problem sizes.
- .. GENERATED FROM PYTHON SOURCE LINES 99-128
+ .. GENERATED FROM PYTHON SOURCE LINES 104-133
.. code-block:: default
@@ -206,12 +211,12 @@ for different problem sizes.
- .. GENERATED FROM PYTHON SOURCE LINES 129-131
+ .. GENERATED FROM PYTHON SOURCE LINES 134-136
We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
`save_path='/path/to/results/' to save them to disk along with raw CSV data
- .. GENERATED FROM PYTHON SOURCE LINES 131-132
+ .. GENERATED FROM PYTHON SOURCE LINES 136-137
.. code-block:: default
@@ -232,22 +237,22 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
vector-add-performance:
size  Triton  Torch
- 0  4096.0  9.600000  9.600000
+ 0  4096.0  4.800000  9.600000
- 1  8192.0  19.200000  19.200000
+ 1  8192.0  8.727273  19.200000
- 2  16384.0  38.400001  38.400001
+ 2  16384.0  17.454545  38.400001
- 3  32768.0  76.800002  76.800002
+ 3  32768.0  38.400001  76.800002
- 4  65536.0  127.999995  127.999995
+ 4  65536.0  69.818181  127.999995
- 5  131072.0  219.428568  219.428568
+ 5  131072.0  139.636363  219.428568
- 6  262144.0  341.333321  341.333321
+ 6  262144.0  219.428568  341.333321
- 7  524288.0  472.615390  472.615390
+ 7  524288.0  341.333321  472.615390
- 8  1048576.0  614.400016  614.400016
+ 8  1048576.0  472.615390  614.400016
- 9  2097152.0  722.823517  722.823517
+ 9  2097152.0  614.400016  702.171410
- 10  4194304.0  780.190482  780.190482
+ 10  4194304.0  712.347810  780.190482
- 11  8388608.0  812.429770  812.429770
+ 11  8388608.0  774.047204  812.429770
- 12  16777216.0  833.084721  833.084721
+ 12  16777216.0  809.086412  833.084721
- 13  33554432.0  842.004273  843.811163
+ 13  33554432.0  829.569620  842.004273
- 14  67108864.0  847.448255  848.362445
+ 14  67108864.0  840.205105  848.362445
- 15  134217728.0  849.737435  850.656574
+ 15  134217728.0  845.625825  850.656574
@@ -255,7 +260,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 1 minutes 42.600 seconds)
+ **Total running time of the script:** ( 1 minutes 42.917 seconds)
.. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
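
The GB/s columns above are produced by the Triton benchmarking utilities referred to in the surrounding text. The sketch below shows the general shape of such a triton.testing.perf_report harness for this vector-add example; the concrete decorator arguments, the 12-bytes-per-element conversion (read x, read y, write output, 4 bytes each for fp32), and the three-value return of do_bench are assumptions based on the tutorial of this era, not content of this diff.

    import torch
    import triton

    # Sketch of a perf_report harness (assumed arguments; `add` is the wrapper above).
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=['size'],                        # argument used as the x-axis
            x_vals=[2 ** i for i in range(12, 28)],  # 4096 ... 134217728 elements
            x_log=True,
            line_arg='provider',
            line_vals=['triton', 'torch'],
            line_names=['Triton', 'Torch'],
            ylabel='GB/s',
            plot_name='vector-add-performance',
            args={},
        )
    )
    def benchmark(size, provider):
        x = torch.rand(size, device='cuda', dtype=torch.float32)
        y = torch.rand(size, device='cuda', dtype=torch.float32)
        if provider == 'torch':
            ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
        # 12 bytes move per element: read x, read y, write output.
        gbps = lambda t: 12 * size / t * 1e-6
        return gbps(ms), gbps(max_ms), gbps(min_ms)

    # print_data / show_plots / save_path are the options mentioned in the text above.
    benchmark.run(print_data=True, show_plots=True)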

View File

@@ -278,17 +278,17 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
softmax-performance:
N  Triton  Torch (native)  Torch (jit)
- 0  256.0  512.000001  546.133347  186.181817
+ 0  256.0  512.000001  546.133347  190.511628
- 1  384.0  614.400016  585.142862  153.600004
+ 1  384.0  585.142862  585.142862  151.703707
- 2  512.0  655.360017  606.814814  154.566038
+ 2  512.0  655.360017  585.142849  154.566038
- 3  640.0  706.206879  640.000002  158.759699
+ 3  640.0  682.666684  640.000002  160.000000
- 4  768.0  722.823517  664.216187  162.754967
+ 4  768.0  722.823517  646.736871  163.839992
..  ...  ...  ...  ...
- 93  12160.0  814.058574  406.179533  198.834951
+ 93  12160.0  814.058574  405.755985  198.834951
- 94  12288.0  814.111783  415.661740  199.096718
+ 94  12288.0  814.111783  415.222812  199.197579
- 95  12416.0  814.163950  412.149375  198.655991
+ 95  12416.0  814.163950  412.149375  198.854847
- 96  12544.0  814.214963  412.971190  198.913776
+ 96  12544.0  814.214963  412.971190  199.012395
- 97  12672.0  814.265046  411.679167  198.971549
+ 97  12672.0  814.265046  412.097543  199.167004
[98 rows x 4 columns]
@@ -306,7 +306,7 @@ In the above plot, we can see that:
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 3 minutes 19.345 seconds)
+ **Total running time of the script:** ( 3 minutes 27.571 seconds)
.. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:

View File

@@ -459,36 +459,36 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
matmul-performance:
M  cuBLAS  ...  Triton  Triton (+ LeakyReLU)
0  256.0  2.730667  ...  3.276800  2.978909
- 1  384.0  7.372800  ...  8.507077  7.899428
+ 1  384.0  7.372800  ...  7.899428  8.507077
- 2  512.0  14.563555  ...  16.384000  16.384000
+ 2  512.0  14.563555  ...  15.420235  15.420235
3  640.0  22.260869  ...  24.380953  24.380953
- 4  768.0  32.768000  ...  34.028308  34.028308
+ 4  768.0  32.768000  ...  35.389441  34.028308
- 5  896.0  39.025776  ...  40.140799  39.025776
+ 5  896.0  37.971025  ...  40.140799  39.025776
- 6  1024.0  49.932191  ...  52.428801  52.428801
+ 6  1024.0  49.932191  ...  53.773130  53.773130
- 7  1152.0  45.242181  ...  46.656000  46.656000
+ 7  1152.0  45.242181  ...  48.161033  47.396572
- 8  1280.0  51.200001  ...  56.888887  56.888887
+ 8  1280.0  51.200001  ...  57.690139  57.690139
- 9  1408.0  64.138541  ...  67.305878  66.485074
+ 9  1408.0  64.138541  ...  69.009825  67.305878
- 10  1536.0  80.430545  ...  79.526831  78.643199
+ 10  1536.0  79.526831  ...  80.430545  79.526831
- 11  1664.0  62.929456  ...  62.492442  61.636381
+ 11  1664.0  63.372618  ...  63.372618  62.929456
- 12  1792.0  72.512412  ...  72.512412  72.047592
+ 12  1792.0  72.983276  ...  63.142831  63.142831
- 13  1920.0  69.467336  ...  70.172588  69.818184
+ 13  1920.0  69.120002  ...  71.626943  70.892307
- 14  2048.0  73.262953  ...  76.608294  76.608294
+ 14  2048.0  73.584279  ...  78.033565  77.672296
- 15  2176.0  83.500614  ...  85.998493  85.632545
+ 15  2176.0  83.500614  ...  87.115360  86.739860
- 16  2304.0  68.643310  ...  77.057651  76.319081
+ 16  2304.0  68.446623  ...  77.810656  77.307030
- 17  2432.0  71.305746  ...  85.393507  85.134737
+ 17  2432.0  71.125224  ...  75.522751  75.320281
- 18  2560.0  78.019048  ...  80.709358  81.108913
+ 18  2560.0  77.833728  ...  81.715711  81.512437
- 19  2688.0  83.004501  ...  89.464755  89.254248
+ 19  2688.0  83.737433  ...  90.966561  90.532356
- 20  2816.0  80.767055  ...  83.552120  82.602666
+ 20  2816.0  79.443003  ...  82.916747  84.035084
- 21  2944.0  81.298583  ...  82.237674  81.967162
+ 21  2944.0  81.832567  ...  83.198715  82.102191
- 22  3072.0  81.707223  ...  88.473602  87.516392
+ 22  3072.0  81.121923  ...  88.197981  87.381335
- 23  3200.0  84.377059  ...  94.955488  94.674553
+ 23  3200.0  83.116885  ...  96.385543  96.096095
- 24  3328.0  83.034941  ...  84.695641  83.905938
+ 24  3328.0  82.939284  ...  84.895397  85.096096
- 25  3456.0  81.108217  ...  85.133652  81.029251
+ 25  3456.0  77.745004  ...  86.596744  84.332184
- 26  3584.0  86.623693  ...  98.483450  98.160909
+ 26  3584.0  86.540320  ...  91.380335  97.416461
- 27  3712.0  81.682211  ...  88.404730  84.017953
+ 27  3712.0  85.748791  ...  88.404730  86.341700
- 28  3840.0  82.592983  ...  92.006659  85.169042
+ 28  3840.0  84.874902  ...  93.247896  85.300426
- 29  3968.0  90.791620  ...  86.236000  90.522206
+ 29  3968.0  92.372393  ...  80.015697  78.220472
- 30  4096.0  86.369197  ...  85.001726  91.304576
+ 30  4096.0  93.990003  ...  93.206754  86.036145
[31 rows x 5 columns]
@@ -498,7 +498,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 6 minutes 0.645 seconds)
+ **Total running time of the script:** ( 6 minutes 48.521 seconds)
.. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:

View File

@@ -240,7 +240,7 @@ References
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 0 minutes 0.482 seconds)
+ **Total running time of the script:** ( 0 minutes 0.327 seconds)
.. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:

View File

@@ -38,36 +38,36 @@ Layer Normalization
layer-norm-backward:
N  Triton  Torch  Apex
- 0  1024.0  311.088617  98.303995  303.407414
+ 0  1024.0  356.173905  99.497980  315.076934
- 1  1536.0  347.773587  134.050910  341.333333
+ 1  1536.0  405.098894  134.050910  344.523365
- 2  2048.0  420.102553  161.684218  325.509933
+ 2  2048.0  486.653476  159.067963  321.254900
- 3  2560.0  455.111129  181.238943  326.808501
+ 3  2560.0  458.507457  182.314537  326.808501
- 4  3072.0  511.999982  191.999993  317.793096
+ 4  3072.0  515.580429  191.501303  319.168834
- 5  3584.0  547.872604  207.768111  310.527060
+ 5  3584.0  551.384634  207.768111  307.199992
- 6  4096.0  564.965515  219.919464  295.207204
+ 6  4096.0  568.231237  220.907859  293.444785
- 7  4608.0  504.986315  232.825259  291.799469
+ 7  4608.0  502.690905  232.336141  290.267724
- 8  5120.0  527.381977  242.845844  288.450695
+ 8  5120.0  527.381977  243.326731  287.102804
- 9  5632.0  542.843364  243.107920  288.820505
+ 9  5632.0  540.671974  244.426754  291.310338
- 10  6144.0  546.133354  248.661056  286.879370
+ 10  6144.0  548.163546  251.202731  288.000001
- 11  6656.0  532.479975  256.000009  285.767438
+ 11  6656.0  532.479975  255.590406  286.279570
- 12  7168.0  505.976473  260.654538  286.242939
+ 12  7168.0  510.480705  253.734520  277.919225
- 13  7680.0  481.253256  262.190612  278.429013
+ 13  7680.0  487.619051  266.743841  284.884090
- 14  8192.0  463.698115  267.130429  284.939124
+ 14  8192.0  468.114289  258.694729  278.481578
- 15  8704.0  417.791980  267.815384  284.987724
+ 15  8704.0  415.300208  267.472468  284.987724
- 16  9216.0  432.845409  272.394084  288.751954
+ 16  9216.0  429.483477  272.394084  290.077383
- 17  9728.0  439.683593  280.278512  289.667485
+ 17  9728.0  438.033784  280.278512  288.950501
- 18  10240.0  448.467168  286.433562  290.153487
+ 18  10240.0  443.610086  287.102804  290.153487
- 19  10752.0  425.821771  247.172406  290.922209
+ 19  10752.0  426.525614  246.699797  290.267711
- 20  11264.0  427.071098  245.760001  286.676558
+ 20  11264.0  427.071098  245.536784  286.069848
- 21  11776.0  423.724129  249.667843  288.981596
+ 21  11776.0  418.702211  249.447482  288.981596
- 22  12288.0  419.504980  254.673582  294.323369
+ 22  12288.0  414.784810  254.673582  294.323369
- 23  12800.0  413.458944  253.674644  288.180121
+ 23  12800.0  410.146863  254.094291  288.180121
- 24  13312.0  412.242569  252.859526  289.916513
+ 24  13312.0  409.599999  252.161013  289.129403
- 25  13824.0  405.594132  257.190689  292.056329
+ 25  13824.0  404.112047  257.190689  291.799461
- 26  14336.0  394.568805  254.297107  286.719986
+ 26  14336.0  395.930964  256.000002  289.129416
- 27  14848.0  386.498925  257.665934  289.246765
+ 27  14848.0  385.662341  257.479779  288.777966
- 28  15360.0  373.117425  257.970599  286.211174
+ 28  15360.0  380.433442  258.332158  286.656296
- 29  15872.0  371.274849  261.806182  289.899545
+ 29  15872.0  372.363640  261.806182  290.562936
@@ -339,7 +339,7 @@ Layer Normalization
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 2 minutes 12.550 seconds)
+ **Total running time of the script:** ( 2 minutes 14.583 seconds)
.. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py:

View File

@@ -5,16 +5,16 @@
Computation times
=================
- **13:15.622** total execution time for **getting-started_tutorials** files:
+ **14:13.919** total execution time for **getting-started_tutorials** files:
+---------------------------------------------------------------------------------------------------------+-----------+--------+
- | :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:00.645 | 0.0 MB |
+ | :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:48.521 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
- | :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 03:19.345 | 0.0 MB |
+ | :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 03:27.571 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
- | :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) | 02:12.550 | 0.0 MB |
+ | :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) | 02:14.583 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
- | :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 01:42.600 | 0.0 MB |
+ | :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 01:42.917 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
- | :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) | 00:00.482 | 0.0 MB |
+ | :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) | 00:00.327 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+

View File

@@ -214,9 +214,11 @@ to download the full example code</p>
<span class="n">y_ptr</span><span class="p">,</span> <span class="c1"># *Pointer* to second input vector</span>
<span class="n">output_ptr</span><span class="p">,</span> <span class="c1"># *Pointer* to output vector</span>
<span class="n">n_elements</span><span class="p">,</span> <span class="c1"># Size of the vector</span>
+ <span class="n">time_start_ptr</span><span class="p">,</span> <span class="n">time_end_ptr</span><span class="p">,</span>
<span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="c1"># Number of elements each program should process</span>
<span class="c1"># NOTE: `constexpr` so it can be used as a shape value</span>
<span class="p">):</span>
+ <span class="n">tl</span><span class="o">.</span><span class="n">atomic_min</span><span class="p">(</span><span class="n">time_start_ptr</span><span class="p">,</span> <span class="n">tl</span><span class="o">.</span><span class="n">clock</span><span class="p">())</span>
<span class="c1"># There are multiple &#39;program&#39;s processing different data. We identify which program</span>
<span class="c1"># we are here</span>
<span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="c1"># We use a 1D launch grid so axis is 0</span>
@@ -235,11 +237,14 @@ to download the full example code</p>
<span class="n">output</span> <span class="o">=</span> <span class="n">x</span> <span class="o">+</span> <span class="n">y</span>
<span class="c1"># Write x + y back to DRAM</span>
<span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">output_ptr</span> <span class="o">+</span> <span class="n">offsets</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+ <span class="n">tl</span><span class="o">.</span><span class="n">atomic_max</span><span class="p">(</span><span class="n">time_end_ptr</span><span class="p">,</span> <span class="n">tl</span><span class="o">.</span><span class="n">clock</span><span class="p">())</span>
</pre></div>
</div>
<p>Let's also declare a helper function to (1) allocate the <cite>z</cite> tensor
and (2) enqueue the above kernel with appropriate grid/block sizes.</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="k">def</span> <span class="nf">add</span><span class="p">(</span><span class="n">x</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">,</span> <span class="n">y</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">Tensor</span><span class="p">):</span>
+ <span class="n">time_start</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+ <span class="n">time_end</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int64</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
<span class="c1"># We need to preallocate the output</span>
<span class="n">output</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
<span class="k">assert</span> <span class="n">x</span><span class="o">.</span><span class="n">is_cuda</span> <span class="ow">and</span> <span class="n">y</span><span class="o">.</span><span class="n">is_cuda</span> <span class="ow">and</span> <span class="n">output</span><span class="o">.</span><span class="n">is_cuda</span>
@@ -252,7 +257,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.</p>
<span class="c1"># - each torch.tensor object is implicitly converted into a pointer to its first element.</span>
<span class="c1"># - `triton.jit`&#39;ed functions can be index with a launch grid to obtain a callable GPU kernel</span>
<span class="c1"># - don&#39;t forget to pass meta-parameters as keywords arguments</span>
- <span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
+ <span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">time_start</span><span class="p">,</span> <span class="n">time_end</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
<span class="c1"># We return a handle to z but, since `torch.cuda.synchronize()` hasn&#39;t been called, the kernel is still</span>
<span class="c1"># running asynchronously at this point.</span>
<span class="k">return</span> <span class="n">output</span>
@@ -322,25 +327,25 @@ for different problem sizes.</p>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>vector-add-performance:
size  Triton  Torch
- 0  4096.0  9.600000  9.600000
+ 0  4096.0  4.800000  9.600000
- 1  8192.0  19.200000  19.200000
+ 1  8192.0  8.727273  19.200000
- 2  16384.0  38.400001  38.400001
+ 2  16384.0  17.454545  38.400001
- 3  32768.0  76.800002  76.800002
+ 3  32768.0  38.400001  76.800002
- 4  65536.0  127.999995  127.999995
+ 4  65536.0  69.818181  127.999995
- 5  131072.0  219.428568  219.428568
+ 5  131072.0  139.636363  219.428568
- 6  262144.0  341.333321  341.333321
+ 6  262144.0  219.428568  341.333321
- 7  524288.0  472.615390  472.615390
+ 7  524288.0  341.333321  472.615390
- 8  1048576.0  614.400016  614.400016
+ 8  1048576.0  472.615390  614.400016
- 9  2097152.0  722.823517  722.823517
+ 9  2097152.0  614.400016  702.171410
- 10  4194304.0  780.190482  780.190482
+ 10  4194304.0  712.347810  780.190482
- 11  8388608.0  812.429770  812.429770
+ 11  8388608.0  774.047204  812.429770
- 12  16777216.0  833.084721  833.084721
+ 12  16777216.0  809.086412  833.084721
- 13  33554432.0  842.004273  843.811163
+ 13  33554432.0  829.569620  842.004273
- 14  67108864.0  847.448255  848.362445
+ 14  67108864.0  840.205105  848.362445
- 15  134217728.0  849.737435  850.656574
+ 15  134217728.0  845.625825  850.656574
</pre></div>
</div>
- <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 42.600 seconds)</p>
+ <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes 42.917 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>

View File

@@ -369,17 +369,17 @@ We will then compare its performance against (1) <code class="code docutils lite
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>softmax-performance:
N  Triton  Torch (native)  Torch (jit)
- 0  256.0  512.000001  546.133347  186.181817
+ 0  256.0  512.000001  546.133347  190.511628
- 1  384.0  614.400016  585.142862  153.600004
+ 1  384.0  585.142862  585.142862  151.703707
- 2  512.0  655.360017  606.814814  154.566038
+ 2  512.0  655.360017  585.142849  154.566038
- 3  640.0  706.206879  640.000002  158.759699
+ 3  640.0  682.666684  640.000002  160.000000
- 4  768.0  722.823517  664.216187  162.754967
+ 4  768.0  722.823517  646.736871  163.839992
..  ...  ...  ...  ...
- 93  12160.0  814.058574  406.179533  198.834951
+ 93  12160.0  814.058574  405.755985  198.834951
- 94  12288.0  814.111783  415.661740  199.096718
+ 94  12288.0  814.111783  415.222812  199.197579
- 95  12416.0  814.163950  412.149375  198.655991
+ 95  12416.0  814.163950  412.149375  198.854847
- 96  12544.0  814.214963  412.971190  198.913776
+ 96  12544.0  814.214963  412.971190  199.012395
- 97  12672.0  814.265046  411.679167  198.971549
+ 97  12672.0  814.265046  412.097543  199.167004
[98 rows x 4 columns]
</pre></div>
@@ -392,7 +392,7 @@ We will then compare its performance against (1) <code class="code docutils lite
Note however that the PyTorch <cite>softmax</cite> operation is more general and will works on tensors of any shape.</p></li>
</ul>
</div></blockquote>
- <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 19.345 seconds)</p>
+ <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes 27.571 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>

View File

@@ -565,41 +565,41 @@ torch_output=tensor([[ 1.1045, -36.9688, 31.4688, ..., -11.3906, 24.4531, -3
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
M  cuBLAS  ...  Triton  Triton (+ LeakyReLU)
0  256.0  2.730667  ...  3.276800  2.978909
- 1  384.0  7.372800  ...  8.507077  7.899428
+ 1  384.0  7.372800  ...  7.899428  8.507077
- 2  512.0  14.563555  ...  16.384000  16.384000
+ 2  512.0  14.563555  ...  15.420235  15.420235
3  640.0  22.260869  ...  24.380953  24.380953
- 4  768.0  32.768000  ...  34.028308  34.028308
+ 4  768.0  32.768000  ...  35.389441  34.028308
- 5  896.0  39.025776  ...  40.140799  39.025776
+ 5  896.0  37.971025  ...  40.140799  39.025776
- 6  1024.0  49.932191  ...  52.428801  52.428801
+ 6  1024.0  49.932191  ...  53.773130  53.773130
- 7  1152.0  45.242181  ...  46.656000  46.656000
+ 7  1152.0  45.242181  ...  48.161033  47.396572
- 8  1280.0  51.200001  ...  56.888887  56.888887
+ 8  1280.0  51.200001  ...  57.690139  57.690139
- 9  1408.0  64.138541  ...  67.305878  66.485074
+ 9  1408.0  64.138541  ...  69.009825  67.305878
- 10  1536.0  80.430545  ...  79.526831  78.643199
+ 10  1536.0  79.526831  ...  80.430545  79.526831
- 11  1664.0  62.929456  ...  62.492442  61.636381
+ 11  1664.0  63.372618  ...  63.372618  62.929456
- 12  1792.0  72.512412  ...  72.512412  72.047592
+ 12  1792.0  72.983276  ...  63.142831  63.142831
- 13  1920.0  69.467336  ...  70.172588  69.818184
+ 13  1920.0  69.120002  ...  71.626943  70.892307
- 14  2048.0  73.262953  ...  76.608294  76.608294
+ 14  2048.0  73.584279  ...  78.033565  77.672296
- 15  2176.0  83.500614  ...  85.998493  85.632545
+ 15  2176.0  83.500614  ...  87.115360  86.739860
- 16  2304.0  68.643310  ...  77.057651  76.319081
+ 16  2304.0  68.446623  ...  77.810656  77.307030
- 17  2432.0  71.305746  ...  85.393507  85.134737
+ 17  2432.0  71.125224  ...  75.522751  75.320281
- 18  2560.0  78.019048  ...  80.709358  81.108913
+ 18  2560.0  77.833728  ...  81.715711  81.512437
- 19  2688.0  83.004501  ...  89.464755  89.254248
+ 19  2688.0  83.737433  ...  90.966561  90.532356
- 20  2816.0  80.767055  ...  83.552120  82.602666
+ 20  2816.0  79.443003  ...  82.916747  84.035084
- 21  2944.0  81.298583  ...  82.237674  81.967162
+ 21  2944.0  81.832567  ...  83.198715  82.102191
- 22  3072.0  81.707223  ...  88.473602  87.516392
+ 22  3072.0  81.121923  ...  88.197981  87.381335
- 23  3200.0  84.377059  ...  94.955488  94.674553
+ 23  3200.0  83.116885  ...  96.385543  96.096095
- 24  3328.0  83.034941  ...  84.695641  83.905938
+ 24  3328.0  82.939284  ...  84.895397  85.096096
- 25  3456.0  81.108217  ...  85.133652  81.029251
+ 25  3456.0  77.745004  ...  86.596744  84.332184
- 26  3584.0  86.623693  ...  98.483450  98.160909
+ 26  3584.0  86.540320  ...  91.380335  97.416461
- 27  3712.0  81.682211  ...  88.404730  84.017953
+ 27  3712.0  85.748791  ...  88.404730  86.341700
- 28  3840.0  82.592983  ...  92.006659  85.169042
+ 28  3840.0  84.874902  ...  93.247896  85.300426
- 29  3968.0  90.791620  ...  86.236000  90.522206
+ 29  3968.0  92.372393  ...  80.015697  78.220472
- 30  4096.0  86.369197  ...  85.001726  91.304576
+ 30  4096.0  93.990003  ...  93.206754  86.036145
[31 rows x 5 columns]
</pre></div>
</div>
- <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes 0.645 seconds)</p>
+ <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes 48.521 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>

View File

@@ -372,7 +372,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
<dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
</dd>
</dl>
- <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 0.482 seconds)</p>
+ <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes 0.327 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>

View File

@@ -194,36 +194,36 @@ to download the full example code</p>
<p class="sphx-glr-script-out">Out:</p>
<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>layer-norm-backward:
N  Triton  Torch  Apex
- 0  1024.0  311.088617  98.303995  303.407414
+ 0  1024.0  356.173905  99.497980  315.076934
- 1  1536.0  347.773587  134.050910  341.333333
+ 1  1536.0  405.098894  134.050910  344.523365
- 2  2048.0  420.102553  161.684218  325.509933
+ 2  2048.0  486.653476  159.067963  321.254900
- 3  2560.0  455.111129  181.238943  326.808501
+ 3  2560.0  458.507457  182.314537  326.808501
- 4  3072.0  511.999982  191.999993  317.793096
+ 4  3072.0  515.580429  191.501303  319.168834
- 5  3584.0  547.872604  207.768111  310.527060
+ 5  3584.0  551.384634  207.768111  307.199992
- 6  4096.0  564.965515  219.919464  295.207204
+ 6  4096.0  568.231237  220.907859  293.444785
- 7  4608.0  504.986315  232.825259  291.799469
+ 7  4608.0  502.690905  232.336141  290.267724
- 8  5120.0  527.381977  242.845844  288.450695
+ 8  5120.0  527.381977  243.326731  287.102804
- 9  5632.0  542.843364  243.107920  288.820505
+ 9  5632.0  540.671974  244.426754  291.310338
- 10  6144.0  546.133354  248.661056  286.879370
+ 10  6144.0  548.163546  251.202731  288.000001
- 11  6656.0  532.479975  256.000009  285.767438
+ 11  6656.0  532.479975  255.590406  286.279570
- 12  7168.0  505.976473  260.654538  286.242939
+ 12  7168.0  510.480705  253.734520  277.919225
- 13  7680.0  481.253256  262.190612  278.429013
+ 13  7680.0  487.619051  266.743841  284.884090
- 14  8192.0  463.698115  267.130429  284.939124
+ 14  8192.0  468.114289  258.694729  278.481578
- 15  8704.0  417.791980  267.815384  284.987724
+ 15  8704.0  415.300208  267.472468  284.987724
- 16  9216.0  432.845409  272.394084  288.751954
+ 16  9216.0  429.483477  272.394084  290.077383
- 17  9728.0  439.683593  280.278512  289.667485
+ 17  9728.0  438.033784  280.278512  288.950501
- 18  10240.0  448.467168  286.433562  290.153487
+ 18  10240.0  443.610086  287.102804  290.153487
- 19  10752.0  425.821771  247.172406  290.922209
+ 19  10752.0  426.525614  246.699797  290.267711
- 20  11264.0  427.071098  245.760001  286.676558
+ 20  11264.0  427.071098  245.536784  286.069848
- 21  11776.0  423.724129  249.667843  288.981596
+ 21  11776.0  418.702211  249.447482  288.981596
- 22  12288.0  419.504980  254.673582  294.323369
+ 22  12288.0  414.784810  254.673582  294.323369
- 23  12800.0  413.458944  253.674644  288.180121
+ 23  12800.0  410.146863  254.094291  288.180121
- 24  13312.0  412.242569  252.859526  289.916513
+ 24  13312.0  409.599999  252.161013  289.129403
- 25  13824.0  405.594132  257.190689  292.056329
+ 25  13824.0  404.112047  257.190689  291.799461
- 26  14336.0  394.568805  254.297107  286.719986
+ 26  14336.0  395.930964  256.000002  289.129416
- 27  14848.0  386.498925  257.665934  289.246765
+ 27  14848.0  385.662341  257.479779  288.777966
- 28  15360.0  373.117425  257.970599  286.211174
+ 28  15360.0  380.433442  258.332158  286.656296
- 29  15872.0  371.274849  261.806182  289.899545
+ 29  15872.0  372.363640  261.806182  290.562936
</pre></div>
</div>
<div class="line-block">
@@ -487,7 +487,7 @@ to download the full example code</p>
<span class="n">bench_layer_norm</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">save_path</span><span class="o">=</span><span class="s1">&#39;.&#39;</span><span class="p">,</span> <span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
</pre></div>
</div>
- <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 12.550 seconds)</p>
+ <p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 2 minutes 14.583 seconds)</p>
<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-05-layer-norm-py">
<div class="sphx-glr-download sphx-glr-download-python docutils container">
<p><a class="reference download internal" download="" href="../../_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">05-layer-norm.py</span></code></a></p>

View File

@@ -174,7 +174,7 @@
<div class="section" id="computation-times">
<span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline"></a></h1>
- <p><strong>13:15.622</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+ <p><strong>14:13.919</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
<table class="docutils align-default">
<colgroup>
<col style="width: 85%" />
@@ -183,23 +183,23 @@
</colgroup>
<tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
- <td><p>06:00.645</p></td>
+ <td><p>06:48.521</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
- <td><p>03:19.345</p></td>
+ <td><p>03:27.571</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py"><span class="std std-ref">Layer Normalization</span></a> (<code class="docutils literal notranslate"><span class="pre">05-layer-norm.py</span></code>)</p></td>
- <td><p>02:12.550</p></td>
+ <td><p>02:14.583</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-even"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
- <td><p>01:42.600</p></td>
+ <td><p>01:42.917</p></td>
<td><p>0.0 MB</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
<td><p>00:00.482</p></td> <td><p>00:00.327</p></td>
<td><p>0.0 MB</p></td> <td><p>0.0 MB</p></td>
</tr> </tr>
</tbody> </tbody>

View File

@@ -200,11 +200,11 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.broadcast_to"> <dt class="sig sig-object py" id="triton.language.broadcast_to">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">broadcast_to</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.broadcast_to" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">broadcast_to</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.broadcast_to" title="Permalink to this definition"></a></dt>
<dd><p>Tries to broadcast the given block to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p> <dd><p>Tries to broadcast the given tensor to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) The input block.</p></li> <li><p><strong>input</strong> (<em>Block</em>) The input tensor.</p></li>
<li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li> <li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li>
</ul> </ul>
</dd> </dd>
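As a rough illustration of the broadcast_to call documented above, here is a minimal sketch that expands a loaded row into a square tile inside a @triton.jit kernel; the kernel name, pointers, and tile size are illustrative and not part of the generated page:

import torch

import triton
import triton.language as tl


@triton.jit
def broadcast_demo_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # Load a 1D block of BLOCK elements.
    row = tl.load(x_ptr + offs)
    # Expand the [BLOCK] row into a [BLOCK, BLOCK] tile by broadcasting
    # its singleton leading dimension.
    tile = tl.broadcast_to(row[None, :], (BLOCK, BLOCK))
    # Store the tile row-major.
    out_offs = offs[:, None] * BLOCK + offs[None, :]
    tl.store(out_ptr + out_offs, tile)


x = torch.arange(16, dtype=torch.float32, device='cuda')
out = torch.empty(16, 16, dtype=torch.float32, device='cuda')
broadcast_demo_kernel[(1,)](x, out, BLOCK=16)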

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The first block to be multiplied.</p></li> <li><p><strong>input</strong> (2D tensor of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The first tensor to be multiplied.</p></li>
<li><p><strong>other</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The second block to be multiplied.</p></li> <li><p><strong>other</strong> (2D tensor of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) The second tensor to be multiplied.</p></li>
</ul> </ul>
</dd> </dd>
</dl> </dl>
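This parameter list appears to belong to the tl.dot matrix-product entry (the function name itself sits outside this hunk). A hedged sketch of a single-tile product under that assumption, with illustrative names and a 16x16 row-major layout:

import torch

import triton
import triton.language as tl


@triton.jit
def dot_demo_kernel(a_ptr, b_ptr, c_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    idx = offs[:, None] * BLOCK + offs[None, :]
    # Load two [BLOCK, BLOCK] float16 tiles (row-major).
    a = tl.load(a_ptr + idx)
    b = tl.load(b_ptr + idx)
    # Tile-level matrix product; the result accumulates in float32.
    c = tl.dot(a, b)
    tl.store(c_ptr + idx, c)


a = torch.randn(16, 16, device='cuda', dtype=torch.float16)
b = torch.randn(16, 16, device='cuda', dtype=torch.float16)
c = torch.empty(16, 16, device='cuda', dtype=torch.float32)
dot_demo_kernel[(1,)](a, b, c, BLOCK=16)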

View File

@@ -201,7 +201,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.load"> <dt class="sig sig-object py" id="triton.language.load">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cache_modifier</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eviction_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">volatile</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.load" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cache_modifier</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">eviction_policy</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">volatile</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.load" title="Permalink to this definition"></a></dt>
<dd><p>Return a block of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p> <dd><p>Return a tensor of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">mask</span></code> and <code class="code docutils literal notranslate"><span class="pre">other</span></code> are implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code>.</p> <p><code class="code docutils literal notranslate"><span class="pre">mask</span></code> and <code class="code docutils literal notranslate"><span class="pre">other</span></code> are implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">other</span></code> is implicitly typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p> <p><code class="code docutils literal notranslate"><span class="pre">other</span></code> is implicitly typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p>
<dl class="field-list simple"> <dl class="field-list simple">
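For reference, a small sketch of the masked load described above, used from a @triton.jit kernel; the kernel name, variable names, and block size are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def masked_copy_kernel(x_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(axis=0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    # Out-of-bounds lanes take the `other` value instead of touching memory.
    x = tl.load(x_ptr + offs, mask=mask, other=0.0)
    tl.store(out_ptr + offs, x, mask=mask)


x = torch.randn(1000, device='cuda')
out = torch.empty_like(x)
grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK']),)
masked_copy_kernel[grid](x, out, x.numel(), BLOCK=256)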

View File

@@ -200,7 +200,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.max"> <dt class="sig sig-object py" id="triton.language.max">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">max</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.max" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">max</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.max" title="Permalink to this definition"></a></dt>
<dd><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p> <dd><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
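A sketch of the tl.max reduction documented above, computing one row maximum per program; the row-per-program layout and all names are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def row_max_kernel(x_ptr, out_ptr, n_cols, BLOCK: tl.constexpr):
    row = tl.program_id(axis=0)
    offs = tl.arange(0, BLOCK)
    mask = offs < n_cols
    # Pad out-of-bounds lanes with -inf so they cannot win the reduction.
    x = tl.load(x_ptr + row * n_cols + offs, mask=mask, other=-float('inf'))
    # Reduce the 1D block along axis 0 to a single scalar.
    tl.store(out_ptr + row, tl.max(x, axis=0))


x = torch.randn(8, 100, device='cuda')
out = torch.empty(8, device='cuda')
row_max_kernel[(8,)](x, out, x.shape[1], BLOCK=128)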

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) the first input block</p></li> <li><p><strong>input</strong> (<em>Block</em>) the first input tensor</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input block</p></li> <li><p><strong>other</strong> (<em>Block</em>) the second input tensor</p></li>
</ul> </ul>
</dd> </dd>
</dl> </dl>

View File

@@ -200,7 +200,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.min"> <dt class="sig sig-object py" id="triton.language.min">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">min</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.min" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">min</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.min" title="Permalink to this definition"></a></dt>
<dd><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p> <dd><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
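tl.min mirrors tl.max; a sketch under the same illustrative row-per-program layout, padding with +inf instead so padded lanes never win:

import torch

import triton
import triton.language as tl


@triton.jit
def row_min_kernel(x_ptr, out_ptr, n_cols, BLOCK: tl.constexpr):
    row = tl.program_id(axis=0)
    offs = tl.arange(0, BLOCK)
    mask = offs < n_cols
    # Pad out-of-bounds lanes with +inf so they cannot win the reduction.
    x = tl.load(x_ptr + row * n_cols + offs, mask=mask, other=float('inf'))
    tl.store(out_ptr + row, tl.min(x, axis=0))


x = torch.randn(8, 100, device='cuda')
out = torch.empty(8, device='cuda')
row_min_kernel[(8,)](x, out, x.shape[1], BLOCK=128)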

View File

@@ -203,8 +203,8 @@
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> (<em>Block</em>) the first input block</p></li> <li><p><strong>input</strong> (<em>Block</em>) the first input tensor</p></li>
<li><p><strong>other</strong> (<em>Block</em>) the second input block</p></li> <li><p><strong>other</strong> (<em>Block</em>) the second input tensor</p></li>
</ul> </ul>
</dd> </dd>
</dl> </dl>

View File

@@ -203,7 +203,7 @@
<dd><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p> <dd><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><p><strong>x</strong> (<em>Block</em>) the input block</p> <dd class="field-odd"><p><strong>x</strong> (<em>Block</em>) the input tensor</p>
</dd> </dd>
</dl> </dl>
</dd></dl> </dd></dl>
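A sketch of tl.ravel flattening a 2D tile back to 1D inside a kernel; the names and the 16x16 tile are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def ravel_demo_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    idx = offs[:, None] * BLOCK + offs[None, :]
    # Load a [BLOCK, BLOCK] tile (row-major) ...
    tile = tl.load(x_ptr + idx)
    # ... flatten it to a [BLOCK * BLOCK] block ...
    flat = tl.ravel(tile)
    # ... and store it through equally flattened offsets.
    tl.store(out_ptr + tl.ravel(idx), flat)


x = torch.randn(16, 16, device='cuda')
out = torch.empty(16 * 16, device='cuda')
ravel_demo_kernel[(1,)](x, out, BLOCK=16)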

View File

@@ -200,11 +200,11 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.reshape"> <dt class="sig sig-object py" id="triton.language.reshape">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">reshape</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.reshape" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">reshape</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">shape</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.reshape" title="Permalink to this definition"></a></dt>
<dd><p>Tries to reshape the given block to a new shape.</p> <dd><p>Tries to reshape the given tensor to a new shape.</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>input</strong> The input block.</p></li> <li><p><strong>input</strong> The input tensor.</p></li>
<li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li> <li><p><strong>shape</strong> (<em>Tuple</em><em>[</em><em>int</em><em>]</em>) The desired shape.</p></li>
</ul> </ul>
</dd> </dd>
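A sketch of tl.reshape reinterpreting a flat block as a row and as a column so the two can broadcast against each other; names and sizes are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def reshape_demo_kernel(x_ptr, out_ptr, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    x = tl.load(x_ptr + offs)
    # The new shapes must keep the element count unchanged.
    row = tl.reshape(x, (1, BLOCK))
    col = tl.reshape(x, (BLOCK, 1))
    # Broadcasting the row against the column yields a [BLOCK, BLOCK]
    # table of pairwise sums.
    table = row + col
    tl.store(out_ptr + offs[:, None] * BLOCK + offs[None, :], table)


x = torch.randn(16, device='cuda')
out = torch.empty(16, 16, device='cuda')
reshape_demo_kernel[(1,)](x, out, BLOCK=16)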

View File

@@ -203,7 +203,7 @@
<h1>triton.language.softmax<a class="headerlink" href="#triton-language-softmax" title="Permalink to this headline"></a></h1> <h1>triton.language.softmax<a class="headerlink" href="#triton-language-softmax" title="Permalink to this headline"></a></h1>
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.softmax"> <dt class="sig sig-object py" id="triton.language.softmax">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ieee_rounding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ieee_rounding</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition"></a></dt>
<dd><p>Computes the element-wise softmax of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p> <dd><p>Computes the element-wise softmax of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
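A hedged sketch of tl.softmax applied to one row per program with the default ieee_rounding; the -inf padding and all names are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def softmax_row_kernel(x_ptr, out_ptr, n_cols, BLOCK: tl.constexpr):
    row = tl.program_id(axis=0)
    offs = tl.arange(0, BLOCK)
    mask = offs < n_cols
    # Padded lanes hold -inf, so they contribute exp(-inf) = 0 to the sum.
    x = tl.load(x_ptr + row * n_cols + offs, mask=mask, other=-float('inf'))
    y = tl.softmax(x)
    tl.store(out_ptr + row * n_cols + offs, y, mask=mask)


x = torch.randn(4, 100, device='cuda')
out = torch.empty_like(x)
softmax_row_kernel[(4,)](x, out, x.shape[1], BLOCK=128)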

View File

@@ -201,13 +201,13 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.store"> <dt class="sig sig-object py" id="triton.language.store">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">store</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">value</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.store" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">store</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">value</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.store" title="Permalink to this definition"></a></dt>
<dd><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> block of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p> <dd><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> tensor of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
<p><code class="code docutils literal notranslate"><span class="pre">value</span></code> is implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code> and typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p> <p><code class="code docutils literal notranslate"><span class="pre">value</span></code> is implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code> and typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
<li><p><strong>pointer</strong> (<em>Block of dtype=triton.PointerDType</em>) The memory locations where the elements of <code class="code docutils literal notranslate"><span class="pre">value</span></code> are stored.</p></li> <li><p><strong>pointer</strong> (<em>Block of dtype=triton.PointerDType</em>) The memory locations where the elements of <code class="code docutils literal notranslate"><span class="pre">value</span></code> are stored.</p></li>
<li><p><strong>value</strong> (<em>Block</em>) The block of elements to be stored.</p></li> <li><p><strong>value</strong> (<em>Block</em>) The tensor of elements to be stored.</p></li>
<li><p><strong>mask</strong> (<em>Block of triton.int1</em><em>, </em><em>optional</em>) If mask[idx] is false, do not store <code class="code docutils literal notranslate"><span class="pre">value[idx]</span></code> at <code class="code docutils literal notranslate"><span class="pre">pointer[idx]</span></code>.</p></li> <li><p><strong>mask</strong> (<em>Block of triton.int1</em><em>, </em><em>optional</em>) If mask[idx] is false, do not store <code class="code docutils literal notranslate"><span class="pre">value[idx]</span></code> at <code class="code docutils literal notranslate"><span class="pre">pointer[idx]</span></code>.</p></li>
</ul> </ul>
</dd> </dd>
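The implicit broadcast and typecast described above make tl.store convenient for filling a buffer with a scalar; a sketch with illustrative names:

import torch

import triton
import triton.language as tl


@triton.jit
def fill_kernel(out_ptr, value, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(axis=0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    # `value` is a scalar: it is broadcast to the shape of the pointer block
    # and cast to the pointee dtype before being written.
    tl.store(out_ptr + offs, value, mask=mask)


out = torch.empty(1000, device='cuda')
grid = lambda meta: (triton.cdiv(out.numel(), meta['BLOCK']),)
fill_kernel[grid](out, 3.14, out.numel(), BLOCK=256)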

View File

@@ -200,7 +200,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.sum"> <dt class="sig sig-object py" id="triton.language.sum">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">sum</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.sum" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">sum</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">axis</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.sum" title="Permalink to this definition"></a></dt>
<dd><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p> <dd><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
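A sketch of tl.sum reducing element-wise products into a dot product; a single program handles the whole (small) vector here, and all names are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def dot_product_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask, other=0.0)
    y = tl.load(y_ptr + offs, mask=mask, other=0.0)
    # Reduce the element-wise products along axis 0 to one scalar.
    tl.store(out_ptr, tl.sum(x * y, axis=0))


x = torch.randn(100, device='cuda')
y = torch.randn(100, device='cuda')
out = torch.empty(1, device='cuda')
dot_product_kernel[(1,)](x, y, out, x.numel(), BLOCK=128)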

View File

@@ -198,7 +198,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.where"> <dt class="sig sig-object py" id="triton.language.where">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">where</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">condition</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.where" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">where</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">condition</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">y</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.where" title="Permalink to this definition"></a></dt>
<dd><p>Returns a block of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p> <dd><p>Returns a tensor of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p>
<p>Note that <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are always evaluated regardless of the value of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p> <p>Note that <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are always evaluated regardless of the value of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p>
<p>If you want to avoid unintended memory operations, use the <code class="code docutils literal notranslate"><span class="pre">mask</span></code> arguments in <cite>triton.load</cite> and <cite>triton.store</cite> instead.</p> <p>If you want to avoid unintended memory operations, use the <code class="code docutils literal notranslate"><span class="pre">mask</span></code> arguments in <cite>triton.load</cite> and <cite>triton.store</cite> instead.</p>
<p>The shapes of <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are both broadcast to the shape of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>. <p>The shapes of <code class="code docutils literal notranslate"><span class="pre">x</span></code> and <code class="code docutils literal notranslate"><span class="pre">y</span></code> are both broadcast to the shape of <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.
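A sketch of tl.where used as an element-wise select (here a ReLU); since both branches are always evaluated, they are kept free of unguarded memory accesses. Names are illustrative:

import torch

import triton
import triton.language as tl


@triton.jit
def relu_kernel(x_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    offs = tl.program_id(axis=0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n_elements
    x = tl.load(x_ptr + offs, mask=mask)
    # Keep x where it is positive, 0.0 elsewhere; the scalar broadcasts
    # to the shape of the condition.
    tl.store(out_ptr + offs, tl.where(x > 0, x, 0.0), mask=mask)


x = torch.randn(1000, device='cuda')
out = torch.empty_like(x)
grid = lambda meta: (triton.cdiv(x.numel(), meta['BLOCK']),)
relu_kernel[grid](x, out, x.numel(), BLOCK=256)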

View File

@@ -199,7 +199,7 @@
<dl class="py function"> <dl class="py function">
<dt class="sig sig-object py" id="triton.language.zeros"> <dt class="sig sig-object py" id="triton.language.zeros">
<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">zeros</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">shape</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.zeros" title="Permalink to this definition"></a></dt> <span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">zeros</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">shape</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">dtype</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.zeros" title="Permalink to this definition"></a></dt>
<dd><p>Returns a block filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p> <dd><p>Returns a tensor filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p>
<dl class="field-list simple"> <dl class="field-list simple">
<dt class="field-odd">Parameters</dt> <dt class="field-odd">Parameters</dt>
<dd class="field-odd"><ul class="simple"> <dd class="field-odd"><ul class="simple">
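tl.zeros is the usual way to seed an accumulator, since its shape and dtype must be known at compile time; a sketch with illustrative names that sums a long vector in BLOCK-sized strides:

import torch

import triton
import triton.language as tl


@triton.jit
def strided_sum_kernel(x_ptr, out_ptr, n_elements, BLOCK: tl.constexpr):
    # Float32 accumulator of compile-time shape, initialized to zero.
    acc = tl.zeros((BLOCK,), dtype=tl.float32)
    offs = tl.arange(0, BLOCK)
    for start in range(0, n_elements, BLOCK):
        mask = start + offs < n_elements
        acc += tl.load(x_ptr + start + offs, mask=mask, other=0.0)
    # One partial sum per lane; finish the reduction on the host.
    tl.store(out_ptr + offs, acc)


x = torch.randn(10000, device='cuda')
partial = torch.empty(256, device='cuda')
strided_sum_kernel[(1,)](x, partial, x.numel(), BLOCK=256)
total = partial.sum()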

View File

@@ -278,7 +278,7 @@
<td><p>Returns contiguous values within the half-open interval [<code class="code docutils literal notranslate"><span class="pre">start</span></code>, <code class="code docutils literal notranslate"><span class="pre">end</span></code>).</p></td> <td><p>Returns contiguous values within the half-open interval [<code class="code docutils literal notranslate"><span class="pre">start</span></code>, <code class="code docutils literal notranslate"><span class="pre">end</span></code>).</p></td>
</tr> </tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.zeros.html#triton.language.zeros" title="triton.language.zeros"><code class="xref py py-obj docutils literal notranslate"><span class="pre">zeros</span></code></a></p></td> <tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.zeros.html#triton.language.zeros" title="triton.language.zeros"><code class="xref py py-obj docutils literal notranslate"><span class="pre">zeros</span></code></a></p></td>
<td><p>Returns a block filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p></td> <td><p>Returns a tensor filled with the scalar value 0 for the given <code class="code docutils literal notranslate"><span class="pre">shape</span></code> and <code class="code docutils literal notranslate"><span class="pre">dtype</span></code>.</p></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@@ -292,10 +292,10 @@
</colgroup> </colgroup>
<tbody> <tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.broadcast_to.html#triton.language.broadcast_to" title="triton.language.broadcast_to"><code class="xref py py-obj docutils literal notranslate"><span class="pre">broadcast_to</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.broadcast_to.html#triton.language.broadcast_to" title="triton.language.broadcast_to"><code class="xref py py-obj docutils literal notranslate"><span class="pre">broadcast_to</span></code></a></p></td>
<td><p>Tries to broadcast the given block to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p></td> <td><p>Tries to broadcast the given tensor to a new <code class="code docutils literal notranslate"><span class="pre">shape</span></code>.</p></td>
</tr> </tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.reshape.html#triton.language.reshape" title="triton.language.reshape"><code class="xref py py-obj docutils literal notranslate"><span class="pre">reshape</span></code></a></p></td> <tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.reshape.html#triton.language.reshape" title="triton.language.reshape"><code class="xref py py-obj docutils literal notranslate"><span class="pre">reshape</span></code></a></p></td>
<td><p>Tries to reshape the given block to a new shape.</p></td> <td><p>Tries to reshape the given tensor to a new shape.</p></td>
</tr> </tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.ravel.html#triton.language.ravel" title="triton.language.ravel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ravel</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.ravel.html#triton.language.ravel" title="triton.language.ravel"><code class="xref py py-obj docutils literal notranslate"><span class="pre">ravel</span></code></a></p></td>
<td><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p></td> <td><p>Returns a contiguous flattened view of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p></td>
@@ -326,10 +326,10 @@
</colgroup> </colgroup>
<tbody> <tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.load.html#triton.language.load" title="triton.language.load"><code class="xref py py-obj docutils literal notranslate"><span class="pre">load</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.load.html#triton.language.load" title="triton.language.load"><code class="xref py py-obj docutils literal notranslate"><span class="pre">load</span></code></a></p></td>
<td><p>Return a block of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td> <td><p>Return a tensor of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
</tr> </tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.store.html#triton.language.store" title="triton.language.store"><code class="xref py py-obj docutils literal notranslate"><span class="pre">store</span></code></a></p></td> <tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.store.html#triton.language.store" title="triton.language.store"><code class="xref py py-obj docutils literal notranslate"><span class="pre">store</span></code></a></p></td>
<td><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> block of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td> <td><p>Stores <code class="code docutils literal notranslate"><span class="pre">value</span></code> tensor of elements in memory, element-wise, at the memory locations specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
</tr> </tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.atomic_cas.html#triton.language.atomic_cas" title="triton.language.atomic_cas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">atomic_cas</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.atomic_cas.html#triton.language.atomic_cas" title="triton.language.atomic_cas"><code class="xref py py-obj docutils literal notranslate"><span class="pre">atomic_cas</span></code></a></p></td>
<td><p>Performs an atomic compare-and-swap at the memory location specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td> <td><p>Performs an atomic compare-and-swap at the memory location specified by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p></td>
@@ -349,7 +349,7 @@
</colgroup> </colgroup>
<tbody> <tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.where.html#triton.language.where" title="triton.language.where"><code class="xref py py-obj docutils literal notranslate"><span class="pre">where</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.where.html#triton.language.where" title="triton.language.where"><code class="xref py py-obj docutils literal notranslate"><span class="pre">where</span></code></a></p></td>
<td><p>Returns a block of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p></td> <td><p>Returns a tensor of elements from either <code class="code docutils literal notranslate"><span class="pre">x</span></code> or <code class="code docutils literal notranslate"><span class="pre">y</span></code>, depending on <code class="code docutils literal notranslate"><span class="pre">condition</span></code>.</p></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>
@@ -395,13 +395,13 @@
</colgroup> </colgroup>
<tbody> <tbody>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.max.html#triton.language.max" title="triton.language.max"><code class="xref py py-obj docutils literal notranslate"><span class="pre">max</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.max.html#triton.language.max" title="triton.language.max"><code class="xref py py-obj docutils literal notranslate"><span class="pre">max</span></code></a></p></td>
<td><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td> <td><p>Returns the maximum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr> </tr>
<tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.min.html#triton.language.min" title="triton.language.min"><code class="xref py py-obj docutils literal notranslate"><span class="pre">min</span></code></a></p></td> <tr class="row-even"><td><p><a class="reference internal" href="generated/triton.language.min.html#triton.language.min" title="triton.language.min"><code class="xref py py-obj docutils literal notranslate"><span class="pre">min</span></code></a></p></td>
<td><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td> <td><p>Returns the minimum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr> </tr>
<tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.sum.html#triton.language.sum" title="triton.language.sum"><code class="xref py py-obj docutils literal notranslate"><span class="pre">sum</span></code></a></p></td> <tr class="row-odd"><td><p><a class="reference internal" href="generated/triton.language.sum.html#triton.language.sum" title="triton.language.sum"><code class="xref py py-obj docutils literal notranslate"><span class="pre">sum</span></code></a></p></td>
<td><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> block along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td> <td><p>Returns the sum of all elements in the <code class="code docutils literal notranslate"><span class="pre">input</span></code> tensor along the provided <code class="code docutils literal notranslate"><span class="pre">axis</span></code></p></td>
</tr> </tr>
</tbody> </tbody>
</table> </table>

File diff suppressed because one or more lines are too long

View File

@@ -1,4 +1,4 @@
# Sphinx build info version 1 # Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 27fe8ebc933e4ad0f9352faba2dbdccd config: cad12a755c359cf8dbd8410700935254
tags: 645f666f9bcd5a90fca523b33c5a78b7 tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.

Some files were not shown because too many files have changed in this diff.