# triton/python/tutorials/01-vector-add.py

"""
Vector Addition
=================
In this tutorial, you will write a simple vector addition using Triton and learn about:

- The basic programming model of Triton
- The `triton.jit` decorator, which is used to define Triton kernels.
- The best practices for validating and benchmarking your custom ops against native reference implementations
"""

# %%
# Compute Kernel
# --------------------------

import torch

import triton
import triton.language as tl


@triton.jit
def add_kernel(
    x_ptr,  # *Pointer* to first input vector
    y_ptr,  # *Pointer* to second input vector
    output_ptr,  # *Pointer* to output vector
    n_elements,  # Number of elements to process; used as the upper bound for the mask
    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
    # NOTE: `constexpr` so it can be used as a shape value (see `tl.arange` below)
):
    # Element-wise addition: each program instance adds one BLOCK_SIZE-wide slice of
    # the two inputs and stores the result into the matching slice of the output.
    #
    # There are multiple 'program's processing different data. We identify which program
    # we are here
    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
    # This program will process inputs that are offset from the initial data.
    # For instance, if you had a vector of length 256 and a block size of 64, the programs
    # would each access the elements [0:64, 64:128, 128:192, 192:256].
    # Note that `offsets` is a block of element indices, not pointers; adding it to a
    # base pointer (as done in the loads and the store below) yields a block of pointers.
    block_start = pid * BLOCK_SIZE
    offsets = block_start + tl.arange(0, BLOCK_SIZE)
    # Create a mask to guard memory operations against out-of-bounds accesses
    mask = offsets < n_elements
    # Load x and y from DRAM, masking out any extra elements in case the input is not a
    # multiple of the block size
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    output = x + y
    # Write x + y back to DRAM, using the same mask as the loads
    tl.store(output_ptr + offsets, output, mask=mask)


# %%
# Let's also declare a helper function to (1) allocate the output tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes.


def add(x: torch.Tensor, y: torch.Tensor):
    """Return the element-wise sum of `x` and `y`, computed by `add_kernel`.

    The kernel addresses both inputs and the output linearly, starting from each
    tensor's first element, and bounds every access by the output's element count.
    `x` and `y` must therefore hold the same number of elements.

    NOTE(review): because of that linear addressing, non-contiguous inputs are
    presumably not handled correctly -- confirm before relying on it.

    Args:
        x: First input tensor; must be on a CUDA device.
        y: Second input tensor; must be on a CUDA device.

    Returns:
        A new tensor, allocated with `torch.empty_like(x)`, holding `x + y`.

    Raises:
        ValueError: If `x` and `y` do not hold the same number of elements.
        AssertionError: If `x`, `y` or the output is not a CUDA tensor.
    """
    # The mask inside the kernel is derived from the output size only, so a `y` with
    # fewer elements than `x` would be read out of bounds. Reject mismatches up front.
    if x.numel() != y.numel():
        raise ValueError(
            f"x and y must have the same number of elements, "
            f"got {x.numel()} and {y.numel()}"
        )
    # We need to preallocate the output
    output = torch.empty_like(x)
    assert x.is_cuda and y.is_cuda and output.is_cuda
    n_elements = output.numel()
    if n_elements == 0:
        # Nothing to compute: skip the launch rather than enqueue an empty grid.
        return output
    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
    # In this case, we use a 1D grid where the size is the number of blocks
    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
    # NOTE:
    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
    #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
    #  - don't forget to pass meta-parameters as keywords arguments
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    # We return a handle to `output` but, since `torch.cuda.synchronize()` hasn't been called,
    # the kernel is still running asynchronously at this point.
    return output


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:

# Seed the RNG so that the random inputs generated below are reproducible
torch.manual_seed(0)
# 98432 is not a multiple of the BLOCK_SIZE of 1024 that `add` launches with, so the
# last program only partially fills its block and the kernel's mask is exercised
size = 98432
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
# Reference result from PyTorch's own addition, then the result of our Triton-backed `add`
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
# Largest element-wise absolute deviation between the two results
print(
    f'The maximum difference between torch and triton is '
    f'{torch.max(torch.abs(output_torch - output_triton))}'
)

# %%
# Seems like we're good to go!

# %%
# Benchmark
# -----------
# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops
# for different problem sizes.


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['size'],  # argument names to use as an x-axis for the plot
        x_vals=[
            2 ** i for i in range(12, 28, 1)
        ],  # different possible values for `x_name`
        x_log=True,  # x axis is logarithmic
        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
        line_vals=['triton', 'torch'],  # possible values for `line_arg`
        line_names=['Triton', 'Torch'],  # label name for the lines
        styles=[('blue', '-'), ('green', '-')],  # line styles
        ylabel='GB/s',  # label name for the y-axis
        plot_name='vector-add-performance',  # name for the plot. Used also as a file name for saving the plot.
        args={},  # values for function arguments not in `x_names` and `y_name`
    )
)
def benchmark(size, provider):
    """Measure the memory throughput of adding two float32 vectors of `size` elements.

    Args:
        size: Number of elements in each input vector.
        provider: Which implementation to time: 'torch' for `x + y`, or 'triton'
            for the `add` helper defined in this file.

    Returns:
        A 3-tuple of throughputs in GB/s, derived from the three timings returned by
        `triton.testing.do_bench` (`ms`, `min_ms`, `max_ms`), in the order
        (from `ms`, from `max_ms`, from `min_ms`). A longer time means a lower
        throughput, so the second value is the low end and the third the high end.

    Raises:
        ValueError: If `provider` is neither 'torch' nor 'triton'.
    """
    x = torch.rand(size, device='cuda', dtype=torch.float32)
    y = torch.rand(size, device='cuda', dtype=torch.float32)
    if provider == 'torch':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
    elif provider == 'triton':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
    else:
        # Previously an unknown provider fell through both branches and failed later
        # with an unbound `ms`; fail early with a message naming the bad value.
        raise ValueError(
            f"unknown provider {provider!r}; expected 'torch' or 'triton'"
        )
    # One addition reads two vectors and writes one: 3 * size elements, each of
    # `element_size()` bytes (4 for float32, i.e. 12 * size bytes in total).
    bytes_moved = 3 * x.numel() * x.element_size()
    # Timings are taken to be in milliseconds (per their names): bytes/ms * 1e-6 == GB/s
    gbps = lambda t_ms: bytes_moved / t_ms * 1e-6
    return gbps(ms), gbps(max_ms), gbps(min_ms)


# %%
# We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
# `save_path='/path/to/results/'` to save them to disk along with raw CSV data
# benchmark.run(print_data=True, show_plots=True)
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`"""`
			`Vector Addition`
			`=================`
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`In this tutorial, you will write a simple vector addition using Triton and learn about:`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`- The basic programming model of Triton`
			- The `triton.jit` decorator, which is used to define Triton kernels.
			`- The best practices for validating and benchmarking your custom ops against native reference implementations`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`"""`

			`# %%`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00			`# Compute Kernel`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# --------------------------`

[DOCS] Updates and improvements (#87) 2021-04-22 10:27:02 -04:00			`import torch`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[DOCS] Updates and improvements (#87) 2021-04-22 10:27:02 -04:00			`import triton`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`import triton.language as tl`
[DOCS] Updates and improvements (#87) 2021-04-22 10:27:02 -04:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`@triton.jit`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`def add_kernel(`
			`x_ptr, # Pointer to first input vector`
			`y_ptr, # Pointer to second input vector`
			`output_ptr, # Pointer to output vector`
			`n_elements, # Size of the vector`
[LANG] Added support for constexpr (#361) 2021-10-30 00:32:58 -07:00			`BLOCK_SIZE: tl.constexpr, # Number of elements each program should process`
			# NOTE: `constexpr` so it can be used as a shape value
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`):`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`# There are multiple 'program's processing different data. We identify which program`
			`# we are here`
			`pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0`
			`# This program will process inputs that are offset from the initial data.`
			`# for instance, if you had a vector of length 256 and block_size of 64, the programs`
			`# would each access the elements [0:64, 64:128, 128:192, 192:256].`
			`# Note that offsets is a list of pointers`
			`block_start = pid * BLOCK_SIZE`
			`offsets = block_start + tl.arange(0, BLOCK_SIZE)`
			`# Create a mask to guard memory operations against out-of-bounds accesses`
			`mask = offsets < n_elements`
[TUTORIALS] Fix 01-vector-add.py typo (#406) 2021-12-29 18:09:34 -05:00			`# Load x and y from DRAM, masking out any extra elements in case the input is not a`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`# multiple of the block size`
			`x = tl.load(x_ptr + offsets, mask=mask)`
			`y = tl.load(y_ptr + offsets, mask=mask)`
			`output = x + y`
			`# Write x + y back to DRAM`
[LANG] Added seeded random number generation - philox (#261) 2021-09-02 22:02:40 -07:00			`tl.store(output_ptr + offsets, output, mask=mask)`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`# %%`
[GENERAL] Some minor improvements here and there to build systems and docs (#148) 2021-07-28 01:51:17 -07:00			# Let's also declare a helper function to (1) allocate the `z` tensor
			`# and (2) enqueue the above kernel with appropriate grid/block sizes.`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00

[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`def add(x: torch.Tensor, y: torch.Tensor):`
			`# We need to preallocate the output`
			`output = torch.empty_like(x)`
			`assert x.is_cuda and y.is_cuda and output.is_cuda`
[DOCS] use numel for num_elements in elementwise tutorial (#228) 2021-08-20 08:05:12 +05:30			`n_elements = output.numel()`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# The SPMD launch grid denotes the number of kernel instances that run in parallel.`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`# In this case, we use a 1D grid where the size is the number of blocks`
			`grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# NOTE:`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# - each torch.tensor object is implicitly converted into a pointer to its first element.`
[Triton-MLIR] Fix some typos (#874) Fix some typos 2022-11-14 10:15:53 +08:00			# - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# - don't forget to pass meta-parameters as keywords arguments`
[STYLE] check python with flake8 (#424) I've been using this locally to find errors without running tests, and now that we're using autopep8, it passes with minimal suppressions. This is also what turned up the issues with the tutorials, which were fixed in #422. 2022-01-07 15:28:36 -08:00			`add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# running asynchronously at this point.`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`return output`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# %%`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`torch.manual_seed(0)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`size = 98432`
			`x = torch.rand(size, device='cuda')`
			`y = torch.rand(size, device='cuda')`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`output_torch = x + y`
			`output_triton = add(x, y)`
			`print(output_torch)`
			`print(output_triton)`
			`print(`
			`f'The maximum difference between torch and triton is '`
			`f'{torch.max(torch.abs(output_torch - output_triton))}'`
			`)`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`# %%`
			`# Seems like we're good to go!`

[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# %%`
[DOCS] Added matrix multiplication tutorial 2021-03-14 18:49:59 -04:00			`# Benchmark`
			`# -----------`
[GENERAL] Some minor improvements here and there to build systems and docs (#148) 2021-07-28 01:51:17 -07:00			`# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			`# for different problem sizes.`


			`@triton.testing.perf_report(`
			`triton.testing.Benchmark(`
			`x_names=['size'], # argument names to use as an x-axis for the plot`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`x_vals=[`
			`2 ** i for i in range(12, 28, 1)`
			], # different possible values for `x_name`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			`x_log=True, # x axis is logarithmic`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`line_arg='provider', # argument name whose value corresponds to a different line in the plot`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			line_vals=['triton', 'torch'], # possible values for `line_arg`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`line_names=['Triton', 'Torch'], # label name for the lines`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`styles=[('blue', '-'), ('green', '-')], # line styles`
[DOCS] Improve tutorial readability (#185) 2021-08-05 12:27:06 -04:00			`ylabel='GB/s', # label name for the y-axis`
			`plot_name='vector-add-performance', # name for the plot. Used also as a file name for saving the plot.`
			args={}, # values for function arguments not in `x_names` and `y_name`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			`)`
			`)`
			`def benchmark(size, provider):`
			`x = torch.rand(size, device='cuda', dtype=torch.float32)`
			`y = torch.rand(size, device='cuda', dtype=torch.float32)`
			`if provider == 'torch':`
			`ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)`
			`if provider == 'triton':`
			`ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))`
			`gbps = lambda ms: 12 * size / ms * 1e-6`
			`return gbps(ms), gbps(max_ms), gbps(min_ms)`
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00

			`# %%`
[GENERAL] Some minor improvements here and there to build systems and docs (#148) 2021-07-28 01:51:17 -07:00			# We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			# `save_path='/path/to/results/' to save them to disk along with raw CSV data
bindings for ModuleOp 2022-03-30 13:32:52 +08:00			`# benchmark.run(print_data=True, show_plots=True)`