# triton/python/tutorials/01-vector-add.py

"""
Vector Addition
=================
In this tutorial, you will write a simple vector addition using Triton and learn about:

- The basic programming model of Triton
- The `triton.jit` decorator, which is used to define Triton kernels.
- The best practices for validating and benchmarking your custom ops against native reference implementations
"""

# %%
# Compute Kernel
# --------------------------

import torch
import triton.language as tl
import triton


@triton.jit
def _add(
    X,  # *Pointer* to first input vector
    Y,  # *Pointer* to second input vector
    Z,  # *Pointer* to output vector
    N,  # Size of the vector
    **meta  # Optional meta-parameters for the kernel
):
    # Element-wise vector addition kernel: Z[i] = X[i] + Y[i] for 0 <= i < N.
    # Each program instance processes one chunk of meta['BLOCK'] consecutive
    # elements, selected by its index along axis 0 of the launch grid.
    pid = tl.program_id(0)
    # Create an offset for the blocks of pointers to be
    # processed by this program instance
    offsets = pid * meta['BLOCK'] + tl.arange(0, meta['BLOCK'])
    # Create a mask to guard memory operations against
    # out-of-bounds accesses (the last block is only partially
    # full whenever N is not a multiple of BLOCK)
    mask = offsets < N
    # Load x and y; lanes outside the mask are not read
    x = tl.load(X + offsets, mask=mask)
    y = tl.load(Y + offsets, mask=mask)
    # Write back x + y. The store must use the same mask as the loads,
    # otherwise the tail of the last block is written past the end of Z.
    z = x + y
    tl.store(Z + offsets, z, mask=mask)


# %%
# Let's also declare a helper function to (1) allocate the output vector
# and (2) enqueue the above kernel.


def add(x, y):
    """Return the element-wise sum of `x` and `y`, computed by the `_add` Triton kernel.

    The output is allocated with `torch.empty_like(x)`, so it takes its shape,
    dtype and device from `x`. The kernel addresses the tensors as flat buffers
    of `x.numel()` elements.
    NOTE(review): this assumes `x`, `y` and the output are densely laid out in
    memory (e.g. contiguous) -- confirm before passing strided views.

    Raises:
        ValueError: if `y` has fewer elements than `x`, since the kernel would
            then read past the end of `y`.
    """
    z = torch.empty_like(x)
    # Total element count rather than shape[0], so tensors with more than one
    # dimension are processed in full.
    N = z.numel()
    if y.numel() < N:
        raise ValueError(
            f'y has {y.numel()} elements but at least {N} are required to match x'
        )
    # Nothing to compute; also avoids launching a kernel on an empty grid.
    if N == 0:
        return z
    # The SPMD launch grid denotes the number of kernel instances that run in parallel.
    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
    grid = lambda meta: (triton.cdiv(N, meta['BLOCK']), )
    # NOTE:
    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
    #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
    #  - don't forget to pass meta-parameters as keyword arguments
    _add[grid](x, y, z, N, BLOCK=1024)
    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
    # running asynchronously at this point.
    return z


# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:

# Seed the RNG so the random inputs are the same on every run.
torch.manual_seed(0)
# 98432 is not a multiple of the BLOCK=1024 used by `add`, so the last
# program instance only covers part of its block.
size = 98432
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
# Reference result from PyTorch (za) next to the Triton result (zb).
za = x + y
zb = add(x, y)
print(za)
print(zb)
print(f'The maximum difference between torch and triton is {(za - zb).abs().max()}')

# %%
# Seems like we're good to go!

# %%
# Benchmark
# -----------
# We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.
# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops
# for different problem sizes.


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=['size'],  # argument names to use as an x-axis for the plot
        x_vals=[2**i for i in range(12, 28, 1)],  # different possible values for `x_names`
        x_log=True,  # x axis is logarithmic
        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
        line_vals=['triton', 'torch'],  # possible values for `line_arg`
        line_names=["Triton", "Torch"],  # label name for the lines
        styles=[('blue', '-'), ('green', '-')],  # line styles
        ylabel="GB/s",  # label name for the y-axis
        plot_name="vector-add-performance",  # name for the plot. Used also as a file name for saving the plot.
        args={}  # values for function arguments not in `x_names` and `line_arg`
    )
)
def benchmark(size, provider):
    """Measure the throughput of adding two float32 vectors of length `size`.

    Args:
        size: number of elements in each input vector.
        provider: 'torch' to time `x + y`, 'triton' to time `add(x, y)`.

    Returns:
        A tuple of three throughputs in GB/s, computed from the `ms`, `max_ms`
        and `min_ms` timings returned by `triton.testing.do_bench` (in that
        order, so the second value is the lowest throughput and the third the
        highest).

    Raises:
        ValueError: if `provider` is neither 'torch' nor 'triton'.
    """
    x = torch.rand(size, device='cuda', dtype=torch.float32)
    y = torch.rand(size, device='cuda', dtype=torch.float32)
    if provider == 'torch':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
    elif provider == 'triton':
        ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
    else:
        # Previously this fell through and failed later with an unbound `ms`.
        raise ValueError(f'unknown provider: {provider!r}')
    # Each call reads x and y and writes z: three vectors of `size` elements.
    # For float32 this is the 12 * size bytes the formula needs.
    bytes_moved = 3 * x.numel() * x.element_size()
    # bytes / ms * 1e-6 == GB/s
    gbps = lambda ms: bytes_moved / ms * 1e-6
    # A longer time means lower throughput, hence max_ms before min_ms.
    return gbps(ms), gbps(max_ms), gbps(min_ms)


# %%
# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
# `save_path='/path/to/results/'` to save them to disk along with raw CSV data.
benchmark.run(print_data=True, show_plots=True)  # NOTE(review): presumably sweeps every `size` in `x_vals` for each provider in `line_vals` -- confirm against the triton.testing docs
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`"""`
			`Vector Addition`
			`=================`
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`In this tutorial, you will write a simple vector addition using Triton and learn about:`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`- The basic programming model of Triton`
			- The `triton.jit` decorator, which is used to define Triton kernels.
			`- The best practices for validating and benchmarking your custom ops against native reference implementations`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`"""`

			`# %%`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00			`# Compute Kernel`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# --------------------------`

[DOCS] Updates and improvements (#87) 2021-04-22 10:27:02 -04:00			`import torch`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`import triton.language as tl`
[DOCS] Updates and improvements (#87) 2021-04-22 10:27:02 -04:00			`import triton`

[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`@triton.jit`
			`def _add(`
			`X, # Pointer to first input vector`
			`Y, # Pointer to second input vector`
			`Z, # Pointer to output vector`
			`N, # Size of the vector`
			`**meta # Optional meta-parameters for the kernel`
			`):`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`pid = tl.program_id(0)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# Create an offset for the blocks of pointers to be`
			`# processed by this program instance`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`offsets = pid * meta['BLOCK'] + tl.arange(0, meta['BLOCK'])`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# Create a mask to guard memory operations against`
			`# out-of-bounds accesses`
			`mask = offsets < N`
			`# Load x`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`x = tl.load(X + offsets, mask=mask)`
			`y = tl.load(Y + offsets, mask=mask)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# Write back x + y`
			`z = x + y`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`tl.store(Z + offsets, z)`
[DOCS] Re-structured documentation hierarchy 2021-03-06 17:26:49 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`# %%`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# Let's also declare a helper function that to (1) allocate the output vector`
			`# and (2) enqueueing the above kernel.`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00

			`def add(x, y):`
			`z = torch.empty_like(x)`
			`N = z.shape[0]`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# The SPMD launch grid denotes the number of kernel instances that run in parallel.`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]`
			`grid = lambda meta: (triton.cdiv(N, meta['BLOCK']), )`
			`# NOTE:`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# - each torch.tensor object is implicitly converted into a pointer to its first element.`
			# - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# - don't forget to pass meta-parameters as keywords arguments`
			`_add[grid](x, y, z, N, BLOCK=1024)`
			# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# running asynchronously at this point.`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`return z`

[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# %%`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`torch.manual_seed(0)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`size = 98432`
			`x = torch.rand(size, device='cuda')`
			`y = torch.rand(size, device='cuda')`
[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`za = x + y`
			`zb = add(x, y)`
			`print(za)`
			`print(zb)`
			`print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')`

[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`# %%`
			`# Seems like we're good to go!`

[DOCS] Switched tutorials to Python and use Sphinx Gallery 2021-03-06 14:03:01 -05:00			`# %%`
[DOCS] Added matrix multiplication tutorial 2021-03-14 18:49:59 -04:00			`# Benchmark`
			`# -----------`
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00			`# We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does relative to PyTorch.`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			`# for different problem sizes.`


			`@triton.testing.perf_report(`
			`triton.testing.Benchmark(`
			`x_names=['size'], # argument names to use as an x-axis for the plot`
			x_vals=[2**i for i in range(12, 28, 1)], # different possible values for `x_name`
			`x_log=True, # x axis is logarithmic`
[PYTHON] Renamed triton.core -> triton.language (#92) 2021-04-23 17:18:14 -04:00			`line_arg='provider', # argument name whose value corresponds to a different line in the plot`
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			line_vals=['triton', 'torch'], # possible values for `line_arg`
			`line_names=["Triton", "Torch"], # label name for the lines`
			`styles=[('blue', '-'), ('green', '-')], # line styles`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			`ylabel="GB/s", # label name for the y-axis`
			`plot_name="vector-add-performance", # name for the plot. Used also as a file name for saving the plot.`
			args={} # values for function arguments not in `x_names` and `y_name`
			`)`
			`)`
			`def benchmark(size, provider):`
			`x = torch.rand(size, device='cuda', dtype=torch.float32)`
			`y = torch.rand(size, device='cuda', dtype=torch.float32)`
			`if provider == 'torch':`
			`ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)`
			`if provider == 'triton':`
			`ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))`
			`gbps = lambda ms: 12 * size / ms * 1e-6`
			`return gbps(ms), gbps(max_ms), gbps(min_ms)`
[DOCS] Improved tutorials documentation 2021-03-06 22:04:00 -05:00

			`# %%`
[DOCS] Improved plots in tutorials 2021-03-11 00:29:16 -05:00			# We can now run the decorated function above. Pass `show_plots=True` to see the plots and/or
			# `save_path='/path/to/results/' to save them to disk along with raw CSV data
[DOCS] Improved documentation and integration in CI (#139) 2021-07-22 22:45:19 -07:00			`benchmark.run(print_data=True, show_plots=True)`