From 9a11a567ce63fe045bf9661de7409391d4d2d5c7 Mon Sep 17 00:00:00 2001 From: Chris Date: Sun, 9 Oct 2022 21:12:46 -0400 Subject: [PATCH] [DOCS] Fixed typos in 01-vector-add.py (#751) --- python/tutorials/01-vector-add.py | 70 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py index d684106f1..3e8236753 100644 --- a/python/tutorials/01-vector-add.py +++ b/python/tutorials/01-vector-add.py @@ -3,9 +3,9 @@ Vector Addition ================= In this tutorial, you will write a simple vector addition using Triton and learn about: -- The basic programming model of Triton +- The basic programming model of Triton. - The `triton.jit` decorator, which is used to define Triton kernels. -- The best practices for validating and benchmarking your custom ops against native reference implementations +- The best practices for validating and benchmarking your custom ops against native reference implementations. """ # %% @@ -20,51 +20,51 @@ import triton.language as tl @triton.jit def add_kernel( - x_ptr, # *Pointer* to first input vector - y_ptr, # *Pointer* to second input vector - output_ptr, # *Pointer* to output vector - n_elements, # Size of the vector - BLOCK_SIZE: tl.constexpr, # Number of elements each program should process - # NOTE: `constexpr` so it can be used as a shape value + x_ptr, # *Pointer* to first input vector. + y_ptr, # *Pointer* to second input vector. + output_ptr, # *Pointer* to output vector. + n_elements, # Size of the vector. + BLOCK_SIZE: tl.constexpr, # Number of elements each program should process. + # NOTE: `constexpr` so it can be used as a shape value. ): - # There are multiple 'program's processing different data. We identify which program - # we are here - pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0 + # There are multiple 'programs' processing different data. 
We identify which program + # we are here: + pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0. # This program will process inputs that are offset from the initial data. - # for instance, if you had a vector of length 256 and block_size of 64, the programs + # For instance, if you had a vector of length 256 and block_size of 64, the programs # would each access the elements [0:64, 64:128, 128:192, 192:256]. - # Note that offsets is a list of pointers + # Note that offsets is a list of element offsets, which are later added to the base pointers: block_start = pid * BLOCK_SIZE offsets = block_start + tl.arange(0, BLOCK_SIZE) - # Create a mask to guard memory operations against out-of-bounds accesses + # Create a mask to guard memory operations against out-of-bounds accesses. mask = offsets < n_elements # Load x and y from DRAM, masking out any extra elements in case the input is not a - # multiple of the block size + # multiple of the block size. x = tl.load(x_ptr + offsets, mask=mask) y = tl.load(y_ptr + offsets, mask=mask) output = x + y - # Write x + y back to DRAM + # Write x + y back to DRAM. tl.store(output_ptr + offsets, output, mask=mask) # %% # Let's also declare a helper function to (1) allocate the `z` tensor -# and (2) enqueue the above kernel with appropriate grid/block sizes. +# and (2) enqueue the above kernel with appropriate grid/block sizes: def add(x: torch.Tensor, y: torch.Tensor): - # We need to preallocate the output + # We need to preallocate the output. output = torch.empty_like(x) assert x.is_cuda and y.is_cuda and output.is_cuda n_elements = output.numel() # The SPMD launch grid denotes the number of kernel instances that run in parallel. - # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int] - # In this case, we use a 1D grid where the size is the number of blocks + # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]. 
+ # In this case, we use a 1D grid where the size is the number of blocks: grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),) # NOTE: - # - each torch.tensor object is implicitly converted into a pointer to its first element. - # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel - # - don't forget to pass meta-parameters as keywords arguments + # - Each torch.tensor object is implicitly converted into a pointer to its first element. + # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel. + # - Don't forget to pass meta-parameters as keyword arguments. add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024) # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still # running asynchronously at this point. @@ -94,24 +94,24 @@ print( # Benchmark # ----------- # We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch. -# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops +# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops # for different problem sizes. @triton.testing.perf_report( triton.testing.Benchmark( - x_names=['size'], # argument names to use as an x-axis for the plot + x_names=['size'], # Argument names to use as an x-axis for the plot. 
x_vals=[ 2 ** i for i in range(12, 28, 1) - ], # different possible values for `x_name` - x_log=True, # x axis is logarithmic - line_arg='provider', # argument name whose value corresponds to a different line in the plot - line_vals=['triton', 'torch'], # possible values for `line_arg` - line_names=['Triton', 'Torch'], # label name for the lines - styles=[('blue', '-'), ('green', '-')], # line styles - ylabel='GB/s', # label name for the y-axis - plot_name='vector-add-performance', # name for the plot. Used also as a file name for saving the plot. - args={}, # values for function arguments not in `x_names` and `y_name` + ], # Different possible values for `x_names`. + x_log=True, # x axis is logarithmic. + line_arg='provider', # Argument name whose value corresponds to a different line in the plot. + line_vals=['triton', 'torch'], # Possible values for `line_arg`. + line_names=['Triton', 'Torch'], # Label name for the lines. + styles=[('blue', '-'), ('green', '-')], # Line styles. + ylabel='GB/s', # Label name for the y-axis. + plot_name='vector-add-performance', # Name for the plot. Also used as a file name for saving the plot. + args={}, # Values for function arguments not in `x_names` and `y_name`. ) ) def benchmark(size, provider): @@ -127,5 +127,5 @@ def benchmark(size, provider): # %% # We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or -# `save_path='/path/to/results/' to save them to disk along with raw CSV data +# `save_path='/path/to/results/'` to save them to disk along with raw CSV data: benchmark.run(print_data=True, show_plots=True)