From 9a11a567ce63fe045bf9661de7409391d4d2d5c7 Mon Sep 17 00:00:00 2001
From: Chris <Chris.W.Carucci@Gmail.com>
Date: Sun, 9 Oct 2022 21:12:46 -0400
Subject: [PATCH] [DOCS] Fixed typos in 01-vector-add.py (#751)

---
 python/tutorials/01-vector-add.py | 70 +++++++++++++++----------------
 1 file changed, 35 insertions(+), 35 deletions(-)

diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
index d684106f1..3e8236753 100644
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -3,9 +3,9 @@ Vector Addition
 =================
 In this tutorial, you will write a simple vector addition using Triton and learn about:
 
-- The basic programming model of Triton
+- The basic programming model of Triton.
 - The `triton.jit` decorator, which is used to define Triton kernels.
-- The best practices for validating and benchmarking your custom ops against native reference implementations
+- The best practices for validating and benchmarking your custom ops against native reference implementations.
 """
 
 # %%
@@ -20,51 +20,51 @@ import triton.language as tl
 
 @triton.jit
 def add_kernel(
-    x_ptr,  # *Pointer* to first input vector
-    y_ptr,  # *Pointer* to second input vector
-    output_ptr,  # *Pointer* to output vector
-    n_elements,  # Size of the vector
-    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
-                 # NOTE: `constexpr` so it can be used as a shape value
+    x_ptr,  # *Pointer* to first input vector.
+    y_ptr,  # *Pointer* to second input vector.
+    output_ptr,  # *Pointer* to output vector.
+    n_elements,  # Size of the vector.
+    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process.
+                 # NOTE: `constexpr` so it can be used as a shape value.
 ):
-    # There are multiple 'program's processing different data. We identify which program
-    # we are here
-    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
+    # There are multiple 'programs' processing different data. We identify which program
+    # we are here:
+    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0.
     # This program will process inputs that are offset from the initial data.
-    # for instance, if you had a vector of length 256 and block_size of 64, the programs
+    # For instance, if you had a vector of length 256 and block_size of 64, the programs
     # would each access the elements [0:64, 64:128, 128:192, 192:256].
-    # Note that offsets is a list of pointers
+    # Note that offsets is a block of element indices, which are added to the base pointers below:
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
-    # Create a mask to guard memory operations against out-of-bounds accesses
+    # Create a mask to guard memory operations against out-of-bounds accesses.
     mask = offsets < n_elements
     # Load x and y from DRAM, masking out any extra elements in case the input is not a
-    # multiple of the block size
+    # multiple of the block size.
     x = tl.load(x_ptr + offsets, mask=mask)
     y = tl.load(y_ptr + offsets, mask=mask)
     output = x + y
-    # Write x + y back to DRAM
+    # Write x + y back to DRAM.
     tl.store(output_ptr + offsets, output, mask=mask)
 
 
 # %%
 # Let's also declare a helper function to (1) allocate the `z` tensor
-# and (2) enqueue the above kernel with appropriate grid/block sizes.
+# and (2) enqueue the above kernel with appropriate grid/block sizes:
 
 
 def add(x: torch.Tensor, y: torch.Tensor):
-    # We need to preallocate the output
+    # We need to preallocate the output.
     output = torch.empty_like(x)
     assert x.is_cuda and y.is_cuda and output.is_cuda
     n_elements = output.numel()
     # The SPMD launch grid denotes the number of kernel instances that run in parallel.
-    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
-    # In this case, we use a 1D grid where the size is the number of blocks
+    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
+    # In this case, we use a 1D grid where the size is the number of blocks:
     grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
     # NOTE:
-    #  - each torch.tensor object is implicitly converted into a pointer to its first element.
-    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
-    #  - don't forget to pass meta-parameters as keywords arguments
+    #  - Each torch.tensor object is implicitly converted into a pointer to its first element.
+    #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel.
+    #  - Don't forget to pass meta-parameters as keyword arguments.
     add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
     # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
     # running asynchronously at this point.
@@ -94,24 +94,24 @@ print(
 # Benchmark
 # -----------
 # We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
-# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
+# To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of our custom ops
 # for different problem sizes.
 
 
 @triton.testing.perf_report(
     triton.testing.Benchmark(
-        x_names=['size'],  # argument names to use as an x-axis for the plot
+        x_names=['size'],  # Argument names to use as an x-axis for the plot.
         x_vals=[
             2 ** i for i in range(12, 28, 1)
-        ],  # different possible values for `x_name`
-        x_log=True,  # x axis is logarithmic
-        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
-        line_vals=['triton', 'torch'],  # possible values for `line_arg`
-        line_names=['Triton', 'Torch'],  # label name for the lines
-        styles=[('blue', '-'), ('green', '-')],  # line styles
-        ylabel='GB/s',  # label name for the y-axis
-        plot_name='vector-add-performance',  # name for the plot. Used also as a file name for saving the plot.
-        args={},  # values for function arguments not in `x_names` and `y_name`
+        ],  # Different possible values for `x_names`.
+        x_log=True,  # x axis is logarithmic.
+        line_arg='provider',  # Argument name whose value corresponds to a different line in the plot.
+        line_vals=['triton', 'torch'],  # Possible values for `line_arg`.
+        line_names=['Triton', 'Torch'],  # Label name for the lines.
+        styles=[('blue', '-'), ('green', '-')],  # Line styles.
+        ylabel='GB/s',  # Label name for the y-axis.
+        plot_name='vector-add-performance',  # Name for the plot. Used also as a file name for saving the plot.
+        args={},  # Values for function arguments not in `x_names` and `y_name`.
     )
 )
 def benchmark(size, provider):
@@ -127,5 +127,5 @@ def benchmark(size, provider):
 
 # %%
 # We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
-# `save_path='/path/to/results/' to save them to disk along with raw CSV data
+# `save_path='/path/to/results/'` to save them to disk along with raw CSV data:
 benchmark.run(print_data=True, show_plots=True)