From 92242ace2c0424c6f966caa034e9a5bb862b85dc Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sat, 6 Mar 2021 17:26:49 -0500
Subject: [PATCH] [DOCS] Re-structured documentation hierarchy

---
 docs/conf.py                            |   4 +-
 .../installation.rst}                   |  17 +-
 docs/index.rst                          |  22 ++-
 docs/installation/packaged-binaries.rst |   8 -
 docs/tutorials/01-vector-add.ipynb      | 158 ------------------
 python/tutorials/01-vector-add.py       |  44 ++---
 python/tutorials/02-fused-softmax.py    |  33 ++--
 python/tutorials/README.rst             |   4 +-
 8 files changed, 68 insertions(+), 222 deletions(-)
 rename docs/{installation/from-source.rst => getting-started/installation.rst} (86%)
 delete mode 100644 docs/installation/packaged-binaries.rst
 delete mode 100644 docs/tutorials/01-vector-add.ipynb

diff --git a/docs/conf.py b/docs/conf.py
index 85ebd6889..5fed1334a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,11 +35,13 @@ autosectionlabel_prefix_document = True
 
 # Sphinx gallery
 extensions += ['sphinx_gallery.gen_gallery']
+from sphinx_gallery.sorting import FileNameSortKey
 sphinx_gallery_conf = {
     'examples_dirs': '../python/tutorials/',
-    'gallery_dirs': 'tutorials',
+    'gallery_dirs': 'getting-started/tutorials',
     'filename_pattern': '',
     'ignore_pattern': r'__init__\.py',
+    'within_subsection_order': FileNameSortKey,
 }
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/installation/from-source.rst b/docs/getting-started/installation.rst
similarity index 86%
rename from docs/installation/from-source.rst
rename to docs/getting-started/installation.rst
index 021b9a81d..07c50ca06 100644
--- a/docs/installation/from-source.rst
+++ b/docs/getting-started/installation.rst
@@ -1,6 +1,21 @@
 ==============
+Installation
+==============
+
+--------------
+With Pip
+--------------
+
+Triton can be installed directly from pip with the following command:
+
+.. code-block:: bash
+
+    pip install triton
+
+
+--------------
 From Source
-==============
+--------------
+++++++++++++++
 Python Package
diff --git a/docs/index.rst b/docs/index.rst
index 77b005964..7d678df25 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,20 +1,18 @@
-.. Triton documentation master file, created by
-   sphinx-quickstart on Mon Feb 10 01:01:37 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 Welcome to Triton's documentation!
 ==================================
 
-.. toctree::
-   :maxdepth: 1
-   :caption: Installation Instructions
+Triton is an imperative language and compiler for parallel programming. It aims to provide a programming environment for productively writing custom DNN compute kernels capable of running at maximal throughput on modern GPU hardware.
 
-   installation/packaged-binaries
-   installation/from-source
+Getting Started
+---------------
+
+- Follow the :doc:`installation instructions <getting-started/installation>` for your platform of choice.
+- Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first Triton program.
 
 .. toctree::
    :maxdepth: 1
-   :caption: Installation Instructions
+   :caption: Getting Started
+   :hidden:
 
-   tutorials/index
\ No newline at end of file
+   getting-started/installation
+   getting-started/tutorials/index
\ No newline at end of file
diff --git a/docs/installation/packaged-binaries.rst b/docs/installation/packaged-binaries.rst
deleted file mode 100644
index fdcddd887..000000000
--- a/docs/installation/packaged-binaries.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Packaged Binaries
-=================
-
-Triton can be installed directly from pip with the following command
-
-.. code-block:: python
-
-    pip install triton
\ No newline at end of file
diff --git a/docs/tutorials/01-vector-add.ipynb b/docs/tutorials/01-vector-add.ipynb
deleted file mode 100644
index 6856022dc..000000000
--- a/docs/tutorials/01-vector-add.ipynb
+++ /dev/null
@@ -1,158 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "\n# Vector Addition\nIn this tutorial, we will see how to construct a simple, high-performance vector addition using Triton. You will learn:\n* The basic syntax of the Triton programming language\n* The best practices for creating PyTorch custom operators using the `triton.kernel` Python API\n* The best practices for validating and benchmarking custom ops against native reference implementations\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing the Compute Kernel\n\nEach compute kernel is declared using the :code:`__global__` attribute, and executed many times in parallel\non different chunks of data (See the `Single Program, Multiple Data <(https://en.wikipedia.org/wiki/SPMD>`_)\nprogramming model for more details).\n\n .. code-block:: C\n\n __global__ void add(float* z, float* x, float* y, int N){\n // The `get_program_id(i)` returns the i-th coordinate\n // of the program in the overaching SPMD context\n // (a.k.a launch grid). This is what allows us to process\n // different chunks of data in parallel.\n // For those similar with CUDA, `get_program_id({0,1,2})`\n // is similar to blockIdx.{x,y,z}\n int pid = get_program_id(0);\n // In Triton, arrays are first-class citizen. In other words,\n // they are primitives data-types and are -- contrary to C and\n // CUDA -- not implemented as pointers to contiguous chunks of\n // memory.\n // In the few lines below, we create an array of `BLOCK` pointers\n // whose memory values are, e.g.:\n // [z + pid*BLOCK + 0, z + pid*BLOCK + 1, ..., z + pid*BLOCK + BLOCK - 1]\n // Note: here BLOCK is expected to be a pre-processor macro defined at compile-time\n int offset[BLOCK] = pid * BLOCK + 0 ... BLOCK;\n float* pz [BLOCK] = z + offset;\n float* px [BLOCK] = x + offset;\n float* py [BLOCK] = y + offset;\n // Simple element-wise control-flow for load/store operations can\n // be achieved using the the ternary operator `cond ? val_true : val_false`\n // or the conditional dereferencing operator `*?(cond)ptr\n // Here, we make sure that we do not access memory out-of-bounds when we\n // write-back `z`\n bool check[BLOCK] = offset < N;\n *?(check)pz = *?(check)px + *?(check)py;\n }\n\nThe existence of arrays as a primitive data-type for Triton comes with a number of advantages that are highlighted in the `MAPL'2019 Triton paper `_.\n\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing the Torch bindings\nThe only thing that matters when it comes to Triton and Torch is the `triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify `torch.tensor` objects.\n\nTo create a `triton.kernel`, you only need three things:\n- `source: string`: the source-code of the kernel you want to create\n- `device: torch.device`: the device you want to compile this code for\n- `defines: dict`: the set of macros that you want the pre-processor to `#define` for you\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "import torch\nimport triton"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "source-code for Triton compute kernel\nhere we just copy-paste the above code without the extensive comments.\nyou may prefer to store it in a .c file and load it from there instead.\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "_src = \"\"\"\n__global__ void add(float* z, float* x, float* y, int N){\n // program id\n int pid = get_program_id(0);\n // create arrays of pointers\n int offset[BLOCK] = pid * BLOCK + 0 ... BLOCK;\n float* pz[BLOCK] = z + offset;\n float* px[BLOCK] = x + offset;\n float* py[BLOCK] = y + offset;\n // bounds checking\n bool check[BLOCK] = offset < N;\n // write-back\n *?(check)pz = *?(check)px + *?(check)py;\n}\n \"\"\""
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "This function returns a callable `triton.kernel` object\ncreated from the above source code.\nFor portability, we maintain a cache of kernels for different `torch.device`\nWe compile the kernel with -DBLOCK=1024\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "def make_add_kernel(device):\n cache = make_add_kernel.cache\n if device not in cache:\n defines = {'BLOCK': 1024}\n cache[device] = triton.kernel(_src, device=device, defines=defines)\n return cache[device]\n\n\nmake_add_kernel.cache = dict()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "This is a standard torch custom autograd Function\nThe only difference is that we can now use the above kernel\nin the `forward` and `backward` functions.`\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "class _add(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x, y):\n # constraints of the op\n assert x.dtype == torch.float32\n # *allocate output*\n z = torch.empty_like(x)\n # *create launch grid*:\n # this is a function which takes compilation parameters `opt`\n # as input and returns a tuple of int (i.e., launch grid) for the kernel.\n # triton.cdiv is a shortcut for ceil division:\n # triton.cdiv(a, b) = (a + b - 1) // b\n N = z.shape[0]\n grid = lambda opt: (triton.cdiv(N, opt.BLOCK), )\n # *launch kernel*:\n # pointer to the data of torch tensors can be retrieved with\n # the `.data_ptr()` method\n kernel = make_add_kernel(z.device)\n kernel(z.data_ptr(), x.data_ptr(), y.data_ptr(), N, grid=grid)\n return z\n\n\n# Just like we standard PyTorch ops We use the `.apply` method to create a callable object for our function\nadd = _add.apply"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing a Unit Test\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "torch.manual_seed(0)\nx = torch.rand(98432, device='cuda')\ny = torch.rand(98432, device='cuda')\nza = x + y\nzb = add(x, y)\nprint(za)\nprint(zb)\nprint(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing a Benchmark\nWe can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "warmup = 10\nrep = 200\nfor N in [2**i for i in range(17, 26, 1)]:\n x = torch.rand(N, device='cuda')\n y = torch.rand(N, device='cuda')\n triton_ms = triton.testing.do_bench(lambda: add(x, y), warmup=warmup, rep=rep)\n torch_ms = triton.testing.do_bench(lambda: x + y, warmup=warmup, rep=rep)\n # print the performance of triton and torch as well as the achieved bandwidth\n print(f'{N} {triton_ms:.3f} {torch_ms:.3f}')"
-      ]
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.7.9"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
index 732a8845a..e99216788 100644
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -1,14 +1,15 @@
 """
 Vector Addition
 =================
-In this tutorial, we will see how to construct a simple, high-performance vector addition using Triton. You will learn:
-* The basic syntax of the Triton programming language
-* The best practices for creating PyTorch custom operators using the `triton.kernel` Python API
-* The best practices for validating and benchmarking custom ops against native reference implementations
+In this tutorial, you will write a simple, high-performance vector addition using Triton and learn about:
+
+- The basic syntax of the Triton programming language
+- The best practices for creating PyTorch custom operators using the :code:`triton.kernel` Python API
+- The best practices for validating and benchmarking custom ops against native reference implementations
 """
 
 # %%
-# Writing the Compute Kernel
+# Compute Kernel
 # --------------------------
 #
 # Each compute kernel is declared using the :code:`__global__` attribute, and executed many times in parallel
@@ -49,23 +50,20 @@ In this tutorial, we will see how to construct a simple, high-performance vector
 # The existence of arrays as a primitive data-type for Triton comes with a number of advantages that are highlighted in the `MAPL'2019 Triton paper `_.
 
 # %%
-# Writing the Torch bindings
+# Torch bindings
 # --------------------------
-# The only thing that matters when it comes to Triton and Torch is the `triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify `torch.tensor` objects.
+# The only thing that matters when it comes to Triton and Torch is the :code:`triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify :code:`torch.tensor` objects. To create a :code:`triton.kernel`, you only need three things:
 #
-# To create a `triton.kernel`, you only need three things:
-# - `source: string`: the source-code of the kernel you want to create
-# - `device: torch.device`: the device you want to compile this code for
-# - `defines: dict`: the set of macros that you want the pre-processor to `#define` for you
+# - :code:`source: string`: the source-code of the kernel you want to create
+# - :code:`device: torch.device`: the device you want to compile this code for
+# - :code:`defines: dict`: the set of macros that you want the pre-processor to `#define` for you
 
 import torch
 import triton
 
-# %%
 # source-code for Triton compute kernel
 # here we just copy-paste the above code without the extensive comments.
 # you may prefer to store it in a .c file and load it from there instead.
-
 _src = """
 __global__ void add(float* z, float* x, float* y, int N){
     // program id
@@ -82,13 +80,10 @@ __global__ void add(float* z, float* x, float* y, int N){
 }
 """
 
-# %%
-# This function returns a callable `triton.kernel` object
-# created from the above source code.
+
+# This function returns a callable `triton.kernel` object created from the above source code.
 # For portability, we maintain a cache of kernels for different `torch.device`
 # We compile the kernel with -DBLOCK=1024
-
-
 def make_add_kernel(device):
     cache = make_add_kernel.cache
     if device not in cache:
         defines = {'BLOCK': 1024}
@@ -99,12 +94,9 @@ def make_add_kernel(device):
 make_add_kernel.cache = dict()
 
-# %%
-# This is a standard torch custom autograd Function
-# The only difference is that we can now use the above kernel
-# in the `forward` and `backward` functions.`
-
+# This is a standard torch custom autograd Function;
+# the only difference is that we can now use the above kernel in the `forward` and `backward` functions.
 class _add(torch.autograd.Function):
     @staticmethod
     def forward(ctx, x, y):
@@ -127,11 +119,11 @@ class _add(torch.autograd.Function):
         return z
 
-# Just like we standard PyTorch ops We use the `.apply` method to create a callable object for our function
+# Just like with standard PyTorch ops, we use the :code:`.apply` method to create a callable object for our function
 add = _add.apply
 
 # %%
-# Writing a Unit Test
+# Unit Test
 # --------------------------
 torch.manual_seed(0)
 x = torch.rand(98432, device='cuda')
@@ -143,7 +135,7 @@ print(zb)
 print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')
 
 # %%
-# Writing a Benchmark
+# Benchmarking
 # --------------------------
 # We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does
 
diff --git a/python/tutorials/02-fused-softmax.py b/python/tutorials/02-fused-softmax.py
index dfbb273fe..f715e1af0 100644
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -1,16 +1,23 @@
 """
 Fused Softmax
 =================
+In this tutorial, you will write a fused softmax layer that outperforms PyTorch's implementation and learn about:
+
+- The benefits of kernel fusion for bandwidth-bound operations.
+- The syntax and usage of reduction operators in Triton.
+- The automatic vectorization capabilities of the Triton compiler.
 """
 
 # %%
+# Motivations
+# ------------
 # Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
 # Let us consider instead the case of a simple (numerically stabilized) softmax operation:
 
 import torch
 
-# Compute the row-wise softmax of x \in R^{M \times N}
+# Compute the row-wise softmax of x
 def naive_softmax(x):
     # read MN elements ; write M elements
     x_max = torch.max(x, axis=1)[0]
@@ -27,11 +34,13 @@ def naive_softmax(x):
 
 # %%
-# When implemented naively in pytorch, computing :math:`y` requires reading :math:`7MN` elements from DRAM and writing back :math:`3MN + 2M` elements.
-# Instead, we want to write a custom "fused" pytorch operators that only reads X once and does all the necessary computations on-chip. This would require reading and writing back only :math:`MN` bytes, so we could expect a theoretical speed-up of 5x. In practice, though, we expect less because our kernel will spend some time computing exponentials and moving data around in shared memory.
+# When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}` requires reading :math:`7MN` elements from DRAM and writing back :math:`3MN + 2M` elements.
+# Instead, we want to write a custom "fused" PyTorch operator that only reads X once and does all the necessary computations on-chip.
+# This would require reading and writing back only :math:`MN` elements in each direction, so we could expect a theoretical speed-up of 5x.
+# In practice, though, we expect less because our kernel will spend some time computing exponentials and moving data around in shared memory.
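+
+# %%
+# As a quick sanity check of this estimate, we can spell the arithmetic out in a few
+# lines of Python. This is only an illustrative sketch: the shape below is arbitrary,
+# and the counts simply restate the element counts quoted above.
+
+M, N = 4096, 2048                              # arbitrary example shape
+naive_traffic = 7 * M * N + 3 * M * N + 2 * M  # elements read + written by naive_softmax
+fused_traffic = M * N + M * N                  # fused kernel: read X once, write Y once
+print(f'expected speed-up: {naive_traffic / fused_traffic:.2f}x')  # ~5x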
 
 # %%
-# Writing the Compute Kernel
+# Compute Kernel
 # ----------------------------
 # Our softmax kernel works as follows: each program loads a row of X and writes back a normalized row of Y. Note that one important limitation of Triton is that each block must have a power-of-two number of elements, which means that we need to guard the memory operations properly if we want to handle any possible input shapes:
 #
@@ -69,14 +78,16 @@ def naive_softmax(x):
 #    }
 
 # %%
-# Writing the Compute Kernel
+# Torch Bindings
 # ----------------------------
+# We need to make sure that BLOCK is the smallest power of two
+# greater than the number of columns N of the input matrix.
+# Different values of BLOCK will result in different kernels
 
 import torch
 import triton
 
-# %%
-# source-code for Triton compute kernel
+# Source code for the Triton kernel
 _src = """
 __global__ void softmax(float* Y, float* X, int stride_ym, int stride_xm, int M, int N){
     int m = get_program_id(0);
@@ -94,12 +105,6 @@ __global__ void softmax(float* Y, float* X, int stride_ym, int stride_xm, int M,
 """
 
-# %%
-# Writing the Torch bindings
-# ----------------------------
-# We need to make sure that BLOCK is the smallest power of two
-# greater than the number of rows N of the input matrix.
-# Different values of BLOCK will result in different kernels
 def next_power_of_2(n):
     n -= 1
     n |= n >> 1
@@ -156,7 +161,7 @@ print(torch.allclose(y_tri, y_ref))
 # Seems to work!
 
 # %%
-# Benchmark
+# Benchmarking
 # ----------
 
 import matplotlib.pyplot as plt
diff --git a/python/tutorials/README.rst b/python/tutorials/README.rst
index f47d9864f..24c752842 100644
--- a/python/tutorials/README.rst
+++ b/python/tutorials/README.rst
@@ -1,4 +1,4 @@
-Triton Tutorials
+Tutorials
 ==================
 
-Below is a gallery of tutorials to help you get started with Triton.
\ No newline at end of file
+Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.
\ No newline at end of file
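
Note on the python/tutorials/02-fused-softmax.py hunk above: the diff context cuts
``next_power_of_2`` off after its first shift. For reference, a standard
bit-twiddling version of such a helper is sketched below; it is an illustration
that assumes ``n`` fits in 32 bits (true for any realistic ``BLOCK``), not
necessarily the exact code in the tree.

    def next_power_of_2(n):
        # smallest power of two >= n, assuming 0 < n < 2**32
        n -= 1
        n |= n >> 1
        n |= n >> 2
        n |= n >> 4
        n |= n >> 8
        n |= n >> 16
        n += 1
        return n

    assert next_power_of_2(1000) == 1024  # e.g. a 1000-column input gets BLOCK = 1024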