From 92242ace2c0424c6f966caa034e9a5bb862b85dc Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sat, 6 Mar 2021 17:26:49 -0500
Subject: [PATCH] [DOCS] Re-structured documentation hierarchy

---
 docs/conf.py                            |   4 +-
 .../installation.rst}                   |  17 +-
 docs/index.rst                          |  22 ++-
 docs/installation/packaged-binaries.rst |   8 -
 docs/tutorials/01-vector-add.ipynb      | 158 ------------------
 python/tutorials/01-vector-add.py       |  44 ++---
 python/tutorials/02-fused-softmax.py    |  33 ++--
 python/tutorials/README.rst             |   4 +-
 8 files changed, 68 insertions(+), 222 deletions(-)
 rename docs/{installation/from-source.rst => getting-started/installation.rst} (86%)
 delete mode 100644 docs/installation/packaged-binaries.rst
 delete mode 100644 docs/tutorials/01-vector-add.ipynb

diff --git a/docs/conf.py b/docs/conf.py
index 85ebd6889..5fed1334a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -35,11 +35,13 @@ autosectionlabel_prefix_document = True
 
 # Sphinx gallery
 extensions += ['sphinx_gallery.gen_gallery']
+from sphinx_gallery.sorting import FileNameSortKey
 sphinx_gallery_conf = {
     'examples_dirs': '../python/tutorials/',
-    'gallery_dirs': 'tutorials',
+    'gallery_dirs': 'getting-started/tutorials',
     'filename_pattern': '',
     'ignore_pattern': r'__init__\.py',
+    'within_subsection_order': FileNameSortKey,
 }
 
 # Add any paths that contain templates here, relative to this directory.
diff --git a/docs/installation/from-source.rst b/docs/getting-started/installation.rst
similarity index 86%
rename from docs/installation/from-source.rst
rename to docs/getting-started/installation.rst
index 021b9a81d..07c50ca06 100644
--- a/docs/installation/from-source.rst
+++ b/docs/getting-started/installation.rst
@@ -1,6 +1,21 @@
 ==============
+Installation
+==============
+
+--------------
+With Pip
+--------------
+
+Triton can be installed directly from pip with the following command:
+
+.. code-block:: bash
+
+    pip install triton
+
+
+--------------
 From Source
-==============
+--------------
+++++++++++++++
 Python Package
diff --git a/docs/index.rst b/docs/index.rst
index 77b005964..7d678df25 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,20 +1,18 @@
-.. Triton documentation master file, created by
-   sphinx-quickstart on Mon Feb 10 01:01:37 2020.
-   You can adapt this file completely to your liking, but it should at least
-   contain the root `toctree` directive.
-
 Welcome to Triton's documentation!
 ==================================
 
-.. toctree::
-   :maxdepth: 1
-   :caption: Installation Instructions
+Triton is an imperative language and compiler for parallel programming. It aims to provide a programming environment for productively writing custom DNN compute kernels capable of running at maximal throughput on modern GPU hardware.
 
-   installation/packaged-binaries
-   installation/from-source
+Getting Started
+---------------
+
+- Follow the :doc:`installation instructions <getting-started/installation>` for your platform of choice.
+- Take a look at the :doc:`tutorials <getting-started/tutorials/index>` to learn how to write your first Triton program.
 
 .. toctree::
    :maxdepth: 1
-   :caption: Installation Instructions
+   :caption: Getting Started
+   :hidden:
 
-   tutorials/index
\ No newline at end of file
+   getting-started/installation
+   getting-started/tutorials/index
\ No newline at end of file
diff --git a/docs/installation/packaged-binaries.rst b/docs/installation/packaged-binaries.rst
deleted file mode 100644
index fdcddd887..000000000
--- a/docs/installation/packaged-binaries.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-Packaged Binaries
-=================
-
-Triton can be installed directly from pip with the following command
-
-.. code-block:: python
-
-    pip install triton
\ No newline at end of file
diff --git a/docs/tutorials/01-vector-add.ipynb b/docs/tutorials/01-vector-add.ipynb
deleted file mode 100644
index 6856022dc..000000000
--- a/docs/tutorials/01-vector-add.ipynb
+++ /dev/null
@@ -1,158 +0,0 @@
-{
-  "cells": [
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "%matplotlib inline"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "\n# Vector Addition\nIn this tutorial, we will see how to construct a simple, high-performance vector addition using Triton. You will learn:\n* The basic syntax of the Triton programming language\n* The best practices for creating PyTorch custom operators using the `triton.kernel` Python API\n* The best practices for validating and benchmarking custom ops against native reference implementations\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing the Compute Kernel\n\nEach compute kernel is declared using the :code:`__global__` attribute, and executed many times in parallel\non different chunks of data (See the `Single Program, Multiple Data <(https://en.wikipedia.org/wiki/SPMD>`_)\nprogramming model for more details).\n\n .. code-block:: C\n\n __global__ void add(float* z, float* x, float* y, int N){\n // The `get_program_id(i)` returns the i-th coordinate\n // of the program in the overaching SPMD context\n // (a.k.a launch grid). This is what allows us to process\n // different chunks of data in parallel.\n // For those similar with CUDA, `get_program_id({0,1,2})`\n // is similar to blockIdx.{x,y,z}\n int pid = get_program_id(0);\n // In Triton, arrays are first-class citizen. In other words,\n // they are primitives data-types and are -- contrary to C and\n // CUDA -- not implemented as pointers to contiguous chunks of\n // memory.\n // In the few lines below, we create an array of `BLOCK` pointers\n // whose memory values are, e.g.:\n // [z + pid*BLOCK + 0, z + pid*BLOCK + 1, ..., z + pid*BLOCK + BLOCK - 1]\n // Note: here BLOCK is expected to be a pre-processor macro defined at compile-time\n int offset[BLOCK] = pid * BLOCK + 0 ... BLOCK;\n float* pz [BLOCK] = z + offset;\n float* px [BLOCK] = x + offset;\n float* py [BLOCK] = y + offset;\n // Simple element-wise control-flow for load/store operations can\n // be achieved using the the ternary operator `cond ? val_true : val_false`\n // or the conditional dereferencing operator `*?(cond)ptr\n // Here, we make sure that we do not access memory out-of-bounds when we\n // write-back `z`\n bool check[BLOCK] = offset < N;\n *?(check)pz = *?(check)px + *?(check)py;\n }\n\nThe existence of arrays as a primitive data-type for Triton comes with a number of advantages that are highlighted in the `MAPL'2019 Triton paper `_.\n\n"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing the Torch bindings\nThe only thing that matters when it comes to Triton and Torch is the `triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify `torch.tensor` objects.\n\nTo create a `triton.kernel`, you only need three things:\n- `source: string`: the source-code of the kernel you want to create\n- `device: torch.device`: the device you want to compile this code for\n- `defines: dict`: the set of macros that you want the pre-processor to `#define` for you\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "import torch\nimport triton"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "source-code for Triton compute kernel\nhere we just copy-paste the above code without the extensive comments.\nyou may prefer to store it in a .c file and load it from there instead.\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "_src = \"\"\"\n__global__ void add(float* z, float* x, float* y, int N){\n // program id\n int pid = get_program_id(0);\n // create arrays of pointers\n int offset[BLOCK] = pid * BLOCK + 0 ... BLOCK;\n float* pz[BLOCK] = z + offset;\n float* px[BLOCK] = x + offset;\n float* py[BLOCK] = y + offset;\n // bounds checking\n bool check[BLOCK] = offset < N;\n // write-back\n *?(check)pz = *?(check)px + *?(check)py;\n}\n \"\"\""
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "This function returns a callable `triton.kernel` object\ncreated from the above source code.\nFor portability, we maintain a cache of kernels for different `torch.device`\nWe compile the kernel with -DBLOCK=1024\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "def make_add_kernel(device):\n cache = make_add_kernel.cache\n if device not in cache:\n defines = {'BLOCK': 1024}\n cache[device] = triton.kernel(_src, device=device, defines=defines)\n return cache[device]\n\n\nmake_add_kernel.cache = dict()"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "This is a standard torch custom autograd Function\nThe only difference is that we can now use the above kernel\nin the `forward` and `backward` functions.`\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "class _add(torch.autograd.Function):\n @staticmethod\n def forward(ctx, x, y):\n # constraints of the op\n assert x.dtype == torch.float32\n # *allocate output*\n z = torch.empty_like(x)\n # *create launch grid*:\n # this is a function which takes compilation parameters `opt`\n # as input and returns a tuple of int (i.e., launch grid) for the kernel.\n # triton.cdiv is a shortcut for ceil division:\n # triton.cdiv(a, b) = (a + b - 1) // b\n N = z.shape[0]\n grid = lambda opt: (triton.cdiv(N, opt.BLOCK), )\n # *launch kernel*:\n # pointer to the data of torch tensors can be retrieved with\n # the `.data_ptr()` method\n kernel = make_add_kernel(z.device)\n kernel(z.data_ptr(), x.data_ptr(), y.data_ptr(), N, grid=grid)\n return z\n\n\n# Just like we standard PyTorch ops We use the `.apply` method to create a callable object for our function\nadd = _add.apply"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing a Unit Test\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "torch.manual_seed(0)\nx = torch.rand(98432, device='cuda')\ny = torch.rand(98432, device='cuda')\nza = x + y\nzb = add(x, y)\nprint(za)\nprint(zb)\nprint(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {},
-      "source": [
-        "## Writing a Benchmark\nWe can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does\n\n"
-      ]
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "collapsed": false
-      },
-      "outputs": [],
-      "source": [
-        "warmup = 10\nrep = 200\nfor N in [2**i for i in range(17, 26, 1)]:\n x = torch.rand(N, device='cuda')\n y = torch.rand(N, device='cuda')\n triton_ms = triton.testing.do_bench(lambda: add(x, y), warmup=warmup, rep=rep)\n torch_ms = triton.testing.do_bench(lambda: x + y, warmup=warmup, rep=rep)\n # print the performance of triton and torch as well as the achieved bandwidth\n print(f'{N} {triton_ms:.3f} {torch_ms:.3f}')"
-      ]
-    }
-  ],
-  "metadata": {
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "language_info": {
-      "codemirror_mode": {
-        "name": "ipython",
-        "version": 3
-      },
-      "file_extension": ".py",
-      "mimetype": "text/x-python",
-      "name": "python",
-      "nbconvert_exporter": "python",
-      "pygments_lexer": "ipython3",
-      "version": "3.7.9"
-    }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
\ No newline at end of file
diff --git a/python/tutorials/01-vector-add.py b/python/tutorials/01-vector-add.py
index 732a8845a..e99216788 100644
--- a/python/tutorials/01-vector-add.py
+++ b/python/tutorials/01-vector-add.py
@@ -1,14 +1,15 @@
 """
 Vector Addition
 =================
-In this tutorial, we will see how to construct a simple, high-performance vector addition using Triton. You will learn:
-* The basic syntax of the Triton programming language
-* The best practices for creating PyTorch custom operators using the `triton.kernel` Python API
-* The best practices for validating and benchmarking custom ops against native reference implementations
+In this tutorial, you will write a simple, high-performance vector addition using Triton and learn about:
+
+- The basic syntax of the Triton programming language
+- The best practices for creating PyTorch custom operators using the :code:`triton.kernel` Python API
+- The best practices for validating and benchmarking custom ops against native reference implementations
 """
 
 # %%
-# Writing the Compute Kernel
+# Compute Kernel
 # --------------------------
 #
 # Each compute kernel is declared using the :code:`__global__` attribute, and executed many times in parallel
@@ -49,23 +50,20 @@ In this tutorial, we will see how to construct a simple, high-performance vector
 # The existence of arrays as a primitive data-type for Triton comes with a number of advantages that are highlighted in the `MAPL'2019 Triton paper `_.
 
 # %%
-# Writing the Torch bindings
+# Torch bindings
 # --------------------------
-# The only thing that matters when it comes to Triton and Torch is the `triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify `torch.tensor` objects.
+# The only thing that matters when it comes to Triton and Torch is the :code:`triton.kernel` class. This allows you to transform the above C-like function into a callable python object that can be used to modify :code:`torch.tensor` objects. To create a :code:`triton.kernel`, you only need three things:
 #
-# To create a `triton.kernel`, you only need three things:
-# - `source: string`: the source-code of the kernel you want to create
-# - `device: torch.device`: the device you want to compile this code for
-# - `defines: dict`: the set of macros that you want the pre-processor to `#define` for you
+# - :code:`source: string`: the source-code of the kernel you want to create
+# - :code:`device: torch.device`: the device you want to compile this code for
+# - :code:`defines: dict`: the set of macros that you want the pre-processor to `#define` for you
 
 import torch
 import triton
 
-# %%
 # source-code for Triton compute kernel
 # here we just copy-paste the above code without the extensive comments.
 # you may prefer to store it in a .c file and load it from there instead.
-
 _src = """
 __global__ void add(float* z, float* x, float* y, int N){
     // program id
@@ -82,13 +80,10 @@ __global__ void add(float* z, float* x, float* y, int N){
 }
 """
 
-# %%
-# This function returns a callable `triton.kernel` object
-# created from the above source code.
+
+# This function returns a callable `triton.kernel` object created from the above source code.
 # For portability, we maintain a cache of kernels for different `torch.device`
 # We compile the kernel with -DBLOCK=1024
-
-
 def make_add_kernel(device):
     cache = make_add_kernel.cache
     if device not in cache:
         defines = {'BLOCK': 1024}
@@ -99,12 +94,9 @@ def make_add_kernel(device):
 make_add_kernel.cache = dict()
 
-# %%
-# This is a standard torch custom autograd Function
-# The only difference is that we can now use the above kernel
-# in the `forward` and `backward` functions.`
-
+# This is a standard torch custom autograd Function;
+# the only difference is that we can now use the above kernel in the `forward` and `backward` functions.
 class _add(torch.autograd.Function):
     @staticmethod
     def forward(ctx, x, y):
@@ -127,11 +119,11 @@ class _add(torch.autograd.Function):
         return z
 
-# Just like we standard PyTorch ops We use the `.apply` method to create a callable object for our function
+# Just like with standard PyTorch ops, we use the :code:`.apply` method to create a callable object for our function
 add = _add.apply
 
 # %%
-# Writing a Unit Test
+# Unit Test
 # --------------------------
 torch.manual_seed(0)
 x = torch.rand(98432, device='cuda')
@@ -143,7 +135,7 @@ print(zb)
 print(f'The maximum difference between torch and triton is ' f'{torch.max(torch.abs(za - zb))}')
 
 # %%
-# Writing a Benchmark
+# Benchmarking
 # --------------------------
 # We can now benchmark our custom op for vectors of increasing sizes to get a sense of how it does
 
diff --git a/python/tutorials/02-fused-softmax.py b/python/tutorials/02-fused-softmax.py
index dfbb273fe..f715e1af0 100644
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -1,16 +1,23 @@
 """
 Fused Softmax
 =================
+In this tutorial, you will write a fused softmax layer that outperforms PyTorch's implementation and learn about:
+
+- The benefits of kernel fusion for bandwidth-bound operations.
+- The syntax and usage of reduction operators in Triton.
+- The automatic vectorization capabilities of the Triton compiler.
 """
 
 # %%
+# Motivations
+# ------------
 # Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
 # Let us consider instead the case of a simple (numerically stabilized) softmax operation:
 
 import torch
 
-# Compute the row-wise softmax of x \in R^{M \times N}
+# Compute the row-wise softmax of x
 def naive_softmax(x):
     # read MN elements ; write M elements
     x_max = torch.max(x, axis=1)[0]
@@ -27,11 +34,13 @@ def naive_softmax(x):
 
 # %%
-# When implemented naively in pytorch, computing :math:`y` requires reading :math:`7MN` elements from DRAM and writing back :math:`3MN + 2M` elements.
-# Instead, we want to write a custom "fused" pytorch operators that only reads X once and does all the necessary computations on-chip. This would require reading and writing back only :math:`MN` bytes, so we could expect a theoretical speed-up of 5x. In practice, though, we expect less because our kernel will spend some time computing exponentials and moving data around in shared memory.
+# When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}` requires reading :math:`7MN` elements from DRAM and writing back :math:`3MN + 2M` elements.
+# Instead, we want to write a custom "fused" PyTorch operator that only reads X once and does all the necessary computations on-chip.
+# This would require reading and writing back only :math:`MN` elements in each direction, so we could expect a theoretical speed-up of 5x.
+# In practice, though, we expect less because our kernel will spend some time computing exponentials and moving data around in shared memory.
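+
+# %%
+# As a quick sanity check of this estimate, we can spell the arithmetic out in a few
+# lines of Python. This is only an illustrative sketch: the shape below is arbitrary,
+# and the counts simply restate the element counts quoted above.
+
+M, N = 4096, 2048                              # arbitrary example shape
+naive_traffic = 7 * M * N + 3 * M * N + 2 * M  # elements read + written by naive_softmax
+fused_traffic = M * N + M * N                  # fused kernel: read X once, write Y once
+print(f'expected speed-up: {naive_traffic / fused_traffic:.2f}x')  # ~5x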
 
 # %%
-# Writing the Compute Kernel
+# Compute Kernel
 # ----------------------------
 # Our softmax kernel works as follows: each program loads a row of X and writes back a normalized row of Y. Note that one important limitation of Triton is that each block must have a power-of-two number of elements, which means that we need to guard the memory operations properly if we want to handle any possible input shapes:
 #
@@ -69,14 +78,16 @@ def naive_softmax(x):
 #    }
 
 # %%
-# Writing the Compute Kernel
+# Torch Bindings
 # ----------------------------
+# We need to make sure that BLOCK is the smallest power of two
+# greater than the number of columns N of the input matrix.
+# Different values of BLOCK will result in different kernels
 
 import torch
 import triton
 
-# %%
-# source-code for Triton compute kernel
+# Source code for the Triton kernel
 _src = """
 __global__ void softmax(float* Y, float* X, int stride_ym, int stride_xm, int M, int N){
     int m = get_program_id(0);
@@ -94,12 +105,6 @@ __global__ void softmax(float* Y, float* X, int stride_ym, int stride_xm, int M,
 """
 
-# %%
-# Writing the Torch bindings
-# ----------------------------
-# We need to make sure that BLOCK is the smallest power of two
-# greater than the number of rows N of the input matrix.
-# Different values of BLOCK will result in different kernels
 def next_power_of_2(n):
     n -= 1
     n |= n >> 1
@@ -156,7 +161,7 @@ print(torch.allclose(y_tri, y_ref))
 # Seems to work!
 
 # %%
-# Benchmark
+# Benchmarking
 # ----------
 
 import matplotlib.pyplot as plt
diff --git a/python/tutorials/README.rst b/python/tutorials/README.rst
index f47d9864f..24c752842 100644
--- a/python/tutorials/README.rst
+++ b/python/tutorials/README.rst
@@ -1,4 +1,4 @@
-Triton Tutorials
+Tutorials
 ==================
 
-Below is a gallery of tutorials to help you get started with Triton.
\ No newline at end of file
+Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.
\ No newline at end of file
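
Note on the python/tutorials/02-fused-softmax.py hunk above: the diff context cuts
``next_power_of_2`` off after its first shift. For reference, a standard
bit-twiddling version of such a helper is sketched below; it is an illustration
that assumes ``n`` fits in 32 bits (true for any realistic ``BLOCK``), not
necessarily the exact code in the tree.

    def next_power_of_2(n):
        # smallest power of two >= n, assuming 0 < n < 2**32
        n -= 1
        n |= n >> 1
        n |= n >> 2
        n |= n >> 4
        n |= n >> 8
        n |= n >> 16
        n += 1
        return n

    assert next_power_of_2(1000) == 1024  # e.g. a 1000-column input gets BLOCK = 1024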