[GH-PAGES] Updated website

Philippe Tillet
2022-02-08 23:45:21 +00:00
parent 95bb988ed0
commit 989c163b13
278 changed files with 7571 additions and 106 deletions

Binary file not shown.

Binary file not shown.

File diff suppressed because one or more lines are too long


@@ -1,4 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: cac7e7cece053880c1040a240b480ea1
+config: 0b3571d18ebacb7d725e34a920cd2c6d
tags: 645f666f9bcd5a90fca523b33c5a78b7

Binary file not shown.


@@ -0,0 +1,161 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Fused Softmax\nIn this tutorial, you will write a fused softmax operation that is significantly faster\nthan PyTorch's native op for a particular class of matrices: those whose rows can fit in\nthe GPU's SRAM.\nYou will learn about:\n\n- The benefits of kernel fusion for bandwidth-bound operations.\n- Reduction operators in Triton.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Motivations\nCustom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.\nLet us consider instead the case of a simple (numerically stabilized) softmax operation:\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch\n\n\n@torch.jit.script\ndef naive_softmax(x):\n \"\"\"Compute row-wise softmax of X using native pytorch\n\n We subtract the maximum element in order to avoid overflows. Softmax is invariant to\n this shift.\n \"\"\"\n # read MN elements ; write M elements\n x_max = x.max(dim=1)[0]\n # read MN + M elements ; write MN elements\n z = x - x_max[:, None]\n # read MN elements ; write MN elements\n numerator = torch.exp(z)\n # read MN elements ; write M elements\n denominator = numerator.sum(dim=1)\n # read MN + M elements ; write MN elements\n ret = numerator / denominator[:, None]\n # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements\n return ret"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for $x \\in R^{M \\times N}$\nrequires reading $5MN + 2M$ elements from DRAM and writing back $3MN + 2M$ elements.\nThis is obviously wasteful; we'd prefer to have a custom \"fused\" kernel that only reads\nX once and does all the necessary computations on-chip.\nDoing so would require reading and writing back only $MN$ bytes, so we could\nexpect a theoretical speed-up of ~4x (i.e., $(8MN + 4M) / 2MN$).\nThe `torch.jit.script` flags aims to perform this kind of \"kernel fusion\" automatically\nbut, as we will see later, it is still far from ideal.\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute Kernel\nOur softmax kernel works as follows: each program loads a row of the input matrix X,\nnormalizes it and writes back the result to the output Y.\nNote that one important limitation of Triton is that each block must have a\npower-of-two number of elements, so we need to internally \"pad\" each row and guard the\nmemory operations properly if we want to handle any possible input shapes:\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import triton\nimport triton.language as tl\n\n\n@triton.jit\ndef softmax_kernel(\n output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta\n):\n # The rows of the softmax are independent, so we parallelize across those\n row_idx = tl.program_id(0)\n BLOCK_SIZE = meta['BLOCK_SIZE']\n # The stride represents how much we need to increase the pointer to advance 1 row\n row_start_ptr = input_ptr + row_idx * input_row_stride\n # The block size is the next power of two greater than n_cols, so we can fit each\n # row in a single block\n col_offsets = tl.arange(0, BLOCK_SIZE)\n input_ptrs = row_start_ptr + col_offsets\n # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols\n row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))\n # Substract maximum for numerical stability\n row_minus_max = row - tl.max(row, axis=0)\n # Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)\n numerator = tl.exp(row_minus_max)\n denominator = tl.sum(numerator, axis=0)\n softmax_output = numerator / denominator\n # Write back output to DRAM\n output_row_start_ptr = output_ptr + row_idx * output_row_stride\n output_ptrs = output_row_start_ptr + col_offsets\n tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def softmax(x):\n n_rows, n_cols = x.shape\n # The block size is the smallest power of two greater than the number of columns in `x`\n BLOCK_SIZE = triton.next_power_of_2(n_cols)\n # Another trick we can use is to ask the compiler to use more threads per row by\n # increasing the number of warps (`num_warps`) over which each row is distributed.\n # You will see in the next tutorial how to auto-tune this value in a more natural\n # way so you don't have to come up with manual heuristics yourself.\n num_warps = 4\n if BLOCK_SIZE >= 2048:\n num_warps = 8\n if BLOCK_SIZE >= 4096:\n num_warps = 16\n # Allocate output\n y = torch.empty_like(x)\n # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row o\n # f the input matrix\n softmax_kernel[(n_rows,)](\n y,\n x,\n x.stride(0),\n y.stride(0),\n n_cols,\n num_warps=num_warps,\n BLOCK_SIZE=BLOCK_SIZE,\n )\n return y"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Unit Test\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We make sure that we test our kernel on a matrix with an irregular number of rows and columns.\nThis will allow us to verify that our padding mechanism works.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"torch.manual_seed(0)\nx = torch.randn(1823, 781, device='cuda')\ny_triton = softmax(x)\ny_torch = torch.softmax(x, axis=1)\nprint(torch.allclose(y_triton, y_torch))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As expected, the results are identical.\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark\nHere we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.\nWe will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@triton.testing.perf_report(\n triton.testing.Benchmark(\n x_names=['N'], # argument names to use as an x-axis for the plot\n x_vals=[\n 128 * i for i in range(2, 100)\n ], # different possible values for `x_name`\n line_arg='provider', # argument name whose value corresponds to a different line in the plot\n line_vals=[\n 'triton',\n 'torch-native',\n 'torch-jit',\n ], # possible values for `line_arg``\n line_names=[\n \"Triton\",\n \"Torch (native)\",\n \"Torch (jit)\",\n ], # label name for the lines\n styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles\n ylabel=\"GB/s\", # label name for the y-axis\n plot_name=\"softmax-performance\", # name for the plot. Used also as a file name for saving the plot.\n args={'M': 4096}, # values for function arguments not in `x_names` and `y_name`\n )\n)\ndef benchmark(M, N, provider):\n x = torch.randn(M, N, device='cuda', dtype=torch.float32)\n if provider == 'torch-native':\n ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))\n if provider == 'triton':\n ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))\n if provider == 'torch-jit':\n ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))\n gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)\n return gbps(ms), gbps(max_ms), gbps(min_ms)\n\n\nbenchmark.run(show_plots=True, print_data=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"In the above plot, we can see that:\n\n - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.\n - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. \n Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.\n\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}


@@ -0,0 +1,130 @@
"""
Vector Addition
=================
In this tutorial, you will write a simple vector addition using Triton and learn about:
- The basic programming model of Triton
- The `triton.jit` decorator, which is used to define Triton kernels.
- The best practices for validating and benchmarking your custom ops against native reference implementations
"""
# %%
# Compute Kernel
# --------------------------
import torch
import triton
import triton.language as tl
@triton.jit
def add_kernel(
x_ptr, # *Pointer* to first input vector
y_ptr, # *Pointer* to second input vector
output_ptr, # *Pointer* to output vector
n_elements, # Size of the vector
**meta, # Optional meta-parameters for the kernel
):
BLOCK_SIZE = meta['BLOCK_SIZE'] # How many inputs each program should process
# There are multiple 'program's processing different data. We identify which program
# we are here
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0
# This program will process inputs that are offset from the initial data.
# for instance, if you had a vector of length 256 and block_size of 64, the programs
# would each access the elements [0:64, 64:128, 128:192, 192:256].
    # Note that offsets is a block of indices; adding it to a pointer (below) yields a block of pointers
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# Create a mask to guard memory operations against out-of-bounds accesses
mask = offsets < n_elements
    # Load x and y from DRAM, masking out any extra elements in case the input is not a
# multiple of the block size
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
# Write x + y back to DRAM
tl.store(output_ptr + offsets, output, mask=mask)
# %%
# Let's also declare a helper function to (1) allocate the `z` tensor
# and (2) enqueue the above kernel with appropriate grid/block sizes.
def add(x: torch.Tensor, y: torch.Tensor):
# We need to preallocate the output
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
n_elements = output.numel()
# The SPMD launch grid denotes the number of kernel instances that run in parallel.
# It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
# In this case, we use a 1D grid where the size is the number of blocks
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
# NOTE:
# - each torch.tensor object is implicitly converted into a pointer to its first element.
    # - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
    # - don't forget to pass meta-parameters as keyword arguments
pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
# We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
# running asynchronously at this point.
return output
# %%
# We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
torch.manual_seed(0)
size = 98432
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(
f'The maximum difference between torch and triton is '
f'{torch.max(torch.abs(output_torch - output_triton))}'
)
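# %%
# For a stricter check than eyeballing the printed difference, we can also assert closeness
# programmatically (the default `torch.allclose` tolerances are ample here, since both sides
# perform the same fp32 additions):
assert torch.allclose(output_torch, output_triton)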
# %%
# Seems like we're good to go!
# %%
# Benchmark
# -----------
# We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
# To make things easier, Triton has a set of built-in utilities that allow you to concisely plot the performance of your custom ops
# for different problem sizes.
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['size'], # argument names to use as an x-axis for the plot
x_vals=[
2 ** i for i in range(12, 28, 1)
], # different possible values for `x_name`
x_log=True, # x axis is logarithmic
line_arg='provider', # argument name whose value corresponds to a different line in the plot
line_vals=['triton', 'torch'], # possible values for `line_arg`
line_names=['Triton', 'Torch'], # label name for the lines
styles=[('blue', '-'), ('green', '-')], # line styles
ylabel='GB/s', # label name for the y-axis
plot_name='vector-add-performance', # name for the plot. Used also as a file name for saving the plot.
args={}, # values for function arguments not in `x_names` and `y_name`
)
)
def benchmark(size, provider):
x = torch.rand(size, device='cuda', dtype=torch.float32)
y = torch.rand(size, device='cuda', dtype=torch.float32)
if provider == 'torch':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
    # 3 tensors (x, y and the output) of 4-byte floats are moved per element, i.e. 12 bytes;
    # dividing by ms and scaling by 1e-6 converts bytes/ms to GB/s
    gbps = lambda ms: 12 * size / ms * 1e-6
return gbps(ms), gbps(max_ms), gbps(min_ms)
# %%
# We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
# `save_path='/path/to/results/'` to save them to disk along with raw CSV data
benchmark.run(print_data=True, show_plots=True)


@@ -0,0 +1,251 @@
"""
Layer Normalization
====================
"""
import torch
import triton.language as tl
import triton
# Forward Pass
@triton.jit
def _layer_norm_fwd_fused(X, Y, W, B, M, V, stride, N, eps, **META):
BLOCK_SIZE = META['BLOCK_SIZE']
# position of elements processed by this program
row = tl.program_id(0)
cols = tl.arange(0, BLOCK_SIZE)
mask = cols < N
# offset data pointers to start at the row of interest
X += row * stride
Y += row * stride
# load data and cast to float32
x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
# compute mean
mean = tl.sum(x, axis=0) / N
# compute std
xmean = tl.where(mask, x - mean, 0.)
var = tl.sum(xmean * xmean, axis=0) / N
rstd = 1 / tl.sqrt(var + eps)
xhat = xmean*rstd
# write-back mean/rstd
tl.store(M + row, mean)
tl.store(V + row, rstd)
# multiply by weight and add bias
w = tl.load(W + cols, mask=mask)
b = tl.load(B + cols, mask=mask)
y = xhat * w + b
# write-back
tl.store(Y + cols, y, mask=mask)
# Backward pass (DX + partial DW + partial DB)
@triton.jit
def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock,
stride, N, eps,
**META):
GROUP_SIZE_M = META['GROUP_SIZE_M']
BLOCK_SIZE_N = META['BLOCK_SIZE_N']
# position of elements processed by this program
row = tl.program_id(0)
cols = tl.arange(0, BLOCK_SIZE_N)
mask = cols < N
# offset data pointers to start at the row of interest
X += row * stride
DY += row * stride
DX += row * stride
# offset locks and weight/bias gradient pointer
# each kernel instance accumulates partial sums for
# DW and DB into one of GROUP_SIZE_M independent buffers
# these buffers stay in the L2, which allow this kernel
# to be fast
lock_id = row % GROUP_SIZE_M
Lock += lock_id
Count = Lock + GROUP_SIZE_M
DW = DW + lock_id*N + cols
DB = DB + lock_id*N + cols
# load data to SRAM
x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
w = tl.load(W + cols, mask=mask).to(tl.float32)
mean = tl.load(M + row)
rstd = tl.load(V + row)
# compute dx
xhat = (x - mean)*rstd
wdy = w * dy
xhat = tl.where(mask, xhat, 0.)
wdy = tl.where(mask, wdy , 0.)
mean1 = tl.sum(xhat * wdy, axis=0) / N
mean2 = tl.sum(wdy, axis=0) / N
dx = (wdy - (xhat*mean1 + mean2))*rstd
# write-back dx
tl.store(DX + cols, dx, mask=mask)
# accumulate partial sums for dw/db
partial_dw = (dy*xhat).to(w.dtype)
partial_db = (dy).to(w.dtype)
while tl.atomic_cas(Lock, 0, 1) == 1:
pass
count = tl.load(Count)
# first store doesn't accumulate
if count == 0:
tl.atomic_xchg(Count, 1)
else:
partial_dw += tl.load(DW, mask=mask)
partial_db += tl.load(DB, mask=mask)
tl.store(DW, partial_dw, mask=mask)
tl.store(DB, partial_db, mask=mask)
# release lock
tl.atomic_xchg(Lock, 0)
# Backward pass (total DW + total DB)
@triton.jit
def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N, **meta):
pid = tl.program_id(0)
BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
cols = pid*BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for i in range(0, M, BLOCK_SIZE_M):
rows = i + tl.arange(0, meta['BLOCK_SIZE_M'])
mask = (rows[:, None] < M) & (cols[None, :] < N)
offs = rows[:, None]*N + cols[None, :]
dw += tl.load(DW + offs, mask=mask, other=0.)
db += tl.load(DB + offs, mask=mask, other=0.)
sum_dw = tl.sum(dw, axis=0)
sum_db = tl.sum(db, axis=0)
tl.store(FINAL_DW + cols, sum_dw, mask=cols<N)
tl.store(FINAL_DB + cols, sum_db, mask=cols<N)
class LayerNorm(torch.autograd.Function):
@staticmethod
def forward(ctx, x, normalized_shape, weight, bias, eps):
# allocate output
y = torch.empty_like(x)
# reshape input data into 2D tensor
x_arg = x.reshape(-1, x.shape[-1])
M, N = x_arg.shape
mean = torch.empty((M, ), dtype=torch.float32, device='cuda')
rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')
# Less than 64KB per feature: enqueue fused kernel
MAX_FUSED_SIZE = 65536 // x.element_size()
BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
if N > BLOCK_SIZE:
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
# heuristics for number of warps
num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
# enqueue kernel
_layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,
x_arg.stride(0), N, eps,
BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
ctx.save_for_backward(x, weight, bias, mean, rstd)
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.eps = eps
return y
@staticmethod
def backward(ctx, dy):
x, w, b, m, v = ctx.saved_tensors
# heuristics for amount of parallel reduction stream for DG/DB
N = w.shape[0]
GROUP_SIZE_M = 64
if N <= 8192: GROUP_SIZE_M = 96
if N <= 4096: GROUP_SIZE_M = 128
if N <= 1024: GROUP_SIZE_M = 256
# allocate output
        # the first GROUP_SIZE_M entries of `locks` are spin locks, the next GROUP_SIZE_M are
        # counters (read back as `Count = Lock + GROUP_SIZE_M` inside the kernel)
        locks = torch.zeros(2*GROUP_SIZE_M, dtype=torch.int32, device='cuda')
_dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
_db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
dx = torch.empty_like(dy)
# enqueue kernel using forward pass heuristics
# also compute partial sums for DW and DB
x_arg = x.reshape(-1, x.shape[-1])
M, N = x_arg.shape
_layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,
x_arg.stride(0), N, ctx.eps,
BLOCK_SIZE_N=ctx.BLOCK_SIZE,
GROUP_SIZE_M=GROUP_SIZE_M,
num_warps=ctx.num_warps)
grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]
# accumulate partial sums in separate kernel
_layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,
BLOCK_SIZE_M = 32,
BLOCK_SIZE_N = 128)
return dx, None, dw, db, None
layer_norm = LayerNorm.apply
def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
# create data
x_shape = (M, N)
w_shape = (x_shape[-1], )
weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
x = -2.3 + 0.5*torch.randn(x_shape, dtype=dtype, device='cuda')
dy = .1*torch.randn_like(x)
x.requires_grad_(True)
# forward pass
y_tri = layer_norm(x, w_shape, weight, bias, eps)
y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
# backward pass (triton)
y_tri.backward(dy, retain_graph=True)
dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]
x.grad, weight.grad, bias.grad = None, None, None
# backward pass (torch)
y_ref.backward(dy, retain_graph=True)
dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
# compare
triton.testing.assert_almost_equal(y_tri, y_ref)
triton.testing.assert_almost_equal(dx_tri, dx_ref)
triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)
triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)
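# quick smoke test with an illustrative shape (any batch size works; the feature dimension
# must stay within the 64KB-per-feature limit enforced in `forward`)
test_layer_norm(1151, 8192, torch.float16)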
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['N'],
x_vals=[512 * i for i in range(2, 32)],
line_arg='provider',
line_vals=['triton', 'torch', 'apex'],
line_names=['Triton', 'Torch', 'Apex'],
styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
ylabel='GB/s',
plot_name='layer-norm-backward',
args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}
)
)
def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):
# create data
x_shape = (M, N)
w_shape = (x_shape[-1], )
weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
x = -2.3 + 0.5*torch.randn(x_shape, dtype=dtype, device='cuda')
dy = .1*torch.randn_like(x)
x.requires_grad_(True)
# utility functions
if provider == 'triton':
y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)
if provider == 'torch':
y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)
if provider == 'apex':
import apex
apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)
y_fwd = lambda: apex_layer_norm(x)
# forward pass
if mode == 'forward':
gbps = lambda ms: 2*x.numel()*x.element_size()/ms*1e-6
ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)
# backward pass
if mode == 'backward':
gbps = lambda ms: 3*x.numel()*x.element_size()/ms*1e-6
y = y_fwd()
ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),
grad_to_none=[x], rep=500)
return gbps(ms), gbps(max_ms), gbps(min_ms)
bench_layer_norm.run(save_path='.', print_data=True)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long


@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Low-Memory Dropout\n\nIn this tutorial, you will write a memory-efficient implementation of dropout whose state\nwill be composed of a single int32 seed. This differs from more traditional implementations of dropout,\nwhose state is generally composed of a bit mask tensor of the same shape as the input. You will learn about:\n\n- The limitations of naive implementations of Dropout with PyTorch\n- Parallel pseudo-random number generation in Triton\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Baseline\nThe *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance \nof deep neural networks in low-data regime (i.e. regularization).\n\nIt takes a vector as input and produces a vector of the same shape as output. Each scalar in the\noutput has a probability $p$ of being changed to zero and otherwise it is copied from the input.\nThis forces the network to perform well even when only $1 - p$ scalars from the input are available.\n\nAt evaluation time we want to use the full power of the network so we set $p=0$. Naively this would\nincrease the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease\nin the output softmax temperature). To prevent this we multiply the output by $\\frac{1}{1 - p}$, which\nkeeps the norm consistent regardless of the dropout probability.\n\nLet's first take a look at the baseline implementation.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import tabulate\nimport torch\nimport triton\nimport triton.language as tl\n\n@triton.jit\ndef _dropout(\n x_ptr, # pointer to the input\n x_keep_ptr, # pointer to a mask of 0s and 1s\n output_ptr, # pointer to the output\n n_elements, # number of elements in the `x` tensor\n p, # probability that an element of `x` is changed to zero\n **meta,\n):\n BLOCK_SIZE = meta['BLOCK_SIZE']\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n mask = offsets < n_elements\n # Load data\n x = tl.load(x_ptr + offsets, mask=mask)\n x_keep = tl.load(x_keep_ptr + offsets, mask=mask)\n # The line below is the crucial part, described in the paragraph above!\n output = tl.where(x_keep, x / (1 - p), 0.0)\n # Write-back output\n tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef dropout(x, x_keep, p):\n output = torch.empty_like(x)\n assert x.is_contiguous()\n n_elements = x.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)\n return output\n\n# Input tensor\nx = torch.randn(size=(10,)).cuda()\n# Dropout mask\np = 0.5\nx_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda()\n#\noutput = dropout(x, x_keep=x_keep, p=p)\nprint(tabulate.tabulate([\n [\"input\"] + x.tolist(),\n [\"keep mask\"] + x_keep.tolist(),\n [\"output\"] + output.tolist()\n]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Seeded dropout\nAbove implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly\nwe need to store the dropout mask for backpropagation. Secondly, dropout state management can get\nvery tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in\nhttps://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation\nthat (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management\nof persisting randomness across multiple invocations of the kernel.\n\nPseudorandom number generation in Triton is simple! In this tutorial we will use the\n:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` \nvalues in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides\nother `random number generation strategies <Random Number Generation>`.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>Triton's implementation of PRNG is based on the Philox algorithm (described on [SALMON2011]_).</p></div>\n\nLet's put it all together.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@triton.jit\ndef _seeded_dropout(\n x_ptr,\n output_ptr,\n n_elements,\n p,\n seed,\n **meta,\n):\n # compute memory offsets of elements handled by this instance\n BLOCK_SIZE = meta['BLOCK_SIZE']\n pid = tl.program_id(axis=0)\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # load data from x\n mask = offsets < n_elements\n x = tl.load(x_ptr + offsets, mask=mask)\n # randomly prune it\n random = tl.rand(seed, offsets)\n x_keep = random > p\n # write-back\n output = tl.where(x_keep, x / (1 - p), 0.0)\n tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef seeded_dropout(x, p, seed):\n output = torch.empty_like(x)\n assert x.is_contiguous()\n n_elements = x.numel()\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024)\n return output\n\n\nx = torch.randn(size=(10,)).cuda()\n# Compare this to the baseline - dropout mask is never instantiated!\noutput = seeded_dropout(x, p=0.5, seed=123)\noutput2 = seeded_dropout(x, p=0.5, seed=123)\noutput3 = seeded_dropout(x, p=0.5, seed=512)\n\nprint(tabulate.tabulate([\n [\"input\"] + x.tolist(),\n [\"output (seed = 123)\"] + output.tolist(),\n [\"output (seed = 123)\"] + output2.tolist(),\n [\"output (seed = 512)\"] + output3.tolist()\n]))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Et Voil\u00e0! We have a triton kernel that applies the same dropout mask provided the seed is the same!\nIf you'd like explore further applications of pseudorandomness in GPU programming, we encourage you\nto explore the `triton/language/random` folder!\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exercises\n1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row.\n2. Add support for striding.\n3. (challenge) Implement a kernel for sparse Johnson-Lindenstrauss transform which generates the projection matrix one the fly each time using a seed.\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## References\n\n.. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, \"Parallel Random Numbers: As Easy as 1, 2, 3\", 2011\n.. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, \"Dropout: A Simple Way to Prevent Neural Networks from Overfitting\", JMLR 2014\n\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}


@@ -0,0 +1,164 @@
"""
Low-Memory Dropout
==================
In this tutorial, you will write a memory-efficient implementation of dropout whose state
will be composed of a single int32 seed. This differs from more traditional implementations of dropout,
whose state is generally composed of a bit mask tensor of the same shape as the input. You will learn about:
- The limitations of naive implementations of Dropout with PyTorch
- Parallel pseudo-random number generation in Triton
"""
# %%
# Baseline
# -------------
# The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance
# of deep neural networks in the low-data regime (i.e., regularization).
#
# It takes a vector as input and produces an output vector of the same shape. Each scalar in the
# output has a probability :math:`p` of being changed to zero and otherwise it is copied from the input.
# This forces the network to perform well even when only :math:`1 - p` scalars from the input are available.
#
# At evaluation time we want to use the full power of the network so we set :math:`p=0`. Naively this would
# increase the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease
# in the output softmax temperature). To prevent this we multiply the output by :math:`\frac{1}{1 - p}`, which
# keeps the norm consistent regardless of the dropout probability.
#
# Let's first take a look at the baseline implementation.
import tabulate
import torch
import triton
import triton.language as tl
@triton.jit
def _dropout(
x_ptr, # pointer to the input
x_keep_ptr, # pointer to a mask of 0s and 1s
output_ptr, # pointer to the output
n_elements, # number of elements in the `x` tensor
p, # probability that an element of `x` is changed to zero
**meta,
):
BLOCK_SIZE = meta['BLOCK_SIZE']
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
# Load data
x = tl.load(x_ptr + offsets, mask=mask)
x_keep = tl.load(x_keep_ptr + offsets, mask=mask)
# The line below is the crucial part, described in the paragraph above!
output = tl.where(x_keep, x / (1 - p), 0.0)
# Write-back output
tl.store(output_ptr + offsets, output, mask=mask)
def dropout(x, x_keep, p):
output = torch.empty_like(x)
assert x.is_contiguous()
n_elements = x.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)
return output
# Input tensor
x = torch.randn(size=(10,)).cuda()
# Dropout mask
p = 0.5
x_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda()
#
output = dropout(x, x_keep=x_keep, p=p)
print(tabulate.tabulate([
["input"] + x.tolist(),
["keep mask"] + x_keep.tolist(),
["output"] + output.tolist()
]))
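# %%
# As a sanity check, the same masking and scaling can be reproduced with plain PyTorch and
# compared against the Triton output (a small illustrative check; both sides apply the exact
# same formula, so the results should match):
output_ref = torch.where(x_keep.bool(), x / (1 - p), torch.zeros_like(x))
print(torch.allclose(output, output_ref))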
# %%
# Seeded dropout
# -------------
# The above implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly,
# we need to store the dropout mask for backpropagation. Secondly, dropout state management can get
# very tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in
# https://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation
# that (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management
# of persisting randomness across multiple invocations of the kernel.
#
# Pseudorandom number generation in Triton is simple! In this tutorial we will use the
# :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32`
# values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides
# other :ref:`random number generation strategies <Random Number Generation>`.
#
# .. note::
#    Triton's implementation of PRNG is based on the Philox algorithm (described in [SALMON2011]_).
#
# Let's put it all together.
@triton.jit
def _seeded_dropout(
x_ptr,
output_ptr,
n_elements,
p,
seed,
**meta,
):
# compute memory offsets of elements handled by this instance
BLOCK_SIZE = meta['BLOCK_SIZE']
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# load data from x
mask = offsets < n_elements
x = tl.load(x_ptr + offsets, mask=mask)
# randomly prune it
random = tl.rand(seed, offsets)
x_keep = random > p
# write-back
output = tl.where(x_keep, x / (1 - p), 0.0)
tl.store(output_ptr + offsets, output, mask=mask)
def seeded_dropout(x, p, seed):
output = torch.empty_like(x)
assert x.is_contiguous()
n_elements = x.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024)
return output
x = torch.randn(size=(10,)).cuda()
# Compare this to the baseline - dropout mask is never instantiated!
output = seeded_dropout(x, p=0.5, seed=123)
output2 = seeded_dropout(x, p=0.5, seed=123)
output3 = seeded_dropout(x, p=0.5, seed=512)
print(tabulate.tabulate([
["input"] + x.tolist(),
["output (seed = 123)"] + output.tolist(),
["output (seed = 123)"] + output2.tolist(),
["output (seed = 512)"] + output3.tolist()
]))
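# %%
# The printed table already suggests it, but determinism can also be checked directly
# (a small illustrative check: the same seed must give identical outputs, while a different
# seed gives a different mask with overwhelming probability):
print(torch.equal(output, output2))  # True
print(torch.equal(output, output3))  # False (almost surely)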
# %%
# Et Voilà! We have a Triton kernel that applies the same dropout mask provided the seed is the same!
# If you'd like to explore further applications of pseudorandomness in GPU programming, we encourage you
# to explore the `triton/language/random` folder!
# %%
# Exercises
# -------------
# 1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row.
# 2. Add support for striding.
# 3. (challenge) Implement a kernel for sparse Johnson-Lindenstrauss transform which generates the projection matrix on the fly each time using a seed.
# %%
# References
# --------------
#
# .. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, "Parallel Random Numbers: As Easy as 1, 2, 3", 2011
# .. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014


@@ -0,0 +1,359 @@
"""
Matrix Multiplication
======================
In this tutorial, you will write a 25-line high-performance FP16 matrix multiplication
kernel that achieves performance on par with cuBLAS.
You will specifically learn about:
- Block-level matrix multiplications
- Multi-dimensional pointer arithmetic
- Program re-ordering for improved L2 cache hit rate
- Automatic performance tuning
"""
# %%
# Motivations
# -------------
# Matrix multiplications are a key building block of most modern high-performance computing systems.
# They are notoriously hard to optimize, hence their implementation is generally done by
# hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS).
# Unfortunately, these libraries are often proprietary and cannot be easily customized
# to accommodate the needs of modern deep learning workloads (e.g., fused activation functions).
# In this tutorial, you will learn how to implement efficient matrix multiplications by
# yourself with Triton, in a way that is easy to customize and extend.
#
# Roughly speaking, the kernel that we will write will implement the following blocked
# algorithm to multiply a (M, K) by a (K, N) matrix:
#
# .. code-block:: python
#
# # do in parallel
# for m in range(0, M, BLOCK_SIZE_M):
# # do in parallel
# for n in range(0, N, BLOCK_SIZE_N):
# acc = zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=float32)
# for k in range(0, K, BLOCK_SIZE_K):
# a = A[m : m+BLOCK_SIZE_M, k : k+BLOCK_SIZE_K]
# b = B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]
# acc += dot(a, b)
# C[m : m+BLOCK_SIZE_M, n : n+BLOCK_SIZE_N] = acc;
#
# where each iteration of the doubly-nested for-loop is performed by a dedicated Triton program instance.
# %%
# Compute Kernel
# ----------------
#
# The above algorithm is, actually, fairly straightforward to implement in Triton.
# The main difficulty comes from the computation of the memory locations at which blocks
# of :code:`A` and :code:`B` must be read in the inner loop. For that, we need
# multi-dimensional pointer arithmetic.
#
# Pointer Arithmetics
# ~~~~~~~~~~~~~~~~~~~~
#
# For a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given by
# :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`.
# Therefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and
# :code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as:
#
# .. code-block:: python
#
# &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] = a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1);
# &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] = b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1);
#
# Which means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as:
#
# .. code-block:: python
#
# offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
# offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
# offs_k = tl.arange(0, BLOCK_SIZE_K)
# a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
# b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
#
# And then updated in the inner loop as follows:
#
# .. code-block:: python
#
#    a_ptrs += BLOCK_SIZE_K * stride_ak
#    b_ptrs += BLOCK_SIZE_K * stride_bk
#
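# As a quick sanity check, this address formula can be reproduced on the host with
# PyTorch strides (a minimal illustrative sketch; the shape and indices are arbitrary):
#
# .. code-block:: python
#
#    import torch
#    X = torch.randn(64, 48)
#    i, j = 3, 7
#    # row-major: &X[i, j] = X + i*stride_xi + j*stride_xj
#    assert torch.equal(X.flatten()[i * X.stride(0) + j * X.stride(1)], X[i, j])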
#
# L2 Cache Optimizations
# ~~~~~~~~~~~~~~~~~~~~~~~~
#
# As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]`
# block of :code:`C`.
# It is important to remember that the order in which these blocks are computed does
# matter, since it affects the L2 cache hit rate of our program, and unfortunately a
# simple row-major ordering
#
# .. code-block:: python
#
#    pid = tl.program_id(0)
#    grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M
#    grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N
#    pid_m = pid // grid_n
#    pid_n = pid % grid_n
#
# is just not going to cut it.
#
# One possible solution is to launch blocks in an order that promotes data reuse.
# This can be done by 'super-grouping' blocks in groups of :code:`GROUP_SIZE_M` rows before
# switching to the next column:
#
# .. code-block:: python
#
# # program ID
# pid = tl.program_id(axis=0)
# # number of program ids along the M axis
# num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
# number of program ids along the N axis
# num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
# # number of programs in group
# num_pid_in_group = GROUP_SIZE_M * num_pid_n
# # id of the group this program is in
# group_id = pid // num_pid_in_group
# # row-id of the first program in the group
# first_pid_m = group_id * GROUP_SIZE_M
# # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
# group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
# # *within groups*, programs are ordered in a column-major order
# # row-id of the program in the *launch grid*
# pid_m = first_pid_m + (pid % group_size_m)
# # col-id of the program in the *launch grid*
# pid_n = (pid % num_pid_in_group) // group_size_m
#
# For example, in the following matmul where each matrix is 9 blocks by 9 blocks,
# we can see that if we compute the output in row-major ordering, we need to load 90
# blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped
# ordering, we only need to load 54 blocks.
#
# .. image:: grouped_vs_row_major_ordering.png
#
# In practice, this can improve the performance of our matrix multiplication kernel by
# more than 10% on some hardware architectures (e.g., 220 to 245 TFLOPS on A100).
#
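# Because this re-mapping is pure integer arithmetic, it can be checked on the host before
# writing any device code (an illustrative sketch with a tiny 4x4 grid of blocks and
# GROUP_SIZE_M = 2; variable names mirror the kernel below):
#
# .. code-block:: python
#
#    num_pid_m, num_pid_n, GROUP_SIZE_M = 4, 4, 2
#    num_pid_in_group = GROUP_SIZE_M * num_pid_n
#    for pid in range(num_pid_m * num_pid_n):
#        group_id = pid // num_pid_in_group
#        first_pid_m = group_id * GROUP_SIZE_M
#        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
#        pid_m = first_pid_m + (pid % group_size_m)
#        pid_n = (pid % num_pid_in_group) // group_size_m
#        print(pid, '->', (pid_m, pid_n))
#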
# %%
# Final Result
# -------------
#
import torch
import triton
import triton.language as tl
# %
# :code:`triton.jit`'ed functions can be auto-tuned by using the `triton.autotune`
# decorator, which consumes:
# - A list of :code:`triton.Config` objects that define different configurations of
# meta-parameters (e.g., BLOCK_SIZE_M) and compilation options (e.g., num_warps) to try
# - An autotuning *key* whose change in values will trigger evaluation of all the
# provided configs
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32 , 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
],
key=['M', 'N', 'K'],
)
# %
# We can now define our kernel as normal, using all the techniques presented above
@triton.jit
def matmul_kernel(
# Pointers to matrices
a_ptr, b_ptr, c_ptr,
# Matrix dimensions
M, N, K,
# The stride variables represent how much to increase the ptr by when moving by 1
# element in a particular dimension. E.g. stride_am is how much to increase a_ptr
# by to get the element one row down (A has M rows)
stride_am, stride_ak,
stride_bk, stride_bn,
stride_cm, stride_cn,
# Meta-parameters
**meta,
):
"""Kernel for computing the matmul C = A x B.
A has shape (M, K), B has shape (K, N) and C has shape (M, N)
"""
# extract meta-parameters
BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
BLOCK_SIZE_K = meta['BLOCK_SIZE_K']
    GROUP_SIZE_M = meta['GROUP_SIZE_M']
# -----------------------------------------------------------
# Map program ids `pid` to the block of C it should compute.
# This is done in a grouped ordering to promote L2 data reuse
# See above `L2 Cache Optimizations` section for details
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
# ----------------------------------------------------------
# Create pointers for the first blocks of A and B.
# We will advance this pointer as we move in the K direction
# and accumulate
# a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
    # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
# see above `Pointer Arithmetics` section for details
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
# -----------------------------------------------------------
# Iterate to compute a block of the C matrix
# We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
# of fp32 values for higher accuracy.
# `accumulator` will be converted back to fp16 after the loop
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, K, BLOCK_SIZE_K):
# Note that for simplicity, we don't apply a mask here.
# This means that if K is not a multiple of BLOCK_SIZE_K,
# this will access out-of-bounds memory and produce an
# error or (worse!) incorrect results.
a = tl.load(a_ptrs)
b = tl.load(b_ptrs)
# We accumulate along the K dimension
accumulator += tl.dot(a, b)
# Advance the ptrs to the next K block
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
# you can fuse arbitrary activation functions here
# while the accumulator is still in FP32 !
if meta['ACTIVATION']:
accumulator = meta['ACTIVATION'](accumulator)
c = accumulator.to(tl.float16)
# -----------------------------------------------------------
# Write back the block of the output matrix C
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
# we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`
@triton.jit
def leaky_relu(x):
return tl.where(x >= 0, x, 0.01 * x)
# %%
# We can now create a convenience wrapper function that only takes two input tensors
# and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel
def matmul(a, b, activation=None):
# checks constraints
assert a.shape[1] == b.shape[0], "incompatible dimensions"
assert a.is_contiguous(), "matrix A must be contiguous"
assert b.is_contiguous(), "matrix B must be contiguous"
M, K = a.shape
K, N = b.shape
assert (
K % 32 == 0
), "We don't check memory-out-of-bounds with K so K must be divisible by BLOCK_SIZE_K"
# allocates output
c = torch.empty((M, N), device=a.device, dtype=a.dtype)
# 1D launch kernel where each block gets its own program.
grid = lambda META: (
triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
)
matmul_kernel[grid](
a, b, c,
M, N, K,
a.stride(0), a.stride(1),
b.stride(0), b.stride(1),
c.stride(0), c.stride(1),
ACTIVATION=activation,
)
return c
# %%
# Unit Test
# -----------
#
# We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS)
torch.manual_seed(0)
a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
triton_output = matmul(a, b, activation=None)
torch_output = torch.matmul(a, b)
print(f"triton_output={triton_output}")
print(f"torch_output={torch_output}")
if triton.testing.allclose(triton_output, torch_output):
print("✅ Triton and Torch match")
else:
print("❌ Triton and Torch differ")
# %%
# Benchmark
# --------------
#
# Square Matrix Performance
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, but feel free to arrange this script as you wish to benchmark any other matrix shape.
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['M', 'N', 'K'], # argument names to use as an x-axis for the plot
x_vals=[
128 * i for i in range(2, 33)
], # different possible values for `x_name`
line_arg='provider', # argument name whose value corresponds to a different line in the plot
        # possible values for `line_arg`
line_vals=['cublas', 'cublas + relu', 'triton', 'triton + relu'],
# label name for the lines
line_names=["cuBLAS", "cuBLAS (+ torch.nn.LeakyReLU)", "Triton", "Triton (+ LeakyReLU)"],
# line styles
styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
ylabel="TFLOPS", # label name for the y-axis
plot_name="matmul-performance", # name for the plot. Used also as a file name for saving the plot.
args={},
)
)
def benchmark(M, N, K, provider):
a = torch.randn((M, K), device='cuda', dtype=torch.float16)
b = torch.randn((K, N), device='cuda', dtype=torch.float16)
if provider == 'cublas':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b))
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b))
if provider == 'cublas + relu':
        torch_relu = torch.nn.LeakyReLU(inplace=True)
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: torch_relu(torch.matmul(a, b))
)
if provider == 'triton + relu':
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: matmul(a, b, activation=leaky_relu)
)
perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
return perf(ms), perf(max_ms), perf(min_ms)
benchmark.run(show_plots=True, print_data=True)


@@ -0,0 +1,191 @@
"""
Fused Softmax
=================
In this tutorial, you will write a fused softmax operation that is significantly faster
than PyTorch's native op for a particular class of matrices: those whose rows can fit in
the GPU's SRAM.
You will learn about:
- The benefits of kernel fusion for bandwidth-bound operations.
- Reduction operators in Triton.
"""
# %%
# Motivations
# ------------
# Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
# Let us consider instead the case of a simple (numerically stabilized) softmax operation:
import torch
@torch.jit.script
def naive_softmax(x):
"""Compute row-wise softmax of X using native pytorch
We subtract the maximum element in order to avoid overflows. Softmax is invariant to
this shift.
"""
# read MN elements ; write M elements
x_max = x.max(dim=1)[0]
# read MN + M elements ; write MN elements
z = x - x_max[:, None]
# read MN elements ; write MN elements
numerator = torch.exp(z)
# read MN elements ; write M elements
denominator = numerator.sum(dim=1)
# read MN + M elements ; write MN elements
ret = numerator / denominator[:, None]
# in total: read 5MN + 2M elements ; wrote 3MN + 2M elements
return ret
# %%
# When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}`
# requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements.
# This is obviously wasteful; we'd prefer to have a custom "fused" kernel that only reads
# X once and does all the necessary computations on-chip.
# Doing so would require reading :math:`MN` elements and writing back :math:`MN` elements, so we could
# expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`).
# The `torch.jit.script` decorator aims to perform this kind of "kernel fusion" automatically
# but, as we will see later, it is still far from ideal.
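# To make the estimate above concrete, here is the expected speed-up for illustrative shapes
# (e.g. M = 4096 rows and N = 1024 columns):
rows, cols = 4096, 1024
naive_traffic = (5 * rows * cols + 2 * rows) + (3 * rows * cols + 2 * rows)  # elements read + written
fused_traffic = 2 * rows * cols  # read X once, write Y once
print(f'theoretical speed-up: ~{naive_traffic / fused_traffic:.2f}x')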
# %%
# Compute Kernel
# ----------------
# Our softmax kernel works as follows: each program loads a row of the input matrix X,
# normalizes it and writes back the result to the output Y.
# Note that one important limitation of Triton is that each block must have a
# power-of-two number of elements, so we need to internally "pad" each row and guard the
# memory operations properly if we want to handle any possible input shapes:
import triton
import triton.language as tl
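# For example, the 781-column rows used in the unit test below get padded to the next
# power of two, and the extra lanes are masked off in the loads and stores:
print(triton.next_power_of_2(781))  # 1024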
@triton.jit
def softmax_kernel(
output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta
):
# The rows of the softmax are independent, so we parallelize across those
row_idx = tl.program_id(0)
BLOCK_SIZE = meta['BLOCK_SIZE']
# The stride represents how much we need to increase the pointer to advance 1 row
row_start_ptr = input_ptr + row_idx * input_row_stride
# The block size is the next power of two greater than n_cols, so we can fit each
# row in a single block
col_offsets = tl.arange(0, BLOCK_SIZE)
input_ptrs = row_start_ptr + col_offsets
    # Load the row into SRAM, using a mask since BLOCK_SIZE may be greater than n_cols
row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
    # Subtract maximum for numerical stability
row_minus_max = row - tl.max(row, axis=0)
# Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)
numerator = tl.exp(row_minus_max)
denominator = tl.sum(numerator, axis=0)
softmax_output = numerator / denominator
# Write back output to DRAM
output_row_start_ptr = output_ptr + row_idx * output_row_stride
output_ptrs = output_row_start_ptr + col_offsets
tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols)
# %%
# We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor.
def softmax(x):
n_rows, n_cols = x.shape
# The block size is the smallest power of two greater than the number of columns in `x`
BLOCK_SIZE = triton.next_power_of_2(n_cols)
# Another trick we can use is to ask the compiler to use more threads per row by
# increasing the number of warps (`num_warps`) over which each row is distributed.
# You will see in the next tutorial how to auto-tune this value in a more natural
# way so you don't have to come up with manual heuristics yourself.
num_warps = 4
if BLOCK_SIZE >= 2048:
num_warps = 8
if BLOCK_SIZE >= 4096:
num_warps = 16
# Allocate output
y = torch.empty_like(x)
    # Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row of
    # the input matrix
softmax_kernel[(n_rows,)](
y,
x,
x.stride(0),
y.stride(0),
n_cols,
num_warps=num_warps,
BLOCK_SIZE=BLOCK_SIZE,
)
return y
# %%
# Unit Test
# ----------
# %%
# We make sure that we test our kernel on a matrix with an irregular number of rows and columns.
# This will allow us to verify that our padding mechanism works.
torch.manual_seed(0)
x = torch.randn(1823, 781, device='cuda')
y_triton = softmax(x)
y_torch = torch.softmax(x, axis=1)
print(torch.allclose(y_triton, y_torch))
# %%
# As expected, the results are identical.
# %%
# Benchmark
# -------------
# Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
# We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['N'], # argument names to use as an x-axis for the plot
x_vals=[
128 * i for i in range(2, 100)
], # different possible values for `x_name`
line_arg='provider', # argument name whose value corresponds to a different line in the plot
line_vals=[
'triton',
'torch-native',
'torch-jit',
        ], # possible values for `line_arg`
line_names=[
"Triton",
"Torch (native)",
"Torch (jit)",
], # label name for the lines
styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles
ylabel="GB/s", # label name for the y-axis
plot_name="softmax-performance", # name for the plot. Used also as a file name for saving the plot.
args={'M': 4096}, # values for function arguments not in `x_names` and `y_name`
)
)
def benchmark(M, N, provider):
x = torch.randn(M, N, device='cuda', dtype=torch.float32)
if provider == 'torch-native':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))
if provider == 'torch-jit':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))
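    # effective bandwidth: each call is credited with one read and one write of the full
    # MN-element matrix, i.e. 2 * nelement * element_size bytes, divided by the runtime in seconds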
gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
return gbps(ms), gbps(max_ms), gbps(min_ms)
benchmark.run(show_plots=True, print_data=True)
# %%
# In the above plot, we can see that:
#
# - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
# - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
# Note however that the PyTorch `softmax` operation is more general and works on tensors of any shape.


@@ -0,0 +1,140 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Vector Addition\nIn this tutorial, you will write a simple vector addition using Triton and learn about:\n\n- The basic programming model of Triton\n- The `triton.jit` decorator, which is used to define Triton kernels.\n- The best practices for validating and benchmarking your custom ops against native reference implementations\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compute Kernel\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import torch\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n x_ptr, # *Pointer* to first input vector\n y_ptr, # *Pointer* to second input vector\n output_ptr, # *Pointer* to output vector\n n_elements, # Size of the vector\n **meta, # Optional meta-parameters for the kernel\n):\n BLOCK_SIZE = meta['BLOCK_SIZE'] # How many inputs each program should process\n # There are multiple 'program's processing different data. We identify which program\n # we are here\n pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0\n # This program will process inputs that are offset from the initial data.\n # for instance, if you had a vector of length 256 and block_size of 64, the programs\n # would each access the elements [0:64, 64:128, 128:192, 192:256].\n # Note that offsets is a list of pointers\n block_start = pid * BLOCK_SIZE\n offsets = block_start + tl.arange(0, BLOCK_SIZE)\n # Create a mask to guard memory operations against out-of-bounds accesses\n mask = offsets < n_elements\n # Load x and y from DRAM, masking out any extar elements in case the input is not a\n # multiple of the block size\n x = tl.load(x_ptr + offsets, mask=mask)\n y = tl.load(y_ptr + offsets, mask=mask)\n output = x + y\n # Write x + y back to DRAM\n tl.store(output_ptr + offsets, output, mask=mask)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's also declare a helper function to (1) allocate the `z` tensor\nand (2) enqueue the above kernel with appropriate grid/block sizes.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def add(x: torch.Tensor, y: torch.Tensor):\n # We need to preallocate the output\n output = torch.empty_like(x)\n assert x.is_cuda and y.is_cuda and output.is_cuda\n n_elements = output.numel()\n # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n # In this case, we use a 1D grid where the size is the number of blocks\n grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n # NOTE:\n # - each torch.tensor object is implicitly converted into a pointer to its first element.\n # - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n # - don't forget to pass meta-parameters as keywords arguments\n pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)\n # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n # running asynchronously at this point.\n return output"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"torch.manual_seed(0)\nsize = 98432\nx = torch.rand(size, device='cuda')\ny = torch.rand(size, device='cuda')\noutput_torch = x + y\noutput_triton = add(x, y)\nprint(output_torch)\nprint(output_triton)\nprint(\n f'The maximum difference between torch and triton is '\n f'{torch.max(torch.abs(output_torch - output_triton))}'\n)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Seems like we're good to go!\n\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benchmark\nWe can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.\nTo make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops\nfor different problem sizes.\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"@triton.testing.perf_report(\n triton.testing.Benchmark(\n x_names=['size'], # argument names to use as an x-axis for the plot\n x_vals=[\n 2 ** i for i in range(12, 28, 1)\n ], # different possible values for `x_name`\n x_log=True, # x axis is logarithmic\n line_arg='provider', # argument name whose value corresponds to a different line in the plot\n line_vals=['triton', 'torch'], # possible values for `line_arg`\n line_names=['Triton', 'Torch'], # label name for the lines\n styles=[('blue', '-'), ('green', '-')], # line styles\n ylabel='GB/s', # label name for the y-axis\n plot_name='vector-add-performance', # name for the plot. Used also as a file name for saving the plot.\n args={}, # values for function arguments not in `x_names` and `y_name`\n )\n)\ndef benchmark(size, provider):\n x = torch.rand(size, device='cuda', dtype=torch.float32)\n y = torch.rand(size, device='cuda', dtype=torch.float32)\n if provider == 'torch':\n ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)\n if provider == 'triton':\n ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))\n gbps = lambda ms: 12 * size / ms * 1e-6\n return gbps(ms), gbps(max_ms), gbps(min_ms)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or\n`save_path='/path/to/results/' to save them to disk along with raw CSV data\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"benchmark.run(print_data=True, show_plots=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@@ -0,0 +1,285 @@
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/01-vector-add.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_getting-started_tutorials_01-vector-add.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_getting-started_tutorials_01-vector-add.py:
Vector Addition
=================
In this tutorial, you will write a simple vector addition using Triton and learn about:
- The basic programming model of Triton
- The `triton.jit` decorator, which is used to define Triton kernels.
- The best practices for validating and benchmarking your custom ops against native reference implementations
.. GENERATED FROM PYTHON SOURCE LINES 12-14
Compute Kernel
--------------------------
.. GENERATED FROM PYTHON SOURCE LINES 14-49
.. code-block:: default
import torch
import triton
import triton.language as tl
@triton.jit
def add_kernel(
x_ptr, # *Pointer* to first input vector
y_ptr, # *Pointer* to second input vector
output_ptr, # *Pointer* to output vector
n_elements, # Size of the vector
**meta, # Optional meta-parameters for the kernel
):
BLOCK_SIZE = meta['BLOCK_SIZE'] # How many inputs each program should process
# There are multiple 'program's processing different data. We identify which program
# we are here
pid = tl.program_id(axis=0) # We use a 1D launch grid so axis is 0
# This program will process inputs that are offset from the initial data.
# for instance, if you had a vector of length 256 and block_size of 64, the programs
# would each access the elements [0:64, 64:128, 128:192, 192:256].
# Note that offsets is a list of pointers
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# Create a mask to guard memory operations against out-of-bounds accesses
mask = offsets < n_elements
# Load x and y from DRAM, masking out any extra elements in case the input is not a
# multiple of the block size
x = tl.load(x_ptr + offsets, mask=mask)
y = tl.load(y_ptr + offsets, mask=mask)
output = x + y
# Write x + y back to DRAM
tl.store(output_ptr + offsets, output, mask=mask)
.. GENERATED FROM PYTHON SOURCE LINES 50-52
Let's also declare a helper function to (1) allocate an output tensor
and (2) enqueue the above kernel with appropriate grid/block sizes.
.. GENERATED FROM PYTHON SOURCE LINES 52-73
.. code-block:: default
def add(x: torch.Tensor, y: torch.Tensor):
# We need to preallocate the output
output = torch.empty_like(x)
assert x.is_cuda and y.is_cuda and output.is_cuda
n_elements = output.numel()
# The SPMD launch grid denotes the number of kernel instances that run in parallel.
# It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]
# In this case, we use a 1D grid where the size is the number of blocks
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
# NOTE:
# - each torch.tensor object is implicitly converted into a pointer to its first element.
    #  - `triton.jit`'ed functions can be indexed with a launch grid to obtain a callable GPU kernel
    #  - don't forget to pass meta-parameters as keyword arguments
pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    # We return a handle to the output but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
# running asynchronously at this point.
return output
.. GENERATED FROM PYTHON SOURCE LINES 74-75
We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
.. GENERATED FROM PYTHON SOURCE LINES 75-89
.. code-block:: default
torch.manual_seed(0)
size = 98432
x = torch.rand(size, device='cuda')
y = torch.rand(size, device='cuda')
output_torch = x + y
output_triton = add(x, y)
print(output_torch)
print(output_triton)
print(
f'The maximum difference between torch and triton is '
f'{torch.max(torch.abs(output_torch - output_triton))}'
)
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
tensor([1.3713, 1.3076, 0.4940, ..., 0.6724, 1.2141, 0.9733], device='cuda:0')
tensor([1.3713, 1.3076, 0.4940, ..., 0.6724, 1.2141, 0.9733], device='cuda:0')
The maximum difference between torch and triton is 0.0
.. GENERATED FROM PYTHON SOURCE LINES 90-91
Seems like we're good to go!
.. GENERATED FROM PYTHON SOURCE LINES 93-98
Benchmark
-----------
We can now benchmark our custom op on vectors of increasing sizes to get a sense of how it does relative to PyTorch.
To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
for different problem sizes.
.. GENERATED FROM PYTHON SOURCE LINES 98-127
.. code-block:: default
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['size'], # argument names to use as an x-axis for the plot
x_vals=[
2 ** i for i in range(12, 28, 1)
], # different possible values for `x_name`
x_log=True, # x axis is logarithmic
line_arg='provider', # argument name whose value corresponds to a different line in the plot
line_vals=['triton', 'torch'], # possible values for `line_arg`
line_names=['Triton', 'Torch'], # label name for the lines
styles=[('blue', '-'), ('green', '-')], # line styles
ylabel='GB/s', # label name for the y-axis
plot_name='vector-add-performance', # name for the plot. Used also as a file name for saving the plot.
args={}, # values for function arguments not in `x_names` and `y_name`
)
)
def benchmark(size, provider):
x = torch.rand(size, device='cuda', dtype=torch.float32)
y = torch.rand(size, device='cuda', dtype=torch.float32)
if provider == 'torch':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y)
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: add(x, y))
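        # each element involves reading x, reading y and writing the output: 3 float32 values = 12 bytes,
        # so GB/s = 12 * size bytes / (ms * 1e-3 seconds) / 1e9 = 12 * size / ms * 1e-6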
gbps = lambda ms: 12 * size / ms * 1e-6
return gbps(ms), gbps(max_ms), gbps(min_ms)
.. GENERATED FROM PYTHON SOURCE LINES 128-130
We can now run the decorated function above. Pass `print_data=True` to see the performance numbers, `show_plots=True` to plot them, and/or
`save_path='/path/to/results/'` to save them to disk along with raw CSV data.
.. GENERATED FROM PYTHON SOURCE LINES 130-131
.. code-block:: default
benchmark.run(print_data=True, show_plots=True)
.. image:: /getting-started/tutorials/images/sphx_glr_01-vector-add_001.png
:alt: 01 vector add
:class: sphx-glr-single-img
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
vector-add-performance:
size Triton Torch
0 4096.0 9.600000 9.600000
1 8192.0 19.200000 19.200000
2 16384.0 38.400001 38.400001
3 32768.0 63.999998 76.800002
4 65536.0 127.999995 127.999995
5 131072.0 219.428568 219.428568
6 262144.0 341.333321 384.000001
7 524288.0 472.615390 472.615390
8 1048576.0 614.400016 614.400016
9 2097152.0 722.823517 722.823517
10 4194304.0 780.190482 780.190482
11 8388608.0 812.429770 812.429770
12 16777216.0 833.084721 833.084721
13 33554432.0 842.004273 842.004273
14 67108864.0 847.448255 848.362445
15 134217728.0 849.737435 850.656574
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 1 minutes 46.338 seconds)
.. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: 01-vector-add.py <01-vector-add.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: 01-vector-add.ipynb <01-vector-add.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_


@@ -0,0 +1,345 @@
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/02-fused-softmax.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_getting-started_tutorials_02-fused-softmax.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_getting-started_tutorials_02-fused-softmax.py:
Fused Softmax
=================
In this tutorial, you will write a fused softmax operation that is significantly faster
than PyTorch's native op for a particular class of matrices: those whose rows can fit in
the GPU's SRAM.
You will learn about:
- The benefits of kernel fusion for bandwidth-bound operations.
- Reduction operators in Triton.
.. GENERATED FROM PYTHON SOURCE LINES 14-18
Motivations
------------
Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
Let us consider instead the case of a simple (numerically stabilized) softmax operation:
.. GENERATED FROM PYTHON SOURCE LINES 18-43
.. code-block:: default
import torch
@torch.jit.script
def naive_softmax(x):
"""Compute row-wise softmax of X using native pytorch
We subtract the maximum element in order to avoid overflows. Softmax is invariant to
this shift.
"""
# read MN elements ; write M elements
x_max = x.max(dim=1)[0]
# read MN + M elements ; write MN elements
z = x - x_max[:, None]
# read MN elements ; write MN elements
numerator = torch.exp(z)
# read MN elements ; write M elements
denominator = numerator.sum(dim=1)
# read MN + M elements ; write MN elements
ret = numerator / denominator[:, None]
# in total: read 5MN + 2M elements ; wrote 3MN + 2M elements
return ret
.. GENERATED FROM PYTHON SOURCE LINES 44-52
When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}`
requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements.
This is obviously wasteful; we'd prefer to have a custom "fused" kernel that only reads
X once and does all the necessary computations on-chip.
Doing so would require reading and writing back only :math:`MN` bytes, so we could
expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`).
The `torch.jit.script` flags aims to perform this kind of "kernel fusion" automatically
but, as we will see later, it is still far from ideal.
.. GENERATED FROM PYTHON SOURCE LINES 54-61
Compute Kernel
----------------
Our softmax kernel works as follows: each program loads a row of the input matrix X,
normalizes it and writes back the result to the output Y.
Note that one important limitation of Triton is that each block must have a
power-of-two number of elements, so we need to internally "pad" each row and guard the
memory operations properly if we want to handle any possible input shapes:
.. GENERATED FROM PYTHON SOURCE LINES 61-93
.. code-block:: default
import triton
import triton.language as tl
@triton.jit
def softmax_kernel(
output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta
):
# The rows of the softmax are independent, so we parallelize across those
row_idx = tl.program_id(0)
BLOCK_SIZE = meta['BLOCK_SIZE']
# The stride represents how much we need to increase the pointer to advance 1 row
row_start_ptr = input_ptr + row_idx * input_row_stride
# The block size is the next power of two greater than n_cols, so we can fit each
# row in a single block
col_offsets = tl.arange(0, BLOCK_SIZE)
input_ptrs = row_start_ptr + col_offsets
# Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols
row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
# Subtract maximum for numerical stability
row_minus_max = row - tl.max(row, axis=0)
# Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)
numerator = tl.exp(row_minus_max)
denominator = tl.sum(numerator, axis=0)
softmax_output = numerator / denominator
# Write back output to DRAM
output_row_start_ptr = output_ptr + row_idx * output_row_stride
output_ptrs = output_row_start_ptr + col_offsets
tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols)
.. GENERATED FROM PYTHON SOURCE LINES 94-95
We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor.
.. GENERATED FROM PYTHON SOURCE LINES 95-125
.. code-block:: default
def softmax(x):
n_rows, n_cols = x.shape
# The block size is the smallest power of two greater than the number of columns in `x`
BLOCK_SIZE = triton.next_power_of_2(n_cols)
# Another trick we can use is to ask the compiler to use more threads per row by
# increasing the number of warps (`num_warps`) over which each row is distributed.
# You will see in the next tutorial how to auto-tune this value in a more natural
# way so you don't have to come up with manual heuristics yourself.
num_warps = 4
if BLOCK_SIZE >= 2048:
num_warps = 8
if BLOCK_SIZE >= 4096:
num_warps = 16
# Allocate output
y = torch.empty_like(x)
# Enqueue kernel. The 1D launch grid is simple: we have one kernel instance per row
# of the input matrix
softmax_kernel[(n_rows,)](
y,
x,
x.stride(0),
y.stride(0),
n_cols,
num_warps=num_warps,
BLOCK_SIZE=BLOCK_SIZE,
)
return y
.. GENERATED FROM PYTHON SOURCE LINES 126-128
Unit Test
----------
.. GENERATED FROM PYTHON SOURCE LINES 130-132
We make sure that we test our kernel on a matrix with an irregular number of rows and columns.
This will allow us to verify that our padding mechanism works.
.. GENERATED FROM PYTHON SOURCE LINES 132-139
.. code-block:: default
torch.manual_seed(0)
x = torch.randn(1823, 781, device='cuda')
y_triton = softmax(x)
y_torch = torch.softmax(x, axis=1)
print(torch.allclose(y_triton, y_torch))
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
True
.. GENERATED FROM PYTHON SOURCE LINES 140-141
As expected, the results are identical.
.. GENERATED FROM PYTHON SOURCE LINES 143-147
Benchmark
-------------
Here we will benchmark our operation as a function of the number of columns in the input matrix -- assuming 4096 rows.
We will then compare its performance against (1) :code:`torch.softmax` and (2) the :code:`naive_softmax` defined above.
.. GENERATED FROM PYTHON SOURCE LINES 147-186
.. code-block:: default
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['N'], # argument names to use as an x-axis for the plot
x_vals=[
128 * i for i in range(2, 100)
], # different possible values for `x_name`
line_arg='provider', # argument name whose value corresponds to a different line in the plot
line_vals=[
'triton',
'torch-native',
'torch-jit',
], # possible values for `line_arg`
line_names=[
"Triton",
"Torch (native)",
"Torch (jit)",
], # label name for the lines
styles=[('blue', '-'), ('green', '-'), ('green', '--')], # line styles
ylabel="GB/s", # label name for the y-axis
plot_name="softmax-performance", # name for the plot. Used also as a file name for saving the plot.
args={'M': 4096}, # values for function arguments not in `x_names` and `y_name`
)
)
def benchmark(M, N, provider):
x = torch.randn(M, N, device='cuda', dtype=torch.float32)
if provider == 'torch-native':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: softmax(x))
if provider == 'torch-jit':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: naive_softmax(x))
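        # effective bandwidth: each call is credited with one read and one write of the full
        # MN-element matrix, i.e. 2 * nelement * element_size bytes, divided by the runtime in seconds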
gbps = lambda ms: 2 * x.nelement() * x.element_size() * 1e-9 / (ms * 1e-3)
return gbps(ms), gbps(max_ms), gbps(min_ms)
benchmark.run(show_plots=True, print_data=True)
.. image:: /getting-started/tutorials/images/sphx_glr_02-fused-softmax_001.png
:alt: 02 fused softmax
:class: sphx-glr-single-img
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
softmax-performance:
N Triton Torch (native) Torch (jit)
0 256.0 512.000001 546.133347 190.511628
1 384.0 585.142862 585.142862 153.600004
2 512.0 655.360017 606.814814 154.566038
3 640.0 682.666684 640.000002 160.000000
4 768.0 722.823517 664.216187 162.754967
.. ... ... ... ...
93 12160.0 814.058574 406.179533 198.936606
94 12288.0 814.111783 415.661740 199.197579
95 12416.0 812.498981 412.149375 198.755369
96 12544.0 812.566838 412.971190 199.012395
97 12672.0 812.633240 412.097543 199.069228
[98 rows x 4 columns]
.. GENERATED FROM PYTHON SOURCE LINES 187-192
In the above plot, we can see that:
- Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
- Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
Note however that the PyTorch `softmax` operation is more general and works on tensors of any shape.
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 3 minutes 24.992 seconds)
.. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: 02-fused-softmax.py <02-fused-softmax.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: 02-fused-softmax.ipynb <02-fused-softmax.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_


@@ -0,0 +1,533 @@
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/03-matrix-multiplication.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_getting-started_tutorials_03-matrix-multiplication.py:
Matrix Multiplication
======================
In this tutorial, you will write a 25-line high-performance FP16 matrix multiplication
kernel that achieves performance on par with cuBLAS.
You will specifically learn about:
- Block-level matrix multiplications
- Multi-dimensional pointer arithmetic
- Program re-ordering for improved L2 cache hit rate
- Automatic performance tuning
.. GENERATED FROM PYTHON SOURCE LINES 15-42
Motivations
-------------
Matrix multiplications are a key building block of most modern high-performance computing systems.
They are notoriously hard to optimize, hence their implementation is generally done by
hardware vendors themselves as part of so-called "kernel libraries" (e.g., cuBLAS).
Unfortunately, these libraries are often proprietary and cannot be easily customized
to accommodate the needs of modern deep learning workloads (e.g., fused activation functions).
In this tutorial, you will learn how to implement efficient matrix multiplications by
yourself with Triton, in a way that is easy to customize and extend.
Roughly speaking, the kernel that we will write will implement the following blocked
algorithm to multiply a (M, K) by a (K, N) matrix:
.. code-block:: python
# do in parallel
for m in range(0, M, BLOCK_SIZE_M):
# do in parallel
for n in range(0, N, BLOCK_SIZE_N):
acc = zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=float32)
for k in range(0, K, BLOCK_SIZE_K):
a = A[m : m+BLOCK_SIZE_M, k : k+BLOCK_SIZE_K]
b = B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]
acc += dot(a, b)
C[m : m+BLOCK_SIZE_M, n : n+BLOCK_SIZE_N] = acc;
where each iteration of the doubly-nested for-loop is performed by a dedicated Triton program instance.
.. GENERATED FROM PYTHON SOURCE LINES 44-137
Compute Kernel
----------------
The above algorithm is, actually, fairly straightforward to implement in Triton.
The main difficulty comes from the computation of the memory locations at which blocks
of :code:`A` and :code:`B` must be read in the inner loop. For that, we need
multi-dimensional pointer arithmetics.
Pointer Arithmetics
~~~~~~~~~~~~~~~~~~~~
For a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given
by :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`.
Therefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and
:code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as:
.. code-block:: python
&A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] = a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1);
&B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] = b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1);
Which means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as:
.. code-block:: python
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
And then updated in the inner loop as follows:
.. code-block:: python
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
L2 Cache Optimizations
~~~~~~~~~~~~~~~~~~~~~~~~
As mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]`
block of :code:`C`.
It is important to remember that the order in which these blocks are computed does
matter, since it affects the L2 cache hit rate of our program, and unfortunately, a
simple row-major ordering
.. code-block:: Python
pid = triton.program_id(0);
grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M;
grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N;
pid_m = pid / grid_n;
pid_n = pid % grid_n;
is just not going to cut it.
One possible solution is to launch blocks in an order that promotes data reuse.
This can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before
switching to the next column:
.. code-block:: python
# program ID
pid = tl.program_id(axis=0)
# number of program ids along the M axis
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
# number of program ids along the N axis
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
# number of programs in group
num_pid_in_group = GROUP_SIZE_M * num_pid_n
# id of the group this program is in
group_id = pid // num_pid_in_group
# row-id of the first program in the group
first_pid_m = group_id * GROUP_SIZE_M
# if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
# *within groups*, programs are ordered in a column-major order
# row-id of the program in the *launch grid*
pid_m = first_pid_m + (pid % group_size_m)
# col-id of the program in the *launch grid*
pid_n = (pid % num_pid_in_group) // group_size_m
For example, in the following matmul where each matrix is 9 blocks by 9 blocks,
we can see that if we compute the output in row-major ordering, we need to load 90
blocks into SRAM to compute the first 9 output blocks, but if we do it in grouped
ordering, we only need to load 54 blocks.
.. image:: grouped_vs_row_major_ordering.png
In practice, this can improve the performance of our matrix multiplication kernel by
more than 10\% on some hardware architectures (e.g., 220 to 245 TFLOPS on A100).
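These counts are easy to verify with a minimal sketch (illustrative only, not part of the tutorial kernels):
each output block consumes one full block-row of :code:`A` and one full block-column of :code:`B`, so we can
simply count the distinct blocks touched by the first 9 output blocks under each ordering.
.. code-block:: python
    def blocks_loaded(output_blocks, num_k_blocks=9):
        # count the distinct A- and B-blocks that must be brought into SRAM
        loads = set()
        for m, n in output_blocks:
            for k in range(num_k_blocks):
                loads.add(('A', m, k))  # block-row m of A
                loads.add(('B', k, n))  # block-column n of B
        return len(loads)
    # first 9 output blocks, row-major ordering: one full row of C blocks
    row_major = [(0, n) for n in range(9)]
    # first 9 output blocks, grouped ordering with GROUP_SIZE_M = 3: a 3x3 tile of C blocks
    grouped = [(m, n) for m in range(3) for n in range(3)]
    print(blocks_loaded(row_major), blocks_loaded(grouped))  # prints: 90 54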
.. GENERATED FROM PYTHON SOURCE LINES 139-142
Final Result
-------------
.. GENERATED FROM PYTHON SOURCE LINES 142-262
.. code-block:: default
import torch
import triton
import triton.language as tl
# %
# :code:`triton.jit`'ed functions can be auto-tuned by using the `triton.autotune`
# decorator, which consumes:
# - A list of :code:`triton.Config` objects that define different configurations of
# meta-parameters (e.g., BLOCK_SIZE_M) and compilation options (e.g., num_warps) to try
# - An autotuning *key* whose change in values will trigger evaluation of all the
# provided configs
@triton.autotune(
configs=[
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
triton.Config({'BLOCK_SIZE_M': 32 , 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
],
key=['M', 'N', 'K'],
)
# %
# We can now define our kernel as normal, using all the techniques presented above
@triton.jit
def matmul_kernel(
# Pointers to matrices
a_ptr, b_ptr, c_ptr,
# Matrix dimensions
M, N, K,
# The stride variables represent how much to increase the ptr by when moving by 1
# element in a particular dimension. E.g. stride_am is how much to increase a_ptr
# by to get the element one row down (A has M rows)
stride_am, stride_ak,
stride_bk, stride_bn,
stride_cm, stride_cn,
# Meta-parameters
**meta,
):
"""Kernel for computing the matmul C = A x B.
A has shape (M, K), B has shape (K, N) and C has shape (M, N)
"""
# extract meta-parameters
BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
BLOCK_SIZE_K = meta['BLOCK_SIZE_K']
GROUP_SIZE_M = 8
# -----------------------------------------------------------
# Map program ids `pid` to the block of C it should compute.
# This is done in a grouped ordering to promote L2 data reuse
# See above `L2 Cache Optimizations` section for details
pid = tl.program_id(axis=0)
num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
num_pid_in_group = GROUP_SIZE_M * num_pid_n
group_id = pid // num_pid_in_group
first_pid_m = group_id * GROUP_SIZE_M
group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
pid_m = first_pid_m + (pid % group_size_m)
pid_n = (pid % num_pid_in_group) // group_size_m
# ----------------------------------------------------------
# Create pointers for the first blocks of A and B.
# We will advance this pointer as we move in the K direction
# and accumulate
# a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
# b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers
# see above `Pointer Arithmetics` section for details
offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
offs_k = tl.arange(0, BLOCK_SIZE_K)
a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
# -----------------------------------------------------------
# Iterate to compute a block of the C matrix
# We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block
# of fp32 values for higher accuracy.
# `accumulator` will be converted back to fp16 after the loop
accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for k in range(0, K, BLOCK_SIZE_K):
# Note that for simplicity, we don't apply a mask here.
# This means that if K is not a multiple of BLOCK_SIZE_K,
# this will access out-of-bounds memory and produce an
# error or (worse!) incorrect results.
a = tl.load(a_ptrs)
b = tl.load(b_ptrs)
# We accumulate along the K dimension
accumulator += tl.dot(a, b)
# Advance the ptrs to the next K block
a_ptrs += BLOCK_SIZE_K * stride_ak
b_ptrs += BLOCK_SIZE_K * stride_bk
# you can fuse arbitrary activation functions here
# while the accumulator is still in FP32 !
if meta['ACTIVATION']:
accumulator = meta['ACTIVATION'](accumulator)
c = accumulator.to(tl.float16)
# -----------------------------------------------------------
# Write back the block of the output matrix C
offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
tl.store(c_ptrs, c, mask=c_mask)
# we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `matmul`
@triton.jit
def leaky_relu(x):
return tl.where(x >= 0, x, 0.01 * x)
.. GENERATED FROM PYTHON SOURCE LINES 263-265
We can now create a convenience wrapper function that only takes two input tensors
and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel
.. GENERATED FROM PYTHON SOURCE LINES 265-294
.. code-block:: default
def matmul(a, b, activation=None):
# checks constraints
assert a.shape[1] == b.shape[0], "incompatible dimensions"
assert a.is_contiguous(), "matrix A must be contiguous"
assert b.is_contiguous(), "matrix B must be contiguous"
M, K = a.shape
K, N = b.shape
assert (
K % 32 == 0
), "We don't check memory-out-of-bounds with K so K must be divisible by BLOCK_SIZE_K"
# allocates output
c = torch.empty((M, N), device=a.device, dtype=a.dtype)
# 1D launch kernel where each block gets its own program.
grid = lambda META: (
triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']),
)
matmul_kernel[grid](
a, b, c,
M, N, K,
a.stride(0), a.stride(1),
b.stride(0), b.stride(1),
c.stride(0), c.stride(1),
ACTIVATION=activation,
)
return c
.. GENERATED FROM PYTHON SOURCE LINES 295-299
Unit Test
-----------
We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS)
.. GENERATED FROM PYTHON SOURCE LINES 299-312
.. code-block:: default
torch.manual_seed(0)
a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
triton_output = matmul(a, b, activation=None)
torch_output = torch.matmul(a, b)
print(f"triton_output={triton_output}")
print(f"torch_output={torch_output}")
if triton.testing.allclose(triton_output, torch_output):
print("✅ Triton and Torch match")
else:
print("❌ Triton and Torch differ")
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
triton_output=tensor([[ 1.1045, -36.9688, 31.4688, ..., -11.3984, 24.4531, -32.3438],
[ 6.3555, -19.6094, 34.0938, ..., -5.8945, 5.2891, 6.8867],
[-32.0625, 5.9492, 15.3984, ..., -21.3906, -23.9844, -10.1328],
...,
[ -5.7031, 7.4492, 8.2656, ..., -10.6953, -40.0000, 17.7500],
[ 25.5000, 24.3281, -8.4688, ..., -18.9375, 32.5312, -29.9219],
[ -5.3477, 4.9844, 11.8906, ..., 5.5898, 6.4023, -17.3125]],
device='cuda:0', dtype=torch.float16)
torch_output=tensor([[ 1.1045, -36.9688, 31.4688, ..., -11.3906, 24.4531, -32.3438],
[ 6.3516, -19.6094, 34.0938, ..., -5.8906, 5.2812, 6.8828],
[-32.0625, 5.9531, 15.3984, ..., -21.4062, -23.9844, -10.1328],
...,
[ -5.7070, 7.4492, 8.2656, ..., -10.6953, -40.0000, 17.7500],
[ 25.5000, 24.3438, -8.4609, ..., -18.9375, 32.5312, -29.9219],
[ -5.3477, 4.9805, 11.8828, ..., 5.5859, 6.4023, -17.3125]],
device='cuda:0', dtype=torch.float16)
✅ Triton and Torch match
.. GENERATED FROM PYTHON SOURCE LINES 313-319
Benchmark
--------------
Square Matrix Performance
~~~~~~~~~~~~~~~~~~~~~~~~~~
We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, but feel free to arrange this script as you wish to benchmark any other matrix shape.
.. GENERATED FROM PYTHON SOURCE LINES 319-360
.. code-block:: default
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['M', 'N', 'K'], # argument names to use as an x-axis for the plot
x_vals=[
128 * i for i in range(2, 33)
], # different possible values for `x_name`
line_arg='provider', # argument name whose value corresponds to a different line in the plot
# possible values for `line_arg`
line_vals=['cublas', 'cublas + relu', 'triton', 'triton + relu'],
# label name for the lines
line_names=["cuBLAS", "cuBLAS (+ torch.nn.LeakyReLU)", "Triton", "Triton (+ LeakyReLU)"],
# line styles
styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
ylabel="TFLOPS", # label name for the y-axis
plot_name="matmul-performance", # name for the plot. Used also as a file name for saving the plot.
args={},
)
)
def benchmark(M, N, K, provider):
a = torch.randn((M, K), device='cuda', dtype=torch.float16)
b = torch.randn((K, N), device='cuda', dtype=torch.float16)
if provider == 'cublas':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b))
if provider == 'triton':
ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b))
if provider == 'cublas + relu':
torch_relu = torch.nn.ReLU(inplace=True)
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: torch_relu(torch.matmul(a, b))
)
if provider == 'triton + relu':
ms, min_ms, max_ms = triton.testing.do_bench(
lambda: matmul(a, b, activation=leaky_relu)
)
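    # an (M, K) x (K, N) matmul performs 2 * M * N * K floating-point ops (one multiply and one
    # add per term); dividing by the runtime in seconds and scaling by 1e-12 gives TFLOPS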
perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
return perf(ms), perf(max_ms), perf(min_ms)
benchmark.run(show_plots=True, print_data=True)
.. image:: /getting-started/tutorials/images/sphx_glr_03-matrix-multiplication_001.png
:alt: 03 matrix multiplication
:class: sphx-glr-single-img
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
matmul-performance:
M cuBLAS ... Triton Triton (+ LeakyReLU)
0 256.0 2.730667 ... 3.276800 2.978909
1 384.0 7.372800 ... 8.507077 8.507077
2 512.0 14.563555 ... 16.384000 16.384000
3 640.0 22.260869 ... 24.380953 24.380953
4 768.0 32.768000 ... 34.028308 34.028308
5 896.0 39.025776 ... 40.140799 39.025776
6 1024.0 51.150050 ... 52.428801 52.428801
7 1152.0 45.242181 ... 46.656000 46.656000
8 1280.0 51.200001 ... 56.888887 56.888887
9 1408.0 64.138541 ... 67.305878 67.305878
10 1536.0 79.526831 ... 79.526831 79.526831
11 1664.0 62.929456 ... 62.492442 62.061463
12 1792.0 72.983276 ... 72.512412 72.047592
13 1920.0 68.776119 ... 70.172588 70.172588
14 2048.0 73.584279 ... 76.959706 76.959706
15 2176.0 83.500614 ... 86.367588 85.632545
16 2304.0 68.643310 ... 76.319081 76.319081
17 2432.0 71.125224 ... 82.147552 84.367759
18 2560.0 77.283019 ... 80.908642 80.511054
19 2688.0 83.369354 ... 89.676257 89.464755
20 2816.0 83.552120 ... 82.916747 83.233226
21 2944.0 82.373605 ... 80.771529 82.921853
22 3072.0 81.589488 ... 88.612060 88.612060
23 3200.0 82.368085 ... 94.674553 94.117647
24 3328.0 82.369902 ... 83.275067 80.347427
25 3456.0 81.849303 ... 84.775569 90.382926
26 3584.0 87.381330 ... 98.375705 98.160909
27 3712.0 83.386762 ... 88.718781 83.074717
28 3840.0 84.356981 ... 92.390975 84.940091
29 3968.0 91.885495 ... 84.096442 88.615785
30 4096.0 86.480498 ... 92.755862 85.271746
[31 rows x 5 columns]
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 5 minutes 35.980 seconds)
.. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: 03-matrix-multiplication.py <03-matrix-multiplication.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: 03-matrix-multiplication.ipynb <03-matrix-multiplication.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_


@@ -0,0 +1,269 @@
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/04-low-memory-dropout.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_getting-started_tutorials_04-low-memory-dropout.py:
Low-Memory Dropout
===================
In this tutorial, you will write a memory-efficient implementation of dropout whose state
will be composed of a single int32 seed. This differs from more traditional implementations of dropout,
whose state is generally composed of a bit mask tensor of the same shape as the input. You will learn about:
- The limitations of naive implementations of Dropout with PyTorch
- Parallel pseudo-random number generation in Triton
.. GENERATED FROM PYTHON SOURCE LINES 14-29
Baseline
-------------
The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance
of deep neural networks in low-data regimes (i.e., regularization).
It takes a vector as input and produces a vector of the same shape as output. Each scalar in the
output has a probability :math:`p` of being changed to zero and otherwise it is copied from the input.
This forces the network to perform well even when only :math:`1 - p` scalars from the input are available.
At evaluation time we want to use the full power of the network so we set :math:`p=0`. Naively this would
increase the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease
in the output softmax temperature). To prevent this we multiply the output by :math:`\frac{1}{1 - p}`, which
keeps the norm consistent regardless of the dropout probability.
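For instance, in expectation each output element satisfies :math:`\mathrm{E}[y_i] = (1 - p)\cdot\frac{x_i}{1 - p} + p\cdot 0 = x_i`, so the rescaling exactly compensates for the zeroed elements.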
Let's first take a look at the baseline implementation.
.. GENERATED FROM PYTHON SOURCE LINES 29-80
.. code-block:: default
import tabulate
import torch
import triton
import triton.language as tl
@triton.jit
def _dropout(
x_ptr, # pointer to the input
x_keep_ptr, # pointer to a mask of 0s and 1s
output_ptr, # pointer to the output
n_elements, # number of elements in the `x` tensor
p, # probability that an element of `x` is changed to zero
**meta,
):
BLOCK_SIZE = meta['BLOCK_SIZE']
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
# Load data
x = tl.load(x_ptr + offsets, mask=mask)
x_keep = tl.load(x_keep_ptr + offsets, mask=mask)
# The line below is the crucial part, described in the paragraph above!
output = tl.where(x_keep, x / (1 - p), 0.0)
# Write-back output
tl.store(output_ptr + offsets, output, mask=mask)
def dropout(x, x_keep, p):
output = torch.empty_like(x)
assert x.is_contiguous()
n_elements = x.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)
return output
# Input tensor
x = torch.randn(size=(10,)).cuda()
# Dropout mask
p = 0.5
x_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda()
#
output = dropout(x, x_keep=x_keep, p=p)
print(tabulate.tabulate([
["input"] + x.tolist(),
["keep mask"] + x_keep.tolist(),
["output"] + output.tolist()
]))
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
--------- ------- --------- -------- -------- -------- -------- -------- -------- --------- ---------
input 1.541 -0.293429 -2.17879 0.568431 -1.08452 -1.3986 0.403347 0.838026 -0.719258 -0.403344
keep mask 1 1 0 1 0 1 1 0 0 0
output 3.08199 -0.586858 0 1.13686 0 -2.79719 0.806694 0 0 0
--------- ------- --------- -------- -------- -------- -------- -------- -------- --------- ---------
.. GENERATED FROM PYTHON SOURCE LINES 81-99
Seeded dropout
---------------
The above implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly,
we need to store the dropout mask for backpropagation. Secondly, dropout state management can get
very tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in
https://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation
that (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management
of persisting randomness across multiple invocations of the kernel.
Pseudorandom number generation in Triton is simple! In this tutorial we will use the
:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32`
values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides
other :ref:`random number generation strategies <Random Number Generation>`.
.. note::
Triton's implementation of PRNG is based on the Philox algorithm (described in [SALMON2011]_).
Let's put it all together.
.. GENERATED FROM PYTHON SOURCE LINES 99-147
.. code-block:: default
@triton.jit
def _seeded_dropout(
x_ptr,
output_ptr,
n_elements,
p,
seed,
**meta,
):
# compute memory offsets of elements handled by this instance
BLOCK_SIZE = meta['BLOCK_SIZE']
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE
offsets = block_start + tl.arange(0, BLOCK_SIZE)
# load data from x
mask = offsets < n_elements
x = tl.load(x_ptr + offsets, mask=mask)
# randomly prune it
random = tl.rand(seed, offsets)
x_keep = random > p
# write-back
output = tl.where(x_keep, x / (1 - p), 0.0)
tl.store(output_ptr + offsets, output, mask=mask)
def seeded_dropout(x, p, seed):
output = torch.empty_like(x)
assert x.is_contiguous()
n_elements = x.numel()
grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
_seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024)
return output
x = torch.randn(size=(10,)).cuda()
# Compare this to the baseline - dropout mask is never instantiated!
output = seeded_dropout(x, p=0.5, seed=123)
output2 = seeded_dropout(x, p=0.5, seed=123)
output3 = seeded_dropout(x, p=0.5, seed=512)
print(tabulate.tabulate([
["input"] + x.tolist(),
["output (seed = 123)"] + output.tolist(),
["output (seed = 123)"] + output2.tolist(),
["output (seed = 512)"] + output3.tolist()
]))
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
------------------- --------- -------- -------- ------- -------- -------- --------- --------- --------- ---------
input -0.952835 0.371721 0.408716 1.42142 0.149397 -0.67086 -0.214186 -0.431969 -0.707878 -0.106434
output (seed = 123) 0 0.743443 0 0 0 -1.34172 0 0 -1.41576 -0.212868
output (seed = 123) 0 0.743443 0 0 0 -1.34172 0 0 -1.41576 -0.212868
output (seed = 512) 0 0 0.817432 2.84284 0 -1.34172 -0.428372 0 0 0
------------------- --------- -------- -------- ------- -------- -------- --------- --------- --------- ---------
.. GENERATED FROM PYTHON SOURCE LINES 148-151
Et voilà! We have a Triton kernel that applies the same dropout mask provided the seed is the same!
If you'd like to explore further applications of pseudorandomness in GPU programming, we encourage you
to explore the `triton/language/random` folder!
.. GENERATED FROM PYTHON SOURCE LINES 153-158
Exercises
-------------
1. Extend the kernel to operate over a matrix and use a vector of seeds - one per row.
2. Add support for striding.
3. (challenge) Implement a kernel for the sparse Johnson-Lindenstrauss transform that generates the projection matrix on the fly each time using a seed.
.. GENERATED FROM PYTHON SOURCE LINES 160-165
References
--------------
.. [SALMON2011] John K. Salmon, Mark A. Moraes, Ron O. Dror, and David E. Shaw, "Parallel Random Numbers: As Easy as 1, 2, 3", 2011
.. [SRIVASTAVA2014] Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 0 minutes 0.010 seconds)
.. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: 04-low-memory-dropout.py <04-low-memory-dropout.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: 04-low-memory-dropout.ipynb <04-low-memory-dropout.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_


@@ -0,0 +1,360 @@
.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/05-layer-norm.py"
.. LINE NUMBERS ARE GIVEN BELOW.
.. only:: html
.. note::
:class: sphx-glr-download-link-note
Click :ref:`here <sphx_glr_download_getting-started_tutorials_05-layer-norm.py>`
to download the full example code
.. rst-class:: sphx-glr-example-title
.. _sphx_glr_getting-started_tutorials_05-layer-norm.py:
Layer Normalization
====================
.. GENERATED FROM PYTHON SOURCE LINES 5-252
.. image:: /getting-started/tutorials/images/sphx_glr_05-layer-norm_001.png
:alt: 05 layer norm
:class: sphx-glr-single-img
.. rst-class:: sphx-glr-script-out
Out:
.. code-block:: none
layer-norm-backward:
N Triton Torch Apex
0 1024.0 311.088617 98.303995 303.407414
1 1536.0 351.085717 134.050910 341.333333
2 2048.0 423.724127 161.684218 334.367350
3 2560.0 465.454542 181.238943 330.322572
4 3072.0 511.999982 191.999993 320.556515
5 3584.0 551.384634 208.271186 310.527060
6 4096.0 568.231237 219.919464 298.796351
7 4608.0 500.416301 232.825259 286.507772
8 5120.0 525.128191 242.845844 284.444444
9 5632.0 540.671974 243.107920 289.438969
10 6144.0 544.118087 248.242431 285.767458
11 6656.0 532.479975 256.000009 285.767438
12 7168.0 505.976473 260.260201 286.242939
13 7680.0 481.253256 262.190612 279.272719
14 8192.0 463.698115 267.130429 284.526763
15 8704.0 416.958106 267.472468 284.987724
16 9216.0 430.319054 272.394084 288.751954
17 9728.0 438.857162 280.278512 289.667485
18 10240.0 447.650282 286.767793 290.496460
19 10752.0 428.651173 246.935876 290.594591
20 11264.0 429.786952 245.536784 286.676558
21 11776.0 423.089806 249.888595 288.686414
22 12288.0 420.102570 254.673582 294.617366
23 12800.0 414.574901 253.674644 288.450715
24 13312.0 412.242569 252.759501 289.916513
25 13824.0 406.090579 257.190689 292.056329
26 14336.0 396.387109 254.297107 286.719986
27 14848.0 386.498925 257.665934 289.481735
28 15360.0 373.495460 257.970599 287.326580
29 15872.0 370.192407 261.806182 289.899545
|
.. code-block:: default
import torch
import triton.language as tl
import triton
# Forward Pass
@triton.jit
def _layer_norm_fwd_fused(X, Y, W, B, M, V, stride, N, eps, **META):
BLOCK_SIZE = META['BLOCK_SIZE']
# position of elements processed by this program
row = tl.program_id(0)
cols = tl.arange(0, BLOCK_SIZE)
mask = cols < N
# offset data pointers to start at the row of interest
X += row * stride
Y += row * stride
# load data and cast to float32
x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
# compute mean
mean = tl.sum(x, axis=0) / N
# compute std
xmean = tl.where(mask, x - mean, 0.)
var = tl.sum(xmean * xmean, axis=0) / N
rstd = 1 / tl.sqrt(var + eps)
xhat = xmean*rstd
# write-back mean/rstd
tl.store(M + row, mean)
tl.store(V + row, rstd)
# multiply by weight and add bias
w = tl.load(W + cols, mask=mask)
b = tl.load(B + cols, mask=mask)
y = xhat * w + b
# write-back
tl.store(Y + cols, y, mask=mask)
# Backward pass (DX + partial DW + partial DB)
@triton.jit
def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock,
stride, N, eps,
**META):
GROUP_SIZE_M = META['GROUP_SIZE_M']
BLOCK_SIZE_N = META['BLOCK_SIZE_N']
# position of elements processed by this program
row = tl.program_id(0)
cols = tl.arange(0, BLOCK_SIZE_N)
mask = cols < N
# offset data pointers to start at the row of interest
X += row * stride
DY += row * stride
DX += row * stride
# offset locks and weight/bias gradient pointer
# each kernel instance accumulates partial sums for
# DW and DB into one of GROUP_SIZE_M independent buffers
# these buffers stay in the L2 cache, which allows this kernel
# to be fast
lock_id = row % GROUP_SIZE_M
Lock += lock_id
Count = Lock + GROUP_SIZE_M
DW = DW + lock_id*N + cols
DB = DB + lock_id*N + cols
# load data to SRAM
x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
w = tl.load(W + cols, mask=mask).to(tl.float32)
mean = tl.load(M + row)
rstd = tl.load(V + row)
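    # gradient of y = xhat * w + b with xhat = (x - mean) * rstd:
    #   dx = rstd * (w*dy - xhat * mean(xhat * w*dy) - mean(w*dy))
    # where mean(.) averages over the N valid columns of this row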
# compute dx
xhat = (x - mean)*rstd
wdy = w * dy
xhat = tl.where(mask, xhat, 0.)
    wdy = tl.where(mask, wdy, 0.)
mean1 = tl.sum(xhat * wdy, axis=0) / N
mean2 = tl.sum(wdy, axis=0) / N
dx = (wdy - (xhat*mean1 + mean2))*rstd
# write-back dx
tl.store(DX + cols, dx, mask=mask)
# accumulate partial sums for dw/db
partial_dw = (dy*xhat).to(w.dtype)
partial_db = (dy).to(w.dtype)
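    # acquire this group's spinlock: atomic_cas returns the previous value,
    # so we spin until we are the program that flipped Lock from 0 to 1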
while tl.atomic_cas(Lock, 0, 1) == 1:
pass
count = tl.load(Count)
# first store doesn't accumulate
if count == 0:
tl.atomic_xchg(Count, 1)
else:
partial_dw += tl.load(DW, mask=mask)
partial_db += tl.load(DB, mask=mask)
tl.store(DW, partial_dw, mask=mask)
tl.store(DB, partial_db, mask=mask)
# release lock
tl.atomic_xchg(Lock, 0)
# Backward pass (total DW + total DB)
@triton.jit
def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N, **meta):
pid = tl.program_id(0)
BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
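    # each program owns one tile of BLOCK_SIZE_N columns and reduces all M
    # partial-sum rows (one per lock group) in chunks of BLOCK_SIZE_M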
cols = pid*BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
for i in range(0, M, BLOCK_SIZE_M):
rows = i + tl.arange(0, meta['BLOCK_SIZE_M'])
mask = (rows[:, None] < M) & (cols[None, :] < N)
offs = rows[:, None]*N + cols[None, :]
dw += tl.load(DW + offs, mask=mask, other=0.)
db += tl.load(DB + offs, mask=mask, other=0.)
sum_dw = tl.sum(dw, axis=0)
sum_db = tl.sum(db, axis=0)
tl.store(FINAL_DW + cols, sum_dw, mask=cols<N)
tl.store(FINAL_DB + cols, sum_db, mask=cols<N)
class LayerNorm(torch.autograd.Function):
@staticmethod
def forward(ctx, x, normalized_shape, weight, bias, eps):
# allocate output
y = torch.empty_like(x)
# reshape input data into 2D tensor
x_arg = x.reshape(-1, x.shape[-1])
M, N = x_arg.shape
mean = torch.empty((M, ), dtype=torch.float32, device='cuda')
rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')
# Less than 64KB per feature: enqueue fused kernel
MAX_FUSED_SIZE = 65536 // x.element_size()
BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
if N > BLOCK_SIZE:
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
# heuristics for number of warps
num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
# enqueue kernel
_layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,
x_arg.stride(0), N, eps,
BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
ctx.save_for_backward(x, weight, bias, mean, rstd)
ctx.BLOCK_SIZE = BLOCK_SIZE
ctx.num_warps = num_warps
ctx.eps = eps
return y
@staticmethod
def backward(ctx, dy):
x, w, b, m, v = ctx.saved_tensors
        # heuristic for the number of parallel reduction streams for DW/DB
N = w.shape[0]
GROUP_SIZE_M = 64
if N <= 8192: GROUP_SIZE_M = 96
if N <= 4096: GROUP_SIZE_M = 128
if N <= 1024: GROUP_SIZE_M = 256
# allocate output
locks = torch.zeros(2*GROUP_SIZE_M, dtype=torch.int32, device='cuda')
_dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
_db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
dx = torch.empty_like(dy)
# enqueue kernel using forward pass heuristics
# also compute partial sums for DW and DB
x_arg = x.reshape(-1, x.shape[-1])
M, N = x_arg.shape
_layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,
x_arg.stride(0), N, ctx.eps,
BLOCK_SIZE_N=ctx.BLOCK_SIZE,
GROUP_SIZE_M=GROUP_SIZE_M,
num_warps=ctx.num_warps)
grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]
# accumulate partial sums in separate kernel
_layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,
                                   BLOCK_SIZE_M=32,
                                   BLOCK_SIZE_N=128)
return dx, None, dw, db, None
layer_norm = LayerNorm.apply
def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
# create data
x_shape = (M, N)
w_shape = (x_shape[-1], )
weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
x = -2.3 + 0.5*torch.randn(x_shape, dtype=dtype, device='cuda')
dy = .1*torch.randn_like(x)
x.requires_grad_(True)
# forward pass
y_tri = layer_norm(x, w_shape, weight, bias, eps)
y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
# backward pass (triton)
y_tri.backward(dy, retain_graph=True)
dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]
x.grad, weight.grad, bias.grad = None, None, None
# backward pass (torch)
y_ref.backward(dy, retain_graph=True)
dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
# compare
triton.testing.assert_almost_equal(y_tri, y_ref)
triton.testing.assert_almost_equal(dx_tri, dx_ref)
triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)
triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)
@triton.testing.perf_report(
triton.testing.Benchmark(
x_names=['N'],
x_vals=[512 * i for i in range(2, 32)],
line_arg='provider',
line_vals=['triton', 'torch', 'apex'],
line_names=['Triton', 'Torch', 'Apex'],
styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
ylabel='GB/s',
plot_name='layer-norm-backward',
args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}
)
)
def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):
# create data
x_shape = (M, N)
w_shape = (x_shape[-1], )
weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
x = -2.3 + 0.5*torch.randn(x_shape, dtype=dtype, device='cuda')
dy = .1*torch.randn_like(x)
x.requires_grad_(True)
# utility functions
if provider == 'triton':
y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)
if provider == 'torch':
y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)
if provider == 'apex':
import apex
apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)
y_fwd = lambda: apex_layer_norm(x)
# forward pass
if mode == 'forward':
gbps = lambda ms: 2*x.numel()*x.element_size()/ms*1e-6
ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)
# backward pass
if mode == 'backward':
gbps = lambda ms: 3*x.numel()*x.element_size()/ms*1e-6
y = y_fwd()
ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),
grad_to_none=[x], rep=500)
return gbps(ms), gbps(max_ms), gbps(min_ms)
bench_layer_norm.run(save_path='.', print_data=True)
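The correctness test :code:`test_layer_norm` above is defined but never invoked in this listing. A minimal, hypothetical invocation (the shapes are illustrative only; any :code:`M` and :code:`N` whose rows fit within the 64KB-per-feature limit should work) might look like:

.. code-block:: python

    # sanity check: compare the Triton forward/backward kernels against
    # torch.nn.functional.layer_norm in fp16 (illustrative shapes)
    test_layer_norm(1151, 8192, torch.float16)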
.. rst-class:: sphx-glr-timing
**Total running time of the script:** ( 2 minutes 15.279 seconds)
.. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py:
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-example
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download Python source code: 05-layer-norm.py <05-layer-norm.py>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download Jupyter notebook: 05-layer-norm.ipynb <05-layer-norm.ipynb>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_

View File

@@ -0,0 +1,144 @@
:orphan:
.. _sphx_glr_getting-started_tutorials:
Tutorials
==================
Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="- The basic programming model of Triton - The triton.jit decorator, which is used to define Tri...">
.. only:: html
.. figure:: /getting-started/tutorials/images/thumb/sphx_glr_01-vector-add_thumb.png
:alt: Vector Addition
:ref:`sphx_glr_getting-started_tutorials_01-vector-add.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/getting-started/tutorials/01-vector-add
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="- The benefits of kernel fusion for bandwidth-bound operations. - Reduction operators in Triton...">
.. only:: html
.. figure:: /getting-started/tutorials/images/thumb/sphx_glr_02-fused-softmax_thumb.png
:alt: Fused Softmax
:ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/getting-started/tutorials/02-fused-softmax
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="- Block-level matrix multiplications - Multi-dimensional pointer arithmetic - Program re-orderi...">
.. only:: html
.. figure:: /getting-started/tutorials/images/thumb/sphx_glr_03-matrix-multiplication_thumb.png
:alt: Matrix Multiplication
:ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/getting-started/tutorials/03-matrix-multiplication
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="In this tutorial, you will write a memory-efficient implementation of dropout whose state will ...">
.. only:: html
.. figure:: /getting-started/tutorials/images/thumb/sphx_glr_04-low-memory-dropout_thumb.png
:alt: Low-Memory Dropout
:ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/getting-started/tutorials/04-low-memory-dropout
.. raw:: html
<div class="sphx-glr-thumbcontainer" tooltip="Layer Normalization">
.. only:: html
.. figure:: /getting-started/tutorials/images/thumb/sphx_glr_05-layer-norm_thumb.png
:alt: Layer Normalization
:ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py`
.. raw:: html
</div>
.. toctree::
:hidden:
/getting-started/tutorials/05-layer-norm
.. raw:: html
<div class="sphx-glr-clear"></div>
.. only :: html
.. container:: sphx-glr-footer
:class: sphx-glr-footer-gallery
.. container:: sphx-glr-download sphx-glr-download-python
:download:`Download all examples in Python source code: tutorials_python.zip </getting-started/tutorials/tutorials_python.zip>`
.. container:: sphx-glr-download sphx-glr-download-jupyter
:download:`Download all examples in Jupyter notebooks: tutorials_jupyter.zip </getting-started/tutorials/tutorials_jupyter.zip>`
.. only:: html
.. rst-class:: sphx-glr-signature
`Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_

View File

@@ -0,0 +1,20 @@
:orphan:
.. _sphx_glr_getting-started_tutorials_sg_execution_times:
Computation times
=================
**13:02.599** total execution time for **getting-started_tutorials** files:
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 05:35.980 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) | 03:24.992 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) | 02:15.279 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) | 01:46.338 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+
| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) | 00:00.010 | 0.0 MB |
+---------------------------------------------------------------------------------------------------------+-----------+--------+

Some files were not shown because too many files have changed in this diff.