diff --git a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
index d2f33ed8f..f16e090cb 100644
--- a/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
+++ b/_downloads/034d953b6214fedce6ea03803c712b89/02-fused-softmax.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import torch\n\n\n@torch.jit.script\ndef naive_softmax(x):\n    \"\"\"Compute row-wise softmax of X using native pytorch\n\n    We subtract the maximum element in order to avoid overflows. Softmax is invariant to\n    this shift.\n    \"\"\"\n    # read  MN elements ; write M  elements\n    x_max = x.max(dim=1)[0]\n    # read MN + M elements ; write MN elements\n    z = x - x_max[:, None]\n    # read  MN elements ; write MN elements\n    numerator = torch.exp(z)\n    # read  MN elements ; write M  elements\n    denominator = numerator.sum(dim=1)\n    # read MN + M elements ; write MN elements\n    ret = numerator / denominator[:, None]\n    # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements\n    return ret"
+        "import torch\n\nimport triton\nimport triton.language as tl\n\n\n@torch.jit.script\ndef naive_softmax(x):\n    \"\"\"Compute row-wise softmax of X using native pytorch\n\n    We subtract the maximum element in order to avoid overflows. Softmax is invariant to\n    this shift.\n    \"\"\"\n    # read  MN elements ; write M  elements\n    x_max = x.max(dim=1)[0]\n    # read MN + M elements ; write MN elements\n    z = x - x_max[:, None]\n    # read  MN elements ; write MN elements\n    numerator = torch.exp(z)\n    # read  MN elements ; write M  elements\n    denominator = numerator.sum(dim=1)\n    # read MN + M elements ; write MN elements\n    ret = numerator / denominator[:, None]\n    # in total: read 5MN + 2M elements ; wrote 3MN + 2M elements\n    return ret"
       ]
     },
     {
@@ -58,7 +58,7 @@
       },
       "outputs": [],
       "source": [
-        "import triton\nimport triton.language as tl\n\n\n@triton.jit\ndef softmax_kernel(\n    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta\n):\n    # The rows of the softmax are independent, so we parallelize across those\n    row_idx = tl.program_id(0)\n    BLOCK_SIZE = meta['BLOCK_SIZE']\n    # The stride represents how much we need to increase the pointer to advance 1 row\n    row_start_ptr = input_ptr + row_idx * input_row_stride\n    # The block size is the next power of two greater than n_cols, so we can fit each\n    # row in a single block\n    col_offsets = tl.arange(0, BLOCK_SIZE)\n    input_ptrs = row_start_ptr + col_offsets\n    # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols\n    row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))\n    # Substract maximum for numerical stability\n    row_minus_max = row - tl.max(row, axis=0)\n    # Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)\n    numerator = tl.exp(row_minus_max)\n    denominator = tl.sum(numerator, axis=0)\n    softmax_output = numerator / denominator\n    # Write back output to DRAM\n    output_row_start_ptr = output_ptr + row_idx * output_row_stride\n    output_ptrs = output_row_start_ptr + col_offsets\n    tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols)"
+        "@triton.jit\ndef softmax_kernel(\n    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols,\n    BLOCK_SIZE: tl.constexpr\n):\n    # The rows of the softmax are independent, so we parallelize across those\n    row_idx = tl.program_id(0)\n    # The stride represents how much we need to increase the pointer to advance 1 row\n    row_start_ptr = input_ptr + row_idx * input_row_stride\n    # The block size is the next power of two greater than n_cols, so we can fit each\n    # row in a single block\n    col_offsets = tl.arange(0, BLOCK_SIZE)\n    input_ptrs = row_start_ptr + col_offsets\n    # Load the row into SRAM, using a mask since BLOCK_SIZE may be > than n_cols\n    row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))\n    # Substract maximum for numerical stability\n    row_minus_max = row - tl.max(row, axis=0)\n    # Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)\n    numerator = tl.exp(row_minus_max)\n    denominator = tl.sum(numerator, axis=0)\n    softmax_output = numerator / denominator\n    # Write back output to DRAM\n    output_row_start_ptr = output_ptr + row_idx * output_row_stride\n    output_ptrs = output_row_start_ptr + col_offsets\n    tl.store(output_ptrs, softmax_output, mask=col_offsets < n_cols)"
       ]
     },
     {
@@ -101,7 +101,7 @@
       },
       "outputs": [],
       "source": [
-        "torch.manual_seed(0)\nx = torch.randn(1823, 781, device='cuda')\ny_triton = softmax(x)\ny_torch = torch.softmax(x, axis=1)\nprint(torch.allclose(y_triton, y_torch))"
+        "torch.manual_seed(0)\nx = torch.randn(1823, 781, device='cuda')\ny_triton = softmax(x)\ny_torch = torch.softmax(x, axis=1)\nassert torch.allclose(y_triton, y_torch), (y_triton, y_torch)"
       ]
     },
     {
@@ -133,7 +133,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "In the above plot, we can see that:\n\n - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.\n - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. \n   Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.\n\n"
+        "In the above plot, we can see that:\n\n - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.\n - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.\n   Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.\n\n"
       ]
     }
   ],
diff --git a/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py b/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
index 4446cf6e9..d684106f1 100644
--- a/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
+++ b/_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py
@@ -13,6 +13,7 @@ In this tutorial, you will write a simple vector addition using Triton and learn
 # --------------------------
 
 import torch
+
 import triton
 import triton.language as tl
 
@@ -23,9 +24,9 @@ def add_kernel(
     y_ptr,  # *Pointer* to second input vector
     output_ptr,  # *Pointer* to output vector
     n_elements,  # Size of the vector
-    **meta,  # Optional meta-parameters for the kernel
+    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
+                 # NOTE: `constexpr` so it can be used as a shape value
 ):
-    BLOCK_SIZE = meta['BLOCK_SIZE']  # How many inputs each program should process
     # There are multiple 'program's processing different data. We identify which program
     # we are here
     pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
@@ -37,7 +38,7 @@ def add_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
     # Create a mask to guard memory operations against out-of-bounds accesses
     mask = offsets < n_elements
-    # Load x and y from DRAM, masking out any extar elements in case the input is not a
+    # Load x and y from DRAM, masking out any extra elements in case the input is not a
     # multiple of the block size
     x = tl.load(x_ptr + offsets, mask=mask)
     y = tl.load(y_ptr + offsets, mask=mask)
@@ -64,7 +65,7 @@ def add(x: torch.Tensor, y: torch.Tensor):
     #  - each torch.tensor object is implicitly converted into a pointer to its first element.
     #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
     #  - don't forget to pass meta-parameters as keywords arguments
-    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
     # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
     # running asynchronously at this point.
     return output
diff --git a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip
index 2c92f7742..4017fedf9 100644
Binary files a/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ
diff --git a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip
index bfb39d31d..875850dce 100644
Binary files a/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ
diff --git a/_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py b/_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py
new file mode 100644
index 000000000..1cefc60b9
--- /dev/null
+++ b/_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py
@@ -0,0 +1,261 @@
+"""
+Layer Normalization
+====================
+"""
+
+import torch
+
+import triton
+import triton.language as tl
+
+try:
+    # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it
+    # should not be added to extras_require in setup.py.
+    import apex
+    HAS_APEX = True
+except ModuleNotFoundError:
+    HAS_APEX = False
+
+
+# Forward Pass
+@triton.jit
+def _layer_norm_fwd_fused(X, Y, W, B, M, V, stride, N, eps,
+                          BLOCK_SIZE: tl.constexpr):
+    # position of elements processed by this program
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_SIZE)
+    mask = cols < N
+    # offset data pointers to start at the row of interest
+    X += row * stride
+    Y += row * stride
+    # load data and cast to float32
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    # compute mean
+    mean = tl.sum(x, axis=0) / N
+    # compute std
+    xmean = tl.where(mask, x - mean, 0.)
+    var = tl.sum(xmean * xmean, axis=0) / N
+    rstd = 1 / tl.sqrt(var + eps)
+    xhat = xmean * rstd
+    # write-back mean/rstd
+    tl.store(M + row, mean)
+    tl.store(V + row, rstd)
+    # multiply by weight and add bias
+    w = tl.load(W + cols, mask=mask)
+    b = tl.load(B + cols, mask=mask)
+    y = xhat * w + b
+    # write-back
+    tl.store(Y + cols, y, mask=mask)
+
+
+# Backward pass (DX + partial DW + partial DB)
+@triton.jit
+def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock, stride, N, eps,
+                             GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
+    # position of elements processed by this program
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_SIZE_N)
+    mask = cols < N
+    # offset data pointers to start at the row of interest
+    X += row * stride
+    DY += row * stride
+    DX += row * stride
+    # offset locks and weight/bias gradient pointer
+    # each kernel instance accumulates partial sums for
+    # DW and DB into one of GROUP_SIZE_M independent buffers
+    # these buffers stay in the L2, which allow this kernel
+    # to be fast
+    lock_id = row % GROUP_SIZE_M
+    Lock += lock_id
+    Count = Lock + GROUP_SIZE_M
+    DW = DW + lock_id * N + cols
+    DB = DB + lock_id * N + cols
+    # load data to SRAM
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    mean = tl.load(M + row)
+    rstd = tl.load(V + row)
+    # compute dx
+    xhat = (x - mean) * rstd
+    wdy = w * dy
+    xhat = tl.where(mask, xhat, 0.)
+    wdy = tl.where(mask, wdy, 0.)
+    mean1 = tl.sum(xhat * wdy, axis=0) / N
+    mean2 = tl.sum(wdy, axis=0) / N
+    dx = (wdy - (xhat * mean1 + mean2)) * rstd
+    # write-back dx
+    tl.store(DX + cols, dx, mask=mask)
+    # accumulate partial sums for dw/db
+    partial_dw = (dy * xhat).to(w.dtype)
+    partial_db = (dy).to(w.dtype)
+    while tl.atomic_cas(Lock, 0, 1) == 1:
+        pass
+    count = tl.load(Count)
+    # first store doesn't accumulate
+    if count == 0:
+        tl.atomic_xchg(Count, 1)
+    else:
+        partial_dw += tl.load(DW, mask=mask)
+        partial_db += tl.load(DB, mask=mask)
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+    # release lock
+    tl.atomic_xchg(Lock, 0)
+
+# Backward pass (total DW + total DB)
+
+
+@triton.jit
+def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N,
+                         BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
+    pid = tl.program_id(0)
+    cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+    dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for i in range(0, M, BLOCK_SIZE_M):
+        rows = i + tl.arange(0, BLOCK_SIZE_M)
+        mask = (rows[:, None] < M) & (cols[None, :] < N)
+        offs = rows[:, None] * N + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.)
+        db += tl.load(DB + offs, mask=mask, other=0.)
+    sum_dw = tl.sum(dw, axis=0)
+    sum_db = tl.sum(db, axis=0)
+    tl.store(FINAL_DW + cols, sum_dw, mask=cols < N)
+    tl.store(FINAL_DB + cols, sum_db, mask=cols < N)
+
+
+class LayerNorm(torch.autograd.Function):
+
+    @staticmethod
+    def forward(ctx, x, normalized_shape, weight, bias, eps):
+        # allocate output
+        y = torch.empty_like(x)
+        # reshape input data into 2D tensor
+        x_arg = x.reshape(-1, x.shape[-1])
+        M, N = x_arg.shape
+        mean = torch.empty((M, ), dtype=torch.float32, device='cuda')
+        rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')
+        # Less than 64KB per feature: enqueue fused kernel
+        MAX_FUSED_SIZE = 65536 // x.element_size()
+        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+        if N > BLOCK_SIZE:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+        # heuristics for number of warps
+        num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
+        # enqueue kernel
+        _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,
+                                    x_arg.stride(0), N, eps,
+                                    BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
+        ctx.save_for_backward(x, weight, bias, mean, rstd)
+        ctx.BLOCK_SIZE = BLOCK_SIZE
+        ctx.num_warps = num_warps
+        ctx.eps = eps
+        return y
+
+    @staticmethod
+    def backward(ctx, dy):
+        x, w, b, m, v = ctx.saved_tensors
+        # heuristics for amount of parallel reduction stream for DG/DB
+        N = w.shape[0]
+        GROUP_SIZE_M = 64
+        if N <= 8192: GROUP_SIZE_M = 96
+        if N <= 4096: GROUP_SIZE_M = 128
+        if N <= 1024: GROUP_SIZE_M = 256
+        # allocate output
+        locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda')
+        _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+        _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+        dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+        db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+        dx = torch.empty_like(dy)
+        # enqueue kernel using forward pass heuristics
+        # also compute partial sums for DW and DB
+        x_arg = x.reshape(-1, x.shape[-1])
+        M, N = x_arg.shape
+        _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,
+                                       x_arg.stride(0), N, ctx.eps,
+                                       BLOCK_SIZE_N=ctx.BLOCK_SIZE,
+                                       GROUP_SIZE_M=GROUP_SIZE_M,
+                                       num_warps=ctx.num_warps)
+        grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]
+        # accumulate partial sums in separate kernel
+        _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,
+                                   BLOCK_SIZE_M=32,
+                                   BLOCK_SIZE_N=128)
+        return dx, None, dw, db, None
+
+
+layer_norm = LayerNorm.apply
+
+
+def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
+    # create data
+    x_shape = (M, N)
+    w_shape = (x_shape[-1], )
+    weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+    bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+    dy = .1 * torch.randn_like(x)
+    x.requires_grad_(True)
+    # forward pass
+    y_tri = layer_norm(x, w_shape, weight, bias, eps)
+    y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
+    # backward pass (triton)
+    y_tri.backward(dy, retain_graph=True)
+    dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]
+    x.grad, weight.grad, bias.grad = None, None, None
+    # backward pass (torch)
+    y_ref.backward(dy, retain_graph=True)
+    dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
+    # compare
+    triton.testing.assert_almost_equal(y_tri, y_ref)
+    triton.testing.assert_almost_equal(dx_tri, dx_ref)
+    triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)
+    triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=['N'],
+        x_vals=[512 * i for i in range(2, 32)],
+        line_arg='provider',
+        line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []),
+        line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []),
+        styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
+        ylabel='GB/s',
+        plot_name='layer-norm-backward',
+        args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}
+    )
+)
+def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):
+    # create data
+    x_shape = (M, N)
+    w_shape = (x_shape[-1], )
+    weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+    bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+    dy = .1 * torch.randn_like(x)
+    x.requires_grad_(True)
+    # utility functions
+    if provider == 'triton':
+        y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)
+    if provider == 'torch':
+        y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)
+    if provider == 'apex':
+        apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)
+        y_fwd = lambda: apex_layer_norm(x)
+    # forward pass
+    if mode == 'forward':
+        gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6
+        ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)
+    # backward pass
+    if mode == 'backward':
+        gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6
+        y = y_fwd()
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),
+                                                     grad_to_none=[x], rep=500)
+    return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+bench_layer_norm.run(save_path='.', print_data=True)
diff --git a/_downloads/ae7fff29e1b574187bc930ed94bcc353/05-layer-norm.ipynb b/_downloads/ae7fff29e1b574187bc930ed94bcc353/05-layer-norm.ipynb
new file mode 100644
index 000000000..4d381dafc
--- /dev/null
+++ b/_downloads/ae7fff29e1b574187bc930ed94bcc353/05-layer-norm.ipynb
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Layer Normalization\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "import torch\n\nimport triton\nimport triton.language as tl\n\ntry:\n    # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it\n    # should not be added to extras_require in setup.py.\n    import apex\n    HAS_APEX = True\nexcept ModuleNotFoundError:\n    HAS_APEX = False\n\n\n# Forward Pass\n@triton.jit\ndef _layer_norm_fwd_fused(X, Y, W, B, M, V, stride, N, eps,\n                          BLOCK_SIZE: tl.constexpr):\n    # position of elements processed by this program\n    row = tl.program_id(0)\n    cols = tl.arange(0, BLOCK_SIZE)\n    mask = cols < N\n    # offset data pointers to start at the row of interest\n    X += row * stride\n    Y += row * stride\n    # load data and cast to float32\n    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)\n    # compute mean\n    mean = tl.sum(x, axis=0) / N\n    # compute std\n    xmean = tl.where(mask, x - mean, 0.)\n    var = tl.sum(xmean * xmean, axis=0) / N\n    rstd = 1 / tl.sqrt(var + eps)\n    xhat = xmean * rstd\n    # write-back mean/rstd\n    tl.store(M + row, mean)\n    tl.store(V + row, rstd)\n    # multiply by weight and add bias\n    w = tl.load(W + cols, mask=mask)\n    b = tl.load(B + cols, mask=mask)\n    y = xhat * w + b\n    # write-back\n    tl.store(Y + cols, y, mask=mask)\n\n\n# Backward pass (DX + partial DW + partial DB)\n@triton.jit\ndef _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock, stride, N, eps,\n                             GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):\n    # position of elements processed by this program\n    row = tl.program_id(0)\n    cols = tl.arange(0, BLOCK_SIZE_N)\n    mask = cols < N\n    # offset data pointers to start at the row of interest\n    X += row * stride\n    DY += row * stride\n    DX += row * stride\n    # offset locks and weight/bias gradient pointer\n    # each kernel instance accumulates partial sums for\n    # DW and DB into one of GROUP_SIZE_M independent buffers\n    # these buffers stay in the L2, which allow this kernel\n    # to be fast\n    lock_id = row % GROUP_SIZE_M\n    Lock += lock_id\n    Count = Lock + GROUP_SIZE_M\n    DW = DW + lock_id * N + cols\n    DB = DB + lock_id * N + cols\n    # load data to SRAM\n    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)\n    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)\n    w = tl.load(W + cols, mask=mask).to(tl.float32)\n    mean = tl.load(M + row)\n    rstd = tl.load(V + row)\n    # compute dx\n    xhat = (x - mean) * rstd\n    wdy = w * dy\n    xhat = tl.where(mask, xhat, 0.)\n    wdy = tl.where(mask, wdy, 0.)\n    mean1 = tl.sum(xhat * wdy, axis=0) / N\n    mean2 = tl.sum(wdy, axis=0) / N\n    dx = (wdy - (xhat * mean1 + mean2)) * rstd\n    # write-back dx\n    tl.store(DX + cols, dx, mask=mask)\n    # accumulate partial sums for dw/db\n    partial_dw = (dy * xhat).to(w.dtype)\n    partial_db = (dy).to(w.dtype)\n    while tl.atomic_cas(Lock, 0, 1) == 1:\n        pass\n    count = tl.load(Count)\n    # first store doesn't accumulate\n    if count == 0:\n        tl.atomic_xchg(Count, 1)\n    else:\n        partial_dw += tl.load(DW, mask=mask)\n        partial_db += tl.load(DB, mask=mask)\n    tl.store(DW, partial_dw, mask=mask)\n    tl.store(DB, partial_db, mask=mask)\n    # release lock\n    tl.atomic_xchg(Lock, 0)\n\n# Backward pass (total DW + total DB)\n\n\n@triton.jit\ndef _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N,\n                         BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):\n    pid = tl.program_id(0)\n    cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n    dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n    db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n    for i in range(0, M, BLOCK_SIZE_M):\n        rows = i + tl.arange(0, BLOCK_SIZE_M)\n        mask = (rows[:, None] < M) & (cols[None, :] < N)\n        offs = rows[:, None] * N + cols[None, :]\n        dw += tl.load(DW + offs, mask=mask, other=0.)\n        db += tl.load(DB + offs, mask=mask, other=0.)\n    sum_dw = tl.sum(dw, axis=0)\n    sum_db = tl.sum(db, axis=0)\n    tl.store(FINAL_DW + cols, sum_dw, mask=cols < N)\n    tl.store(FINAL_DB + cols, sum_db, mask=cols < N)\n\n\nclass LayerNorm(torch.autograd.Function):\n\n    @staticmethod\n    def forward(ctx, x, normalized_shape, weight, bias, eps):\n        # allocate output\n        y = torch.empty_like(x)\n        # reshape input data into 2D tensor\n        x_arg = x.reshape(-1, x.shape[-1])\n        M, N = x_arg.shape\n        mean = torch.empty((M, ), dtype=torch.float32, device='cuda')\n        rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')\n        # Less than 64KB per feature: enqueue fused kernel\n        MAX_FUSED_SIZE = 65536 // x.element_size()\n        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))\n        if N > BLOCK_SIZE:\n            raise RuntimeError(\"This layer norm doesn't support feature dim >= 64KB.\")\n        # heuristics for number of warps\n        num_warps = min(max(BLOCK_SIZE // 256, 1), 8)\n        # enqueue kernel\n        _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,\n                                    x_arg.stride(0), N, eps,\n                                    BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)\n        ctx.save_for_backward(x, weight, bias, mean, rstd)\n        ctx.BLOCK_SIZE = BLOCK_SIZE\n        ctx.num_warps = num_warps\n        ctx.eps = eps\n        return y\n\n    @staticmethod\n    def backward(ctx, dy):\n        x, w, b, m, v = ctx.saved_tensors\n        # heuristics for amount of parallel reduction stream for DG/DB\n        N = w.shape[0]\n        GROUP_SIZE_M = 64\n        if N <= 8192: GROUP_SIZE_M = 96\n        if N <= 4096: GROUP_SIZE_M = 128\n        if N <= 1024: GROUP_SIZE_M = 256\n        # allocate output\n        locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda')\n        _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)\n        _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)\n        dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)\n        db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)\n        dx = torch.empty_like(dy)\n        # enqueue kernel using forward pass heuristics\n        # also compute partial sums for DW and DB\n        x_arg = x.reshape(-1, x.shape[-1])\n        M, N = x_arg.shape\n        _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,\n                                       x_arg.stride(0), N, ctx.eps,\n                                       BLOCK_SIZE_N=ctx.BLOCK_SIZE,\n                                       GROUP_SIZE_M=GROUP_SIZE_M,\n                                       num_warps=ctx.num_warps)\n        grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]\n        # accumulate partial sums in separate kernel\n        _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,\n                                   BLOCK_SIZE_M=32,\n                                   BLOCK_SIZE_N=128)\n        return dx, None, dw, db, None\n\n\nlayer_norm = LayerNorm.apply\n\n\ndef test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):\n    # create data\n    x_shape = (M, N)\n    w_shape = (x_shape[-1], )\n    weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)\n    bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)\n    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')\n    dy = .1 * torch.randn_like(x)\n    x.requires_grad_(True)\n    # forward pass\n    y_tri = layer_norm(x, w_shape, weight, bias, eps)\n    y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)\n    # backward pass (triton)\n    y_tri.backward(dy, retain_graph=True)\n    dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]\n    x.grad, weight.grad, bias.grad = None, None, None\n    # backward pass (torch)\n    y_ref.backward(dy, retain_graph=True)\n    dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]\n    # compare\n    triton.testing.assert_almost_equal(y_tri, y_ref)\n    triton.testing.assert_almost_equal(dx_tri, dx_ref)\n    triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)\n    triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)\n\n\n@triton.testing.perf_report(\n    triton.testing.Benchmark(\n        x_names=['N'],\n        x_vals=[512 * i for i in range(2, 32)],\n        line_arg='provider',\n        line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []),\n        line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []),\n        styles=[('blue', '-'), ('green', '-'), ('orange', '-')],\n        ylabel='GB/s',\n        plot_name='layer-norm-backward',\n        args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}\n    )\n)\ndef bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):\n    # create data\n    x_shape = (M, N)\n    w_shape = (x_shape[-1], )\n    weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)\n    bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)\n    x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')\n    dy = .1 * torch.randn_like(x)\n    x.requires_grad_(True)\n    # utility functions\n    if provider == 'triton':\n        y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)\n    if provider == 'torch':\n        y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)\n    if provider == 'apex':\n        apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)\n        y_fwd = lambda: apex_layer_norm(x)\n    # forward pass\n    if mode == 'forward':\n        gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6\n        ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)\n    # backward pass\n    if mode == 'backward':\n        gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6\n        y = y_fwd()\n        ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),\n                                                     grad_to_none=[x], rep=500)\n    return gbps(ms), gbps(max_ms), gbps(min_ms)\n\n\nbench_layer_norm.run(save_path='.', print_data=True)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.8.10"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
\ No newline at end of file
diff --git a/_downloads/b51b68bc1c6b1a5e509f67800b6235af/03-matrix-multiplication.ipynb b/_downloads/b51b68bc1c6b1a5e509f67800b6235af/03-matrix-multiplication.ipynb
index c6688e1e6..f177fae9b 100644
--- a/_downloads/b51b68bc1c6b1a5e509f67800b6235af/03-matrix-multiplication.ipynb
+++ b/_downloads/b51b68bc1c6b1a5e509f67800b6235af/03-matrix-multiplication.ipynb
@@ -29,7 +29,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Compute Kernel\n\nThe above algorithm is, actually, fairly straightforward to implement in Triton.\nThe main difficulty comes from the computation of the memory locations at which blocks\nof :code:`A` and :code:`B` must be read in the inner loop. For that, we need\nmulti-dimensional pointer arithmetics.\n\n### Pointer Arithmetics\n\nFor a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given b\ny :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`.\nTherefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and\n:code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as:\n\n .. code-block:: python\n\n   &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] =  a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1);\n   &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] =  b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1);\n\nWhich means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as:\n\n .. code-block:: python\n\n   offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n   offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n   offs_k = tl.arange(0, BLOCK_SIZE_K)\n   a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)\n   b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)\n\nAnd then updated in the inner loop as follows:\n\n .. code-block:: python\n\n   pa += BLOCK_SIZE_K * stride_ak;\n   pb += BLOCK_SIZE_K * stride_bk;\n\n\n### L2 Cache Optimizations\n\nAs mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]`\nblock of :code:`C`.\nIt is important to remember that the order in which these blocks are computed does\nmatter, since it affects the L2 cache hit rate of our program. and unfortunately, a\na simple row-major ordering\n\n .. code-block:: Python\n\n   pid = triton.program_id(0);\n   grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M;\n   grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N;\n   pid_m = pid / grid_n;\n   pid_n = pid % grid_n;\n\nis just not going to cut it.\n\nOne possible solution is to launch blocks in an order that promotes data reuse.\nThis can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before\nswitching to the next column:\n\n .. code-block:: python\n\n   # program ID\n   pid = tl.program_id(axis=0)\n   # number of program ids along the M axis\n   num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n   # number of programs ids along the N axis\n   num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n   # number of programs in group\n   num_pid_in_group = GROUP_SIZE_M * num_pid_n \n   # id of the group this program is in\n   group_id = pid // num_pid_in_group \n   # row-id of the first program in the group\n   first_pid_m = group_id * GROUP_SIZE_M \n   # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller\n   group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) \n   # *within groups*, programs are ordered in a column-major order\n   # row-id of the program in the *launch grid*\n   pid_m = first_pid_m + (pid % group_size_m)\n   # col-id of the program in the *launch grid*\n   pid_n = (pid % num_pid_in_group) // group_size_m\n\nFor example, in the following matmul where each matrix is 9 blocks by 9 blocks,\nwe can see that if we compute the output in row-major ordering, we need to load 90\nblocks into SRAM to compute the first 9 output blocks, but if we do it in grouped\nordering, we only need to load 54 blocks.\n  .. image:: grouped_vs_row_major_ordering.png\n\nIn practice, this can improve the performance of our matrix multiplication kernel by\nmore than 10\\% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).\n\n\n"
+        "## Compute Kernel\n\nThe above algorithm is, actually, fairly straightforward to implement in Triton.\nThe main difficulty comes from the computation of the memory locations at which blocks\nof :code:`A` and :code:`B` must be read in the inner loop. For that, we need\nmulti-dimensional pointer arithmetics.\n\n### Pointer Arithmetics\n\nFor a row-major 2D tensor :code:`X`, the memory location of :code:`X[i, j]` is given b\ny :code:`&X[i, j] = X + i*stride_xi + j*stride_xj`.\nTherefore, blocks of pointers for :code:`A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K]` and\n:code:`B[k : k+BLOCK_SIZE_K, n : n+BLOCK_SIZE_N]` can be defined in pseudo-code as:\n\n .. code-block:: python\n\n   &A[m : m+BLOCK_SIZE_M, k:k+BLOCK_SIZE_K] =  a_ptr + (m : m+BLOCK_SIZE_M)[:, None]*A.stride(0) + (k : k+BLOCK_SIZE_K)[None, :]*A.stride(1);\n   &B[k : k+BLOCK_SIZE_K, n:n+BLOCK_SIZE_N] =  b_ptr + (k : k+BLOCK_SIZE_K)[:, None]*B.stride(0) + (n : n+BLOCK_SIZE_N)[None, :]*B.stride(1);\n\nWhich means that pointers for blocks of A and B can be initialized (i.e., :code:`k=0`) in Triton as:\n\n .. code-block:: python\n\n   offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n   offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n   offs_k = tl.arange(0, BLOCK_SIZE_K)\n   a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)\n   b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)\n\nAnd then updated in the inner loop as follows:\n\n .. code-block:: python\n\n   pa += BLOCK_SIZE_K * stride_ak;\n   pb += BLOCK_SIZE_K * stride_bk;\n\n\n### L2 Cache Optimizations\n\nAs mentioned above, each program instance computes a :code:`[BLOCK_SIZE_M, BLOCK_SIZE_N]`\nblock of :code:`C`.\nIt is important to remember that the order in which these blocks are computed does\nmatter, since it affects the L2 cache hit rate of our program. and unfortunately, a\na simple row-major ordering\n\n .. code-block:: Python\n\n   pid = triton.program_id(0);\n   grid_m = (M + BLOCK_SIZE_M - 1) // BLOCK_SIZE_M;\n   grid_n = (N + BLOCK_SIZE_N - 1) // BLOCK_SIZE_N;\n   pid_m = pid / grid_n;\n   pid_n = pid % grid_n;\n\nis just not going to cut it.\n\nOne possible solution is to launch blocks in an order that promotes data reuse.\nThis can be done by 'super-grouping' blocks in groups of :code:`GROUP_M` rows before\nswitching to the next column:\n\n .. code-block:: python\n\n   # program ID\n   pid = tl.program_id(axis=0)\n   # number of program ids along the M axis\n   num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n   # number of programs ids along the N axis\n   num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n   # number of programs in group\n   num_pid_in_group = GROUP_SIZE_M * num_pid_n\n   # id of the group this program is in\n   group_id = pid // num_pid_in_group\n   # row-id of the first program in the group\n   first_pid_m = group_id * GROUP_SIZE_M\n   # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller\n   group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)\n   # *within groups*, programs are ordered in a column-major order\n   # row-id of the program in the *launch grid*\n   pid_m = first_pid_m + (pid % group_size_m)\n   # col-id of the program in the *launch grid*\n   pid_n = (pid % num_pid_in_group) // group_size_m\n\nFor example, in the following matmul where each matrix is 9 blocks by 9 blocks,\nwe can see that if we compute the output in row-major ordering, we need to load 90\nblocks into SRAM to compute the first 9 output blocks, but if we do it in grouped\nordering, we only need to load 54 blocks.\n  .. image:: grouped_vs_row_major_ordering.png\n\nIn practice, this can improve the performance of our matrix multiplication kernel by\nmore than 10\\% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).\n\n\n"
       ]
     },
     {
@@ -47,7 +47,7 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nimport triton\nimport triton.language as tl\n\n# %\n# :code:`triton.jit`'ed functions can be auto-tuned by using the `triton.autotune`\n# decorator, which consumes:\n#   - A list of :code:`triton.Config` objects that define different configurations of\n#       meta-parameters (e.g., BLOCK_SIZE_M) and compilation options (e.g., num_warps) to try\n#   - An autotuning *key* whose change in values will trigger evaluation of all the\n#       provided configs\n\n@triton.autotune(\n    configs=[\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),\n        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),\n        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),\n        triton.Config({'BLOCK_SIZE_M': 32 , 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),\n    ],\n    key=['M', 'N', 'K'],\n)\n# %\n# We can now define our kernel as normal, using all the techniques presented above\n@triton.jit\ndef matmul_kernel(\n    # Pointers to matrices\n    a_ptr, b_ptr, c_ptr,\n    # Matrix dimensions\n    M, N, K,\n    # The stride variables represent how much to increase the ptr by when moving by 1\n    # element in a particular dimension. E.g. stride_am is how much to increase a_ptr\n    # by to get the element one row down (A has M rows)\n    stride_am, stride_ak,\n    stride_bk, stride_bn,\n    stride_cm, stride_cn,\n    # Meta-parameters\n    **meta,\n):\n    \"\"\"Kernel for computing the matmul C = A x B.\n    A has shape (M, K), B has shape (K, N) and C has shape (M, N)\n    \"\"\"\n    # extract meta-parameters\n    BLOCK_SIZE_M = meta['BLOCK_SIZE_M']\n    BLOCK_SIZE_N = meta['BLOCK_SIZE_N']\n    BLOCK_SIZE_K = meta['BLOCK_SIZE_K']\n    GROUP_SIZE_M = 8\n\n    # -----------------------------------------------------------\n    # Map program ids `pid` to the block of C it should compute.\n    # This is done in a grouped ordering to promote L2 data reuse\n    # See above `L2 Cache Optimizations` section for details\n    pid = tl.program_id(axis=0)\n    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n    num_pid_in_group = GROUP_SIZE_M * num_pid_n \n    group_id = pid // num_pid_in_group \n    first_pid_m = group_id * GROUP_SIZE_M \n    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) \n    pid_m = first_pid_m + (pid % group_size_m)\n    pid_n = (pid % num_pid_in_group) // group_size_m\n\n    # ----------------------------------------------------------\n    # Create pointers for the first blocks of A and B.\n    # We will advance this pointer as we move in the K direction \n    # and accumulate\n    # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers\n    # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers\n    # see above `Pointer Arithmetics` section for details\n    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n    offs_k = tl.arange(0, BLOCK_SIZE_K)\n    a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)\n    b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)\n\n    # -----------------------------------------------------------\n    # Iterate to compute a block of the C matrix\n    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block\n    # of fp32 values for higher accuracy.\n    # `accumulator` will be converted back to fp16 after the loop\n    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n    for k in range(0, K, BLOCK_SIZE_K):\n        # Note that for simplicity, we don't apply a mask here. \n        # This means that if K is not a multiple of BLOCK_SIZE_K, \n        # this will access out-of-bounds memory and produce an\n        # error or (worse!) incorrect results.\n        a = tl.load(a_ptrs)\n        b = tl.load(b_ptrs)\n        # We accumulate along the K dimension\n        accumulator += tl.dot(a, b)\n        # Advance the ptrs to the next K block\n        a_ptrs += BLOCK_SIZE_K * stride_ak\n        b_ptrs += BLOCK_SIZE_K * stride_bk\n    # you can fuse arbitrary activation functions here\n    # while the accumulator is still in FP32 !\n    if meta['ACTIVATION']: \n        accumulator = meta['ACTIVATION'](accumulator)\n    c = accumulator.to(tl.float16)\n\n    # -----------------------------------------------------------\n    # Write back the block of the output matrix C\n    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]\n    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)\n    tl.store(c_ptrs, c, mask=c_mask)\n\n\n# we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`\n@triton.jit\ndef leaky_relu(x):\n    return tl.where(x >= 0, x, 0.01 * x)"
+        "import torch\n\nimport triton\nimport triton.language as tl\n\n# %\n# :code:`triton.jit`'ed functions can be auto-tuned by using the `triton.autotune`\n# decorator, which consumes:\n#   - A list of :code:`triton.Config` objects that define different configurations of\n#       meta-parameters (e.g., BLOCK_SIZE_M) and compilation options (e.g., num_warps) to try\n#   - An autotuning *key* whose change in values will trigger evaluation of all the\n#       provided configs\n\n\n@triton.autotune(\n    configs=[\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),\n        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),\n        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),\n        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),\n        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),\n    ],\n    key=['M', 'N', 'K'],\n)\n@triton.jit\ndef matmul_kernel(\n    # Pointers to matrices\n    a_ptr, b_ptr, c_ptr,\n    # Matrix dimensions\n    M, N, K,\n    # The stride variables represent how much to increase the ptr by when moving by 1\n    # element in a particular dimension. E.g. stride_am is how much to increase a_ptr\n    # by to get the element one row down (A has M rows)\n    stride_am, stride_ak,\n    stride_bk, stride_bn,\n    stride_cm, stride_cn,\n    # Meta-parameters\n    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,\n    GROUP_SIZE_M: tl.constexpr,\n    ACTIVATION: tl.constexpr,\n):\n    \"\"\"Kernel for computing the matmul C = A x B.\n    A has shape (M, K), B has shape (K, N) and C has shape (M, N)\n    \"\"\"\n    # -----------------------------------------------------------\n    # Map program ids `pid` to the block of C it should compute.\n    # This is done in a grouped ordering to promote L2 data reuse\n    # See above `L2 Cache Optimizations` section for details\n    pid = tl.program_id(axis=0)\n    num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)\n    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)\n    num_pid_in_group = GROUP_SIZE_M * num_pid_n\n    group_id = pid // num_pid_in_group\n    first_pid_m = group_id * GROUP_SIZE_M\n    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)\n    pid_m = first_pid_m + (pid % group_size_m)\n    pid_n = (pid % num_pid_in_group) // group_size_m\n\n    # ----------------------------------------------------------\n    # Create pointers for the first blocks of A and B.\n    # We will advance this pointer as we move in the K direction\n    # and accumulate\n    # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers\n    # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers\n    # see above `Pointer Arithmetics` section for details\n    offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n    offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n    offs_k = tl.arange(0, BLOCK_SIZE_K)\n    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)\n    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)\n\n    # -----------------------------------------------------------\n    # Iterate to compute a block of the C matrix\n    # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block\n    # of fp32 values for higher accuracy.\n    # `accumulator` will be converted back to fp16 after the loop\n    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)\n    for k in range(0, K, BLOCK_SIZE_K):\n        # Note that for simplicity, we don't apply a mask here.\n        # This means that if K is not a multiple of BLOCK_SIZE_K,\n        # this will access out-of-bounds memory and produce an\n        # error or (worse!) incorrect results.\n        a = tl.load(a_ptrs)\n        b = tl.load(b_ptrs)\n        # We accumulate along the K dimension\n        accumulator += tl.dot(a, b)\n        # Advance the ptrs to the next K block\n        a_ptrs += BLOCK_SIZE_K * stride_ak\n        b_ptrs += BLOCK_SIZE_K * stride_bk\n    # you can fuse arbitrary activation functions here\n    # while the accumulator is still in FP32!\n    if ACTIVATION:\n        accumulator = ACTIVATION(accumulator)\n    c = accumulator.to(tl.float16)\n\n    # -----------------------------------------------------------\n    # Write back the block of the output matrix C\n    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)\n    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)\n    c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]\n    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)\n    tl.store(c_ptrs, c, mask=c_mask)\n\n\n# we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`\n@triton.jit\ndef leaky_relu(x):\n    return tl.where(x >= 0, x, 0.01 * x)"
       ]
     },
     {
diff --git a/_downloads/bc847dec325798bdc436c4ef5ac8b78a/04-low-memory-dropout.ipynb b/_downloads/bc847dec325798bdc436c4ef5ac8b78a/04-low-memory-dropout.ipynb
index 546b0e795..5a6b1d8fa 100644
--- a/_downloads/bc847dec325798bdc436c4ef5ac8b78a/04-low-memory-dropout.ipynb
+++ b/_downloads/bc847dec325798bdc436c4ef5ac8b78a/04-low-memory-dropout.ipynb
@@ -22,7 +22,7 @@
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Baseline\nThe *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance \nof deep neural networks in low-data regime (i.e. regularization).\n\nIt takes a vector as input and produces a vector of the same shape as output. Each scalar in the\noutput has a probability $p$ of being changed to zero and otherwise it is copied from the input.\nThis forces the network to perform well even when only $1 - p$ scalars from the input are available.\n\nAt evaluation time we want to use the full power of the network so we set $p=0$. Naively this would\nincrease the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease\nin the output softmax temperature). To prevent this we multiply the output by $\\frac{1}{1 - p}$, which\nkeeps the norm consistent regardless of the dropout probability.\n\nLet's first take a look at the baseline implementation.\n\n"
+        "## Baseline\nThe *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance\nof deep neural networks in low-data regime (i.e. regularization).\n\nIt takes a vector as input and produces a vector of the same shape as output. Each scalar in the\noutput has a probability $p$ of being changed to zero and otherwise it is copied from the input.\nThis forces the network to perform well even when only $1 - p$ scalars from the input are available.\n\nAt evaluation time we want to use the full power of the network so we set $p=0$. Naively this would\nincrease the norm of the output (which can be a bad thing, e.g. it can lead to artificial decrease\nin the output softmax temperature). To prevent this we multiply the output by $\\frac{1}{1 - p}$, which\nkeeps the norm consistent regardless of the dropout probability.\n\nLet's first take a look at the baseline implementation.\n\n"
       ]
     },
     {
@@ -33,14 +33,14 @@
       },
       "outputs": [],
       "source": [
-        "import tabulate\nimport torch\nimport triton\nimport triton.language as tl\n\n@triton.jit\ndef _dropout(\n        x_ptr, # pointer to the input\n        x_keep_ptr, # pointer to a mask of 0s and 1s\n        output_ptr, # pointer to the output\n        n_elements, # number of elements in the `x` tensor\n        p, # probability that an element of `x` is changed to zero\n        **meta,\n):\n    BLOCK_SIZE = meta['BLOCK_SIZE']\n    pid = tl.program_id(axis=0)\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    mask = offsets < n_elements\n    # Load data\n    x = tl.load(x_ptr + offsets, mask=mask)\n    x_keep = tl.load(x_keep_ptr + offsets, mask=mask)\n    # The line below is the crucial part, described in the paragraph above!\n    output = tl.where(x_keep, x / (1 - p), 0.0)\n    # Write-back output\n    tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef dropout(x, x_keep, p):\n    output = torch.empty_like(x)\n    assert x.is_contiguous()\n    n_elements = x.numel()\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)\n    return output\n\n# Input tensor\nx = torch.randn(size=(10,)).cuda()\n# Dropout mask\np = 0.5\nx_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda()\n#\noutput = dropout(x, x_keep=x_keep, p=p)\nprint(tabulate.tabulate([\n    [\"input\"] + x.tolist(),\n    [\"keep mask\"] + x_keep.tolist(),\n    [\"output\"] + output.tolist()\n]))"
+        "import tabulate\nimport torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef _dropout(\n        x_ptr,  # pointer to the input\n        x_keep_ptr,  # pointer to a mask of 0s and 1s\n        output_ptr,  # pointer to the output\n        n_elements,  # number of elements in the `x` tensor\n        p,  # probability that an element of `x` is changed to zero\n        BLOCK_SIZE: tl.constexpr,\n):\n    pid = tl.program_id(axis=0)\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    mask = offsets < n_elements\n    # Load data\n    x = tl.load(x_ptr + offsets, mask=mask)\n    x_keep = tl.load(x_keep_ptr + offsets, mask=mask)\n    # The line below is the crucial part, described in the paragraph above!\n    output = tl.where(x_keep, x / (1 - p), 0.0)\n    # Write-back output\n    tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef dropout(x, x_keep, p):\n    output = torch.empty_like(x)\n    assert x.is_contiguous()\n    n_elements = x.numel()\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)\n    return output\n\n\n# Input tensor\nx = torch.randn(size=(10,)).cuda()\n# Dropout mask\np = 0.5\nx_keep = (torch.rand(size=(10,)) > p).to(torch.int32).cuda()\n#\noutput = dropout(x, x_keep=x_keep, p=p)\nprint(tabulate.tabulate([\n    [\"input\"] + x.tolist(),\n    [\"keep mask\"] + x_keep.tolist(),\n    [\"output\"] + output.tolist()\n]))"
       ]
     },
     {
       "cell_type": "markdown",
       "metadata": {},
       "source": [
-        "## Seeded dropout\nAbove implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly\nwe need to store the dropout mask for backpropagation. Secondly, dropout state management can get\nvery tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in\nhttps://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation\nthat (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management\nof persisting randomness across multiple invocations of the kernel.\n\nPseudorandom number generation in Triton is simple! In this tutorial we will use the\n:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` \nvalues in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides\nother `random number generation strategies <Random Number Generation>`.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>Triton's implementation of PRNG is based on the Philox algorithm (described on [SALMON2011]_).</p></div>\n\nLet's put it all together.\n\n"
+        "## Seeded dropout\nAbove implementation of dropout works fine, but it can be a bit awkward to deal with. Firstly\nwe need to store the dropout mask for backpropagation. Secondly, dropout state management can get\nvery tricky when using recompute/checkpointing (e.g. see all the notes about `preserve_rng_state` in\nhttps://pytorch.org/docs/1.9.0/checkpoint.html). In this tutorial we'll describe an alternative implementation\nthat (1) has a smaller memory footprint; (2) requires less data movement; and (3) simplifies the management\nof persisting randomness across multiple invocations of the kernel.\n\nPseudorandom number generation in Triton is simple! In this tutorial we will use the\n:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32`\nvalues in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides\nother `random number generation strategies <Random Number Generation>`.\n\n<div class=\"alert alert-info\"><h4>Note</h4><p>Triton's implementation of PRNG is based on the Philox algorithm (described on [SALMON2011]_).</p></div>\n\nLet's put it all together.\n\n"
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "@triton.jit\ndef _seeded_dropout(\n        x_ptr,\n        output_ptr,\n        n_elements,\n        p,\n        seed,\n        **meta,\n):\n    # compute memory offsets of elements handled by this instance\n    BLOCK_SIZE = meta['BLOCK_SIZE']\n    pid = tl.program_id(axis=0)\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    # load data from x\n    mask = offsets < n_elements\n    x = tl.load(x_ptr + offsets, mask=mask)\n    # randomly prune it\n    random = tl.rand(seed, offsets)\n    x_keep = random > p\n    # write-back\n    output = tl.where(x_keep, x / (1 - p), 0.0)\n    tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef seeded_dropout(x, p, seed):\n    output = torch.empty_like(x)\n    assert x.is_contiguous()\n    n_elements = x.numel()\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024)\n    return output\n\n\nx = torch.randn(size=(10,)).cuda()\n# Compare this to the baseline - dropout mask is never instantiated!\noutput = seeded_dropout(x, p=0.5, seed=123)\noutput2 = seeded_dropout(x, p=0.5, seed=123)\noutput3 = seeded_dropout(x, p=0.5, seed=512)\n\nprint(tabulate.tabulate([\n    [\"input\"] + x.tolist(),\n    [\"output (seed = 123)\"] + output.tolist(),\n    [\"output (seed = 123)\"] + output2.tolist(),\n    [\"output (seed = 512)\"] + output3.tolist()\n]))"
+        "@triton.jit\ndef _seeded_dropout(\n        x_ptr,\n        output_ptr,\n        n_elements,\n        p,\n        seed,\n        BLOCK_SIZE: tl.constexpr,\n):\n    # compute memory offsets of elements handled by this instance\n    pid = tl.program_id(axis=0)\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    # load data from x\n    mask = offsets < n_elements\n    x = tl.load(x_ptr + offsets, mask=mask)\n    # randomly prune it\n    random = tl.rand(seed, offsets)\n    x_keep = random > p\n    # write-back\n    output = tl.where(x_keep, x / (1 - p), 0.0)\n    tl.store(output_ptr + offsets, output, mask=mask)\n\n\ndef seeded_dropout(x, p, seed):\n    output = torch.empty_like(x)\n    assert x.is_contiguous()\n    n_elements = x.numel()\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    _seeded_dropout[grid](x, output, n_elements, p, seed, BLOCK_SIZE=1024)\n    return output\n\n\nx = torch.randn(size=(10,)).cuda()\n# Compare this to the baseline - dropout mask is never instantiated!\noutput = seeded_dropout(x, p=0.5, seed=123)\noutput2 = seeded_dropout(x, p=0.5, seed=123)\noutput3 = seeded_dropout(x, p=0.5, seed=512)\n\nprint(tabulate.tabulate([\n    [\"input\"] + x.tolist(),\n    [\"output (seed = 123)\"] + output.tolist(),\n    [\"output (seed = 123)\"] + output2.tolist(),\n    [\"output (seed = 512)\"] + output3.tolist()\n]))"
       ]
     },
     {
diff --git a/_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py b/_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py
index d988746a7..cf172537a 100644
--- a/_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py
+++ b/_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py
@@ -13,7 +13,7 @@ whose state is generally composed of a bit mask tensor of the same shape as the
 # %%
 # Baseline
 # -------------
-# The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance 
+# The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance
 # of deep neural networks in low-data regime (i.e. regularization).
 #
 # It takes a vector as input and produces a vector of the same shape as output. Each scalar in the
@@ -30,19 +30,20 @@ whose state is generally composed of a bit mask tensor of the same shape as the
 
 import tabulate
 import torch
+
 import triton
 import triton.language as tl
 
+
 @triton.jit
 def _dropout(
-        x_ptr, # pointer to the input
-        x_keep_ptr, # pointer to a mask of 0s and 1s
-        output_ptr, # pointer to the output
-        n_elements, # number of elements in the `x` tensor
-        p, # probability that an element of `x` is changed to zero
-        **meta,
+        x_ptr,  # pointer to the input
+        x_keep_ptr,  # pointer to a mask of 0s and 1s
+        output_ptr,  # pointer to the output
+        n_elements,  # number of elements in the `x` tensor
+        p,  # probability that an element of `x` is changed to zero
+        BLOCK_SIZE: tl.constexpr,
 ):
-    BLOCK_SIZE = meta['BLOCK_SIZE']
     pid = tl.program_id(axis=0)
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
@@ -64,6 +65,7 @@ def dropout(x, x_keep, p):
     _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)
     return output
 
+
 # Input tensor
 x = torch.randn(size=(10,)).cuda()
 # Dropout mask
@@ -88,7 +90,7 @@ print(tabulate.tabulate([
 # of persisting randomness across multiple invocations of the kernel.
 #
 # Pseudorandom number generation in Triton is simple! In this tutorial we will use the
-# :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` 
+# :code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32`
 # values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides
 # other :ref:`random number generation strategies <Random Number Generation>`.
 #
@@ -97,6 +99,7 @@ print(tabulate.tabulate([
 #
 # Let's put it all together.
 
+
 @triton.jit
 def _seeded_dropout(
         x_ptr,
@@ -104,10 +107,9 @@ def _seeded_dropout(
         n_elements,
         p,
         seed,
-        **meta,
+        BLOCK_SIZE: tl.constexpr,
 ):
     # compute memory offsets of elements handled by this instance
-    BLOCK_SIZE = meta['BLOCK_SIZE']
     pid = tl.program_id(axis=0)
     block_start = pid * BLOCK_SIZE
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
diff --git a/_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py b/_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py
index 1d9fea638..f773a3787 100644
--- a/_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py
+++ b/_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py
@@ -112,13 +112,13 @@ You will specifically learn about:
 #    # number of programs ids along the N axis
 #    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
 #    # number of programs in group
-#    num_pid_in_group = GROUP_SIZE_M * num_pid_n 
+#    num_pid_in_group = GROUP_SIZE_M * num_pid_n
 #    # id of the group this program is in
-#    group_id = pid // num_pid_in_group 
+#    group_id = pid // num_pid_in_group
 #    # row-id of the first program in the group
-#    first_pid_m = group_id * GROUP_SIZE_M 
+#    first_pid_m = group_id * GROUP_SIZE_M
 #    # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
-#    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 
+#    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
 #    # *within groups*, programs are ordered in a column-major order
 #    # row-id of the program in the *launch grid*
 #    pid_m = first_pid_m + (pid % group_size_m)
@@ -141,6 +141,7 @@ You will specifically learn about:
 #
 
 import torch
+
 import triton
 import triton.language as tl
 
@@ -152,23 +153,22 @@ import triton.language as tl
 #   - An autotuning *key* whose change in values will trigger evaluation of all the
 #       provided configs
 
+
 @triton.autotune(
     configs=[
         triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
         triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
         triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_M': 32 , 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
+        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
+        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
     ],
     key=['M', 'N', 'K'],
 )
-# %
-# We can now define our kernel as normal, using all the techniques presented above
 @triton.jit
 def matmul_kernel(
     # Pointers to matrices
@@ -182,17 +182,13 @@ def matmul_kernel(
     stride_bk, stride_bn,
     stride_cm, stride_cn,
     # Meta-parameters
-    **meta,
+    BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+    GROUP_SIZE_M: tl.constexpr,
+    ACTIVATION: tl.constexpr,
 ):
     """Kernel for computing the matmul C = A x B.
     A has shape (M, K), B has shape (K, N) and C has shape (M, N)
     """
-    # extract meta-parameters
-    BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
-    BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
-    BLOCK_SIZE_K = meta['BLOCK_SIZE_K']
-    GROUP_SIZE_M = 8
-
     # -----------------------------------------------------------
     # Map program ids `pid` to the block of C it should compute.
     # This is done in a grouped ordering to promote L2 data reuse
@@ -200,16 +196,16 @@ def matmul_kernel(
     pid = tl.program_id(axis=0)
     num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
     num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-    num_pid_in_group = GROUP_SIZE_M * num_pid_n 
-    group_id = pid // num_pid_in_group 
-    first_pid_m = group_id * GROUP_SIZE_M 
-    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 
+    num_pid_in_group = GROUP_SIZE_M * num_pid_n
+    group_id = pid // num_pid_in_group
+    first_pid_m = group_id * GROUP_SIZE_M
+    group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
     pid_m = first_pid_m + (pid % group_size_m)
     pid_n = (pid % num_pid_in_group) // group_size_m
 
     # ----------------------------------------------------------
     # Create pointers for the first blocks of A and B.
-    # We will advance this pointer as we move in the K direction 
+    # We will advance this pointer as we move in the K direction
     # and accumulate
     # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
     # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers
@@ -217,8 +213,8 @@ def matmul_kernel(
     offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
     offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
-    b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
+    a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+    b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
 
     # -----------------------------------------------------------
     # Iterate to compute a block of the C matrix
@@ -227,8 +223,8 @@ def matmul_kernel(
     # `accumulator` will be converted back to fp16 after the loop
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
     for k in range(0, K, BLOCK_SIZE_K):
-        # Note that for simplicity, we don't apply a mask here. 
-        # This means that if K is not a multiple of BLOCK_SIZE_K, 
+        # Note that for simplicity, we don't apply a mask here.
+        # This means that if K is not a multiple of BLOCK_SIZE_K,
         # this will access out-of-bounds memory and produce an
         # error or (worse!) incorrect results.
         a = tl.load(a_ptrs)
@@ -239,9 +235,9 @@ def matmul_kernel(
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
     # you can fuse arbitrary activation functions here
-    # while the accumulator is still in FP32 !
-    if meta['ACTIVATION']: 
-        accumulator = meta['ACTIVATION'](accumulator)
+    # while the accumulator is still in FP32!
+    if ACTIVATION:
+        accumulator = ACTIVATION(accumulator)
     c = accumulator.to(tl.float16)
 
     # -----------------------------------------------------------
diff --git a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
index 15efa7c81..7af24e18d 100644
--- a/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
+++ b/_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py
@@ -18,6 +18,9 @@ You will learn about:
 
 import torch
 
+import triton
+import triton.language as tl
+
 
 @torch.jit.script
 def naive_softmax(x):
@@ -59,17 +62,14 @@ def naive_softmax(x):
 # power-of-two number of elements, so we need to internally "pad" each row and guard the
 # memory operations properly if we want to handle any possible input shapes:
 
-import triton
-import triton.language as tl
-
 
 @triton.jit
 def softmax_kernel(
-    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta
+    output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols,
+    BLOCK_SIZE: tl.constexpr
 ):
     # The rows of the softmax are independent, so we parallelize across those
     row_idx = tl.program_id(0)
-    BLOCK_SIZE = meta['BLOCK_SIZE']
     # The stride represents how much we need to increase the pointer to advance 1 row
     row_start_ptr = input_ptr + row_idx * input_row_stride
     # The block size is the next power of two greater than n_cols, so we can fit each
@@ -134,9 +134,9 @@ torch.manual_seed(0)
 x = torch.randn(1823, 781, device='cuda')
 y_triton = softmax(x)
 y_torch = torch.softmax(x, axis=1)
-print(torch.allclose(y_triton, y_torch))
+assert torch.allclose(y_triton, y_torch), (y_triton, y_torch)
 
-#%%
+# %%
 # As expected, the results are identical.
 
 # %%
@@ -187,5 +187,5 @@ benchmark.run(show_plots=True, print_data=True)
 # In the above plot, we can see that:
 #
 #  - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
-#  - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. 
+#  - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
 #    Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.
diff --git a/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb b/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
index 97cda1a7b..a88ec6569 100644
--- a/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
+++ b/_downloads/f191ee1e78dc52eb5f7cba88f71cef2f/01-vector-add.ipynb
@@ -33,7 +33,7 @@
       },
       "outputs": [],
       "source": [
-        "import torch\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n    x_ptr,  # *Pointer* to first input vector\n    y_ptr,  # *Pointer* to second input vector\n    output_ptr,  # *Pointer* to output vector\n    n_elements,  # Size of the vector\n    **meta,  # Optional meta-parameters for the kernel\n):\n    BLOCK_SIZE = meta['BLOCK_SIZE']  # How many inputs each program should process\n    # There are multiple 'program's processing different data. We identify which program\n    # we are here\n    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0\n    # This program will process inputs that are offset from the initial data.\n    # for instance, if you had a vector of length 256 and block_size of 64, the programs\n    # would each access the elements [0:64, 64:128, 128:192, 192:256].\n    # Note that offsets is a list of pointers\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    # Create a mask to guard memory operations against out-of-bounds accesses\n    mask = offsets < n_elements\n    # Load x and y from DRAM, masking out any extar elements in case the input is not a\n    # multiple of the block size\n    x = tl.load(x_ptr + offsets, mask=mask)\n    y = tl.load(y_ptr + offsets, mask=mask)\n    output = x + y\n    # Write x + y back to DRAM\n    tl.store(output_ptr + offsets, output, mask=mask)"
+        "import torch\n\nimport triton\nimport triton.language as tl\n\n\n@triton.jit\ndef add_kernel(\n    x_ptr,  # *Pointer* to first input vector\n    y_ptr,  # *Pointer* to second input vector\n    output_ptr,  # *Pointer* to output vector\n    n_elements,  # Size of the vector\n    BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process\n                 # NOTE: `constexpr` so it can be used as a shape value\n):\n    # There are multiple 'program's processing different data. We identify which program\n    # we are here\n    pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0\n    # This program will process inputs that are offset from the initial data.\n    # for instance, if you had a vector of length 256 and block_size of 64, the programs\n    # would each access the elements [0:64, 64:128, 128:192, 192:256].\n    # Note that offsets is a list of pointers\n    block_start = pid * BLOCK_SIZE\n    offsets = block_start + tl.arange(0, BLOCK_SIZE)\n    # Create a mask to guard memory operations against out-of-bounds accesses\n    mask = offsets < n_elements\n    # Load x and y from DRAM, masking out any extra elements in case the input is not a\n    # multiple of the block size\n    x = tl.load(x_ptr + offsets, mask=mask)\n    y = tl.load(y_ptr + offsets, mask=mask)\n    output = x + y\n    # Write x + y back to DRAM\n    tl.store(output_ptr + offsets, output, mask=mask)"
       ]
     },
     {
@@ -51,7 +51,7 @@
       },
       "outputs": [],
       "source": [
-        "def add(x: torch.Tensor, y: torch.Tensor):\n    # We need to preallocate the output\n    output = torch.empty_like(x)\n    assert x.is_cuda and y.is_cuda and output.is_cuda\n    n_elements = output.numel()\n    # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n    # In this case, we use a 1D grid where the size is the number of blocks\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    # NOTE:\n    #  - each torch.tensor object is implicitly converted into a pointer to its first element.\n    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n    #  - don't forget to pass meta-parameters as keywords arguments\n    pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)\n    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n    # running asynchronously at this point.\n    return output"
+        "def add(x: torch.Tensor, y: torch.Tensor):\n    # We need to preallocate the output\n    output = torch.empty_like(x)\n    assert x.is_cuda and y.is_cuda and output.is_cuda\n    n_elements = output.numel()\n    # The SPMD launch grid denotes the number of kernel instances that run in parallel.\n    # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int]\n    # In this case, we use a 1D grid where the size is the number of blocks\n    grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)\n    # NOTE:\n    #  - each torch.tensor object is implicitly converted into a pointer to its first element.\n    #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel\n    #  - don't forget to pass meta-parameters as keywords arguments\n    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)\n    # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still\n    # running asynchronously at this point.\n    return output"
       ]
     },
     {
diff --git a/_images/sphx_glr_01-vector-add_001.png b/_images/sphx_glr_01-vector-add_001.png
index 2b16f415b..537a7126b 100644
Binary files a/_images/sphx_glr_01-vector-add_001.png and b/_images/sphx_glr_01-vector-add_001.png differ
diff --git a/_images/sphx_glr_01-vector-add_thumb.png b/_images/sphx_glr_01-vector-add_thumb.png
index b27cee5dc..4785bf526 100644
Binary files a/_images/sphx_glr_01-vector-add_thumb.png and b/_images/sphx_glr_01-vector-add_thumb.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_001.png b/_images/sphx_glr_02-fused-softmax_001.png
index ece2ffadf..342506648 100644
Binary files a/_images/sphx_glr_02-fused-softmax_001.png and b/_images/sphx_glr_02-fused-softmax_001.png differ
diff --git a/_images/sphx_glr_02-fused-softmax_thumb.png b/_images/sphx_glr_02-fused-softmax_thumb.png
index d97715a22..48661ffbd 100644
Binary files a/_images/sphx_glr_02-fused-softmax_thumb.png and b/_images/sphx_glr_02-fused-softmax_thumb.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_001.png b/_images/sphx_glr_03-matrix-multiplication_001.png
index 219f8128c..9f7135064 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_001.png and b/_images/sphx_glr_03-matrix-multiplication_001.png differ
diff --git a/_images/sphx_glr_03-matrix-multiplication_thumb.png b/_images/sphx_glr_03-matrix-multiplication_thumb.png
index 6690582d5..0a474e279 100644
Binary files a/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/_images/sphx_glr_03-matrix-multiplication_thumb.png differ
diff --git a/_images/sphx_glr_05-layer-norm_001.png b/_images/sphx_glr_05-layer-norm_001.png
new file mode 100644
index 000000000..40a15150b
Binary files /dev/null and b/_images/sphx_glr_05-layer-norm_001.png differ
diff --git a/_images/sphx_glr_05-layer-norm_thumb.png b/_images/sphx_glr_05-layer-norm_thumb.png
new file mode 100644
index 000000000..5f5140903
Binary files /dev/null and b/_images/sphx_glr_05-layer-norm_thumb.png differ
diff --git a/_sources/getting-started/installation.rst.txt b/_sources/getting-started/installation.rst.txt
index db6b6261b..20c4628bc 100644
--- a/_sources/getting-started/installation.rst.txt
+++ b/_sources/getting-started/installation.rst.txt
@@ -44,7 +44,7 @@ You can then test your installation by running the unit tests:
 
 .. code-block:: bash
 
-      pip install -r requirements-test.txt
+      pip install -e '.[tests]'
       pytest -vs test/unit/
 
 and the benchmarks
diff --git a/_sources/getting-started/tutorials/01-vector-add.rst.txt b/_sources/getting-started/tutorials/01-vector-add.rst.txt
index 6c8ab47b9..933d1eb9f 100644
--- a/_sources/getting-started/tutorials/01-vector-add.rst.txt
+++ b/_sources/getting-started/tutorials/01-vector-add.rst.txt
@@ -31,12 +31,13 @@ In this tutorial, you will write a simple vector addition using Triton and learn
 Compute Kernel
 --------------------------
 
-.. GENERATED FROM PYTHON SOURCE LINES 14-49
+.. GENERATED FROM PYTHON SOURCE LINES 14-50
 
 .. code-block:: default
 
 
     import torch
+
     import triton
     import triton.language as tl
 
@@ -47,9 +48,9 @@ Compute Kernel
         y_ptr,  # *Pointer* to second input vector
         output_ptr,  # *Pointer* to output vector
         n_elements,  # Size of the vector
-        **meta,  # Optional meta-parameters for the kernel
+        BLOCK_SIZE: tl.constexpr,  # Number of elements each program should process
+                     # NOTE: `constexpr` so it can be used as a shape value
     ):
-        BLOCK_SIZE = meta['BLOCK_SIZE']  # How many inputs each program should process
         # There are multiple 'program's processing different data. We identify which program
         # we are here
         pid = tl.program_id(axis=0)  # We use a 1D launch grid so axis is 0
@@ -61,7 +62,7 @@ Compute Kernel
         offsets = block_start + tl.arange(0, BLOCK_SIZE)
         # Create a mask to guard memory operations against out-of-bounds accesses
         mask = offsets < n_elements
-        # Load x and y from DRAM, masking out any extar elements in case the input is not a
+        # Load x and y from DRAM, masking out any extra elements in case the input is not a
         # multiple of the block size
         x = tl.load(x_ptr + offsets, mask=mask)
         y = tl.load(y_ptr + offsets, mask=mask)
@@ -77,12 +78,12 @@ Compute Kernel
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 50-52
+.. GENERATED FROM PYTHON SOURCE LINES 51-53
 
 Let's also declare a helper function to (1) allocate the `z` tensor
 and (2) enqueue the above kernel with appropriate grid/block sizes.
 
-.. GENERATED FROM PYTHON SOURCE LINES 52-73
+.. GENERATED FROM PYTHON SOURCE LINES 53-74
 
 .. code-block:: default
 
@@ -101,7 +102,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
         #  - each torch.tensor object is implicitly converted into a pointer to its first element.
         #  - `triton.jit`'ed functions can be index with a launch grid to obtain a callable GPU kernel
         #  - don't forget to pass meta-parameters as keywords arguments
-        pgm = add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
+        add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
         # We return a handle to z but, since `torch.cuda.synchronize()` hasn't been called, the kernel is still
         # running asynchronously at this point.
         return output
@@ -114,11 +115,11 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 74-75
+.. GENERATED FROM PYTHON SOURCE LINES 75-76
 
 We can now use the above function to compute the element-wise sum of two `torch.tensor` objects and test its correctness:
 
-.. GENERATED FROM PYTHON SOURCE LINES 75-89
+.. GENERATED FROM PYTHON SOURCE LINES 76-90
 
 .. code-block:: default
 
@@ -153,11 +154,11 @@ We can now use the above function to compute the element-wise sum of two `torch.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 90-91
+.. GENERATED FROM PYTHON SOURCE LINES 91-92
 
 Seems like we're good to go!
 
-.. GENERATED FROM PYTHON SOURCE LINES 93-98
+.. GENERATED FROM PYTHON SOURCE LINES 94-99
 
 Benchmark
 -----------
@@ -165,7 +166,7 @@ We can now benchmark our custom op on vectors of increasing sizes to get a sense
 To make things easier, Triton has a set of built-in utilities that allow us to concisely plot the performance of your custom ops
 for different problem sizes.
 
-.. GENERATED FROM PYTHON SOURCE LINES 98-127
+.. GENERATED FROM PYTHON SOURCE LINES 99-128
 
 .. code-block:: default
 
@@ -205,12 +206,12 @@ for different problem sizes.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 128-130
+.. GENERATED FROM PYTHON SOURCE LINES 129-131
 
 We can now run the decorated function above. Pass `print_data=True` to see the performance number, `show_plots=True` to plot them, and/or
 `save_path='/path/to/results/' to save them to disk along with raw CSV data
 
-.. GENERATED FROM PYTHON SOURCE LINES 130-131
+.. GENERATED FROM PYTHON SOURCE LINES 131-132
 
 .. code-block:: default
 
@@ -237,14 +238,14 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
     3       32768.0   76.800002   76.800002
     4       65536.0  127.999995  127.999995
     5      131072.0  219.428568  219.428568
-    6      262144.0  341.333321  384.000001
+    6      262144.0  341.333321  341.333321
     7      524288.0  472.615390  472.615390
     8     1048576.0  614.400016  614.400016
     9     2097152.0  722.823517  722.823517
     10    4194304.0  780.190482  780.190482
     11    8388608.0  812.429770  812.429770
     12   16777216.0  833.084721  833.084721
-    13   33554432.0  842.004273  843.811163
+    13   33554432.0  842.004273  842.004273
     14   67108864.0  847.448255  848.362445
     15  134217728.0  849.737435  850.656574
 
@@ -254,7 +255,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 1 minutes  52.411 seconds)
+   **Total running time of the script:** ( 1 minutes  50.312 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py:
diff --git a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
index b330e50f0..597010993 100644
--- a/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
+++ b/_sources/getting-started/tutorials/02-fused-softmax.rst.txt
@@ -35,13 +35,16 @@ Motivations
 Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
 Let us consider instead the case of a simple (numerically stabilized) softmax operation:
 
-.. GENERATED FROM PYTHON SOURCE LINES 18-43
+.. GENERATED FROM PYTHON SOURCE LINES 18-46
 
 .. code-block:: default
 
 
     import torch
 
+    import triton
+    import triton.language as tl
+
 
     @torch.jit.script
     def naive_softmax(x):
@@ -71,7 +74,7 @@ Let us consider instead the case of a simple (numerically stabilized) softmax op
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 44-52
+.. GENERATED FROM PYTHON SOURCE LINES 47-55
 
 When implemented naively in PyTorch, computing :code:`y = naive_softmax(x)` for :math:`x \in R^{M \times N}`
 requires reading :math:`5MN + 2M` elements from DRAM and writing back :math:`3MN + 2M` elements.
@@ -82,7 +85,7 @@ expect a theoretical speed-up of ~4x (i.e., :math:`(8MN + 4M) / 2MN`).
 The `torch.jit.script` flags aims to perform this kind of "kernel fusion" automatically
 but, as we will see later, it is still far from ideal.
 
-.. GENERATED FROM PYTHON SOURCE LINES 54-61
+.. GENERATED FROM PYTHON SOURCE LINES 57-64
 
 Compute Kernel
 ----------------
@@ -92,22 +95,19 @@ Note that one important limitation of Triton is that each block must have a
 power-of-two number of elements, so we need to internally "pad" each row and guard the
 memory operations properly if we want to handle any possible input shapes:
 
-.. GENERATED FROM PYTHON SOURCE LINES 61-93
+.. GENERATED FROM PYTHON SOURCE LINES 64-93
 
 .. code-block:: default
 
 
-    import triton
-    import triton.language as tl
-
 
     @triton.jit
     def softmax_kernel(
-        output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols, **meta
+        output_ptr, input_ptr, input_row_stride, output_row_stride, n_cols,
+        BLOCK_SIZE: tl.constexpr
     ):
         # The rows of the softmax are independent, so we parallelize across those
         row_idx = tl.program_id(0)
-        BLOCK_SIZE = meta['BLOCK_SIZE']
         # The stride represents how much we need to increase the pointer to advance 1 row
         row_start_ptr = input_ptr + row_idx * input_row_stride
         # The block size is the next power of two greater than n_cols, so we can fit each
@@ -199,20 +199,12 @@ This will allow us to verify that our padding mechanism works.
     x = torch.randn(1823, 781, device='cuda')
     y_triton = softmax(x)
     y_torch = torch.softmax(x, axis=1)
-    print(torch.allclose(y_triton, y_torch))
+    assert torch.allclose(y_triton, y_torch), (y_triton, y_torch)
 
 
 
 
 
-.. rst-class:: sphx-glr-script-out
-
- Out:
-
- .. code-block:: none
-
-    True
-
 
 
 
@@ -286,17 +278,17 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
 
     softmax-performance:
               N      Triton  Torch (native)  Torch (jit)
-    0     256.0  512.000001      546.133347   188.321838
-    1     384.0  585.142862      585.142862   151.703707
-    2     512.0  655.360017      606.814814   154.566038
+    0     256.0  546.133347      512.000001   190.511628
+    1     384.0  614.400016      558.545450   153.600004
+    2     512.0  655.360017      585.142849   154.566038
     3     640.0  682.666684      640.000002   160.000000
-    4     768.0  702.171410      646.736871   163.839992
+    4     768.0  722.823517      664.216187   162.754967
     ..      ...         ...             ...          ...
-    93  12160.0  810.666687      405.755985   199.038365
-    94  12288.0  812.429770      415.661740   199.197579
-    95  12416.0  809.189387      412.149375   198.854847
-    96  12544.0  807.661970      412.971190   199.012395
-    97  12672.0  807.776923      412.097543   199.167004
+    93  12160.0  814.058574      406.179533   198.530610
+    94  12288.0  814.111783      415.661740   198.694297
+    95  12416.0  814.163950      412.149375   198.457532
+    96  12544.0  814.214963      412.546756   198.716830
+    97  12672.0  814.265046      412.097543   198.679085
 
     [98 rows x 4 columns]
 
@@ -308,13 +300,13 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t
 In the above plot, we can see that:
 
  - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
- - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**. 
+ - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
    Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.
 
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 3 minutes  26.243 seconds)
+   **Total running time of the script:** ( 3 minutes  22.431 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py:
diff --git a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
index 26a4d7d39..8bd7b6509 100644
--- a/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
+++ b/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt
@@ -132,13 +132,13 @@ switching to the next column:
    # number of programs ids along the N axis
    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
    # number of programs in group
-   num_pid_in_group = GROUP_SIZE_M * num_pid_n 
+   num_pid_in_group = GROUP_SIZE_M * num_pid_n
    # id of the group this program is in
-   group_id = pid // num_pid_in_group 
+   group_id = pid // num_pid_in_group
    # row-id of the first program in the group
-   first_pid_m = group_id * GROUP_SIZE_M 
+   first_pid_m = group_id * GROUP_SIZE_M
    # if `num_pid_m` isn't divisible by `GROUP_SIZE_M`, the last group is smaller
-   group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 
+   group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
    # *within groups*, programs are ordered in a column-major order
    # row-id of the program in the *launch grid*
    pid_m = first_pid_m + (pid % group_size_m)
@@ -161,12 +161,13 @@ Final Result
 -------------
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 142-262
+.. GENERATED FROM PYTHON SOURCE LINES 142-258
 
 .. code-block:: default
 
 
     import torch
+
     import triton
     import triton.language as tl
 
@@ -178,23 +179,22 @@ Final Result
     #   - An autotuning *key* whose change in values will trigger evaluation of all the
     #       provided configs
 
+
     @triton.autotune(
         configs=[
             triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
             triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-            triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64,  'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+            triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
             triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-            triton.Config({'BLOCK_SIZE_M': 64 , 'BLOCK_SIZE_N': 32 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-            triton.Config({'BLOCK_SIZE_M': 32 , 'BLOCK_SIZE_N': 64 , 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
+            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+            triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
+            triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
+            triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
         ],
         key=['M', 'N', 'K'],
     )
-    # %
-    # We can now define our kernel as normal, using all the techniques presented above
     @triton.jit
     def matmul_kernel(
         # Pointers to matrices
@@ -208,17 +208,13 @@ Final Result
         stride_bk, stride_bn,
         stride_cm, stride_cn,
         # Meta-parameters
-        **meta,
+        BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr,
+        GROUP_SIZE_M: tl.constexpr,
+        ACTIVATION: tl.constexpr,
     ):
         """Kernel for computing the matmul C = A x B.
         A has shape (M, K), B has shape (K, N) and C has shape (M, N)
         """
-        # extract meta-parameters
-        BLOCK_SIZE_M = meta['BLOCK_SIZE_M']
-        BLOCK_SIZE_N = meta['BLOCK_SIZE_N']
-        BLOCK_SIZE_K = meta['BLOCK_SIZE_K']
-        GROUP_SIZE_M = 8
-
         # -----------------------------------------------------------
         # Map program ids `pid` to the block of C it should compute.
         # This is done in a grouped ordering to promote L2 data reuse
@@ -226,16 +222,16 @@ Final Result
         pid = tl.program_id(axis=0)
         num_pid_m = tl.cdiv(M, BLOCK_SIZE_M)
         num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
-        num_pid_in_group = GROUP_SIZE_M * num_pid_n 
-        group_id = pid // num_pid_in_group 
-        first_pid_m = group_id * GROUP_SIZE_M 
-        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) 
+        num_pid_in_group = GROUP_SIZE_M * num_pid_n
+        group_id = pid // num_pid_in_group
+        first_pid_m = group_id * GROUP_SIZE_M
+        group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M)
         pid_m = first_pid_m + (pid % group_size_m)
         pid_n = (pid % num_pid_in_group) // group_size_m
 
         # ----------------------------------------------------------
         # Create pointers for the first blocks of A and B.
-        # We will advance this pointer as we move in the K direction 
+        # We will advance this pointer as we move in the K direction
         # and accumulate
         # a_ptrs is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers
         # b_ptrs is a block of [BLOCK_SIZE_K, BLOCK_SIZE_n] pointers
@@ -243,8 +239,8 @@ Final Result
         offs_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
         offs_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
         offs_k = tl.arange(0, BLOCK_SIZE_K)
-        a_ptrs = a_ptr + (offs_am[:, None]*stride_am + offs_k [None, :]*stride_ak)
-        b_ptrs = b_ptr + (offs_k [:, None]*stride_bk + offs_bn[None, :]*stride_bn)
+        a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak)
+        b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
 
         # -----------------------------------------------------------
         # Iterate to compute a block of the C matrix
@@ -253,8 +249,8 @@ Final Result
         # `accumulator` will be converted back to fp16 after the loop
         accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
         for k in range(0, K, BLOCK_SIZE_K):
-            # Note that for simplicity, we don't apply a mask here. 
-            # This means that if K is not a multiple of BLOCK_SIZE_K, 
+            # Note that for simplicity, we don't apply a mask here.
+            # This means that if K is not a multiple of BLOCK_SIZE_K,
             # this will access out-of-bounds memory and produce an
             # error or (worse!) incorrect results.
             a = tl.load(a_ptrs)
@@ -265,9 +261,9 @@ Final Result
             a_ptrs += BLOCK_SIZE_K * stride_ak
             b_ptrs += BLOCK_SIZE_K * stride_bk
         # you can fuse arbitrary activation functions here
-        # while the accumulator is still in FP32 !
-        if meta['ACTIVATION']: 
-            accumulator = meta['ACTIVATION'](accumulator)
+        # while the accumulator is still in FP32!
+        if ACTIVATION:
+            accumulator = ACTIVATION(accumulator)
         c = accumulator.to(tl.float16)
 
         # -----------------------------------------------------------
@@ -292,12 +288,12 @@ Final Result
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 263-265
+.. GENERATED FROM PYTHON SOURCE LINES 259-261
 
 We can now create a convenience wrapper function that only takes two input tensors
 and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel
 
-.. GENERATED FROM PYTHON SOURCE LINES 265-294
+.. GENERATED FROM PYTHON SOURCE LINES 261-290
 
 .. code-block:: default
 
@@ -337,14 +333,14 @@ and (1) checks any shape constraint; (2) allocates the output; (3) launches the
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 295-299
+.. GENERATED FROM PYTHON SOURCE LINES 291-295
 
 Unit Test
 -----------
 
 We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS)
 
-.. GENERATED FROM PYTHON SOURCE LINES 299-312
+.. GENERATED FROM PYTHON SOURCE LINES 295-308
 
 .. code-block:: default
 
@@ -392,7 +388,7 @@ We can test our custom matrix multiplication operation against a native torch im
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 313-319
+.. GENERATED FROM PYTHON SOURCE LINES 309-315
 
 Benchmark
 --------------
@@ -401,7 +397,7 @@ Square Matrix Performance
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 We can now compare the performance of our kernel against that of cuBLAS. Here we focus on square matrices, but feel free to arrange this script as you wish to benchmark any other matrix shape.
 
-.. GENERATED FROM PYTHON SOURCE LINES 319-360
+.. GENERATED FROM PYTHON SOURCE LINES 315-356
 
 .. code-block:: default
 
@@ -463,36 +459,36 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
     matmul-performance:
              M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
     0    256.0   2.730667  ...   2.978909              2.978909
-    1    384.0   7.372800  ...   8.507077              8.507077
-    2    512.0  14.563555  ...  16.384000             16.384000
+    1    384.0   7.372800  ...   8.507077              7.899428
+    2    512.0  14.563555  ...  15.420235             16.384000
     3    640.0  22.260869  ...  24.380953             24.380953
     4    768.0  32.768000  ...  34.028308             34.028308
-    5    896.0  37.971025  ...  40.140799             39.025776
-    6   1024.0  49.932191  ...  53.773130             52.428801
-    7   1152.0  44.566925  ...  46.656000             46.656000
+    5    896.0  37.971025  ...  39.025776             39.025776
+    6   1024.0  49.932191  ...  52.428801             52.428801
+    7   1152.0  45.242181  ...  46.656000             46.656000
     8   1280.0  51.200001  ...  56.888887             56.109587
-    9   1408.0  64.138541  ...  67.305878             67.305878
-    10  1536.0  80.430545  ...  79.526831             79.526831
-    11  1664.0  63.372618  ...  62.492442             62.492442
-    12  1792.0  72.983276  ...  72.047592             72.047592
-    13  1920.0  69.120002  ...  70.172588             70.172588
-    14  2048.0  73.908442  ...  76.959706             76.608294
-    15  2176.0  83.500614  ...  86.367588             85.632545
-    16  2304.0  68.446623  ...  76.809875             76.809875
-    17  2432.0  71.305746  ...  74.918570             85.393507
-    18  2560.0  78.019048  ...  80.908642             80.709358
-    19  2688.0  83.552988  ...  89.149366             89.464755
-    20  2816.0  82.602666  ...  83.074685             83.233226
-    21  2944.0  82.646820  ...  82.373605             82.784108
-    22  3072.0  81.943708  ...  88.612060             87.516392
-    23  3200.0  78.914919  ...  91.822093             93.567248
-    24  3328.0  81.530349  ...  84.003845             84.496824
-    25  3456.0  82.519518  ...  91.200871             90.943675
-    26  3584.0  85.633710  ...  94.847460             96.579370
-    27  3712.0  85.528545  ...  85.019017             87.706180
-    28  3840.0  81.980725  ...  86.130841             91.247522
-    29  3968.0  85.993854  ...  91.747320             86.053553
-    30  4096.0  93.727466  ...  88.417474             84.307617
+    9   1408.0  64.138541  ...  66.485074             65.684049
+    10  1536.0  79.526831  ...  79.526831             78.643199
+    11  1664.0  62.929456  ...  62.061463             62.061463
+    12  1792.0  72.983276  ...  72.047592             71.588687
+    13  1920.0  68.776119  ...  70.172588             69.818184
+    14  2048.0  73.262953  ...  76.959706             76.608294
+    15  2176.0  83.500614  ...  85.998493             85.269692
+    16  2304.0  68.446623  ...  76.319081             75.834511
+    17  2432.0  71.125224  ...  82.509438             84.877538
+    18  2560.0  77.833728  ...  80.709358             81.108913
+    19  2688.0  83.369354  ...  89.676257             89.464755
+    20  2816.0  83.233226  ...  82.446516             81.827785
+    21  2944.0  82.373605  ...  82.373605             81.298583
+    22  3072.0  82.062468  ...  88.060814             88.473602
+    23  3200.0  82.368085  ...  89.761569             94.955488
+    24  3328.0  80.889094  ...  80.527177             82.939284
+    25  3456.0  81.849303  ...  86.783176             91.304157
+    26  3584.0  87.042978  ...  98.375705             90.364394
+    27  3712.0  79.726532  ...  90.815768             85.820159
+    28  3840.0  82.592983  ...  88.191387             91.398346
+    29  3968.0  85.873762  ...  90.656713             83.867052
+    30  4096.0  92.563952  ...  82.441739             82.291681
 
     [31 rows x 5 columns]
 
@@ -502,7 +498,7 @@ We can now compare the performance of our kernel against that of cuBLAS. Here we
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 5 minutes  41.070 seconds)
+   **Total running time of the script:** ( 6 minutes  5.923 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py:
diff --git a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
index fe5ecad7d..1ab30fb65 100644
--- a/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
+++ b/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt
@@ -32,7 +32,7 @@ whose state is generally composed of a bit mask tensor of the same shape as the
 
 Baseline
 -------------
-The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance 
+The *dropout* operator was first introduced in [SRIVASTAVA2014]_ as a way to improve the performance
 of deep neural networks in low-data regime (i.e. regularization).
 
 It takes a vector as input and produces a vector of the same shape as output. Each scalar in the
@@ -46,7 +46,7 @@ keeps the norm consistent regardless of the dropout probability.
 
 Let's first take a look at the baseline implementation.
 
-.. GENERATED FROM PYTHON SOURCE LINES 29-80
+.. GENERATED FROM PYTHON SOURCE LINES 29-82
 
 .. code-block:: default
 
@@ -54,19 +54,20 @@ Let's first take a look at the baseline implementation.
 
     import tabulate
     import torch
+
     import triton
     import triton.language as tl
 
+
     @triton.jit
     def _dropout(
-            x_ptr, # pointer to the input
-            x_keep_ptr, # pointer to a mask of 0s and 1s
-            output_ptr, # pointer to the output
-            n_elements, # number of elements in the `x` tensor
-            p, # probability that an element of `x` is changed to zero
-            **meta,
+            x_ptr,  # pointer to the input
+            x_keep_ptr,  # pointer to a mask of 0s and 1s
+            output_ptr,  # pointer to the output
+            n_elements,  # number of elements in the `x` tensor
+            p,  # probability that an element of `x` is changed to zero
+            BLOCK_SIZE: tl.constexpr,
     ):
-        BLOCK_SIZE = meta['BLOCK_SIZE']
         pid = tl.program_id(axis=0)
         block_start = pid * BLOCK_SIZE
         offsets = block_start + tl.arange(0, BLOCK_SIZE)
@@ -88,6 +89,7 @@ Let's first take a look at the baseline implementation.
         _dropout[grid](x, x_keep, output, n_elements, p, BLOCK_SIZE=1024)
         return output
 
+
     # Input tensor
     x = torch.randn(size=(10,)).cuda()
     # Dropout mask
@@ -120,7 +122,7 @@ Let's first take a look at the baseline implementation.
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 81-99
+.. GENERATED FROM PYTHON SOURCE LINES 83-101
 
 Seeded dropout
 -------------
@@ -132,7 +134,7 @@ that (1) has a smaller memory footprint; (2) requires less data movement; and (3
 of persisting randomness across multiple invocations of the kernel.
 
 Pseudorandom number generation in Triton is simple! In this tutorial we will use the
-:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32` 
+:code:`triton.language.rand` function which generates a block of uniformly distributed :code:`float32`
 values in [0, 1), given a seed and a block of :code:`int32` offsets. But if you need it, Triton also provides
 other :ref:`random number generation strategies <Random Number Generation>`.
 
@@ -141,11 +143,12 @@ other :ref:`random number generation strategies <Random Number Generation>`.
 
 Let's put it all together.
 
-.. GENERATED FROM PYTHON SOURCE LINES 99-147
+.. GENERATED FROM PYTHON SOURCE LINES 101-149
 
 .. code-block:: default
 
 
+
     @triton.jit
     def _seeded_dropout(
             x_ptr,
@@ -153,10 +156,9 @@ Let's put it all together.
             n_elements,
             p,
             seed,
-            **meta,
+            BLOCK_SIZE: tl.constexpr,
     ):
         # compute memory offsets of elements handled by this instance
-        BLOCK_SIZE = meta['BLOCK_SIZE']
         pid = tl.program_id(axis=0)
         block_start = pid * BLOCK_SIZE
         offsets = block_start + tl.arange(0, BLOCK_SIZE)
@@ -205,21 +207,21 @@ Let's put it all together.
 
     -------------------  ---------  --------  --------  -------  --------  --------  ---------  ---------  ---------  ---------
     input                -0.952835  0.371721  0.408716  1.42142  0.149397  -0.67086  -0.214186  -0.431969  -0.707878  -0.106434
-    output (seed = 123)   0         0.743443  0         2.84284  0.298794  -1.34172   0          0          0          0
-    output (seed = 123)   0         0.743443  0         2.84284  0.298794  -1.34172   0          0          0          0
-    output (seed = 512)  -1.90567   0.743443  0         2.84284  0.298794  -1.34172   0         -0.863938   0         -0.212868
+    output (seed = 123)   0         0.743443  0         0        0         -1.34172   0          0         -1.41576   -0.212868
+    output (seed = 123)   0         0.743443  0         0        0         -1.34172   0          0         -1.41576   -0.212868
+    output (seed = 512)   0         0         0.817432  2.84284  0         -1.34172  -0.428372   0          0          0
     -------------------  ---------  --------  --------  -------  --------  --------  ---------  ---------  ---------  ---------
 
 
 
 
-.. GENERATED FROM PYTHON SOURCE LINES 148-151
+.. GENERATED FROM PYTHON SOURCE LINES 150-153
 
 Et Voilà! We have a triton kernel that applies the same dropout mask provided the seed is the same!
 If you'd like explore further applications of pseudorandomness in GPU programming, we encourage you
 to explore the `triton/language/random` folder!
 
-.. GENERATED FROM PYTHON SOURCE LINES 153-158
+.. GENERATED FROM PYTHON SOURCE LINES 155-160
 
 Exercises
 -------------
@@ -227,7 +229,7 @@ Exercises
 2. Add support for striding.
 3. (challenge) Implement a kernel for sparse Johnson-Lindenstrauss transform which generates the projection matrix one the fly each time using a seed.
 
-.. GENERATED FROM PYTHON SOURCE LINES 160-165
+.. GENERATED FROM PYTHON SOURCE LINES 162-167
 
 References
 --------------
@@ -238,7 +240,7 @@ References
 
 .. rst-class:: sphx-glr-timing
 
-   **Total running time of the script:** ( 0 minutes  0.010 seconds)
+   **Total running time of the script:** ( 0 minutes  0.477 seconds)
 
 
 .. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py:
diff --git a/_sources/getting-started/tutorials/05-layer-norm.rst.txt b/_sources/getting-started/tutorials/05-layer-norm.rst.txt
new file mode 100644
index 000000000..b99c822c8
--- /dev/null
+++ b/_sources/getting-started/tutorials/05-layer-norm.rst.txt
@@ -0,0 +1,370 @@
+
+.. DO NOT EDIT.
+.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
+.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
+.. "getting-started/tutorials/05-layer-norm.py"
+.. LINE NUMBERS ARE GIVEN BELOW.
+
+.. only:: html
+
+    .. note::
+        :class: sphx-glr-download-link-note
+
+        Click :ref:`here <sphx_glr_download_getting-started_tutorials_05-layer-norm.py>`
+        to download the full example code
+
+.. rst-class:: sphx-glr-example-title
+
+.. _sphx_glr_getting-started_tutorials_05-layer-norm.py:
+
+
+Layer Normalization
+====================
+
+.. GENERATED FROM PYTHON SOURCE LINES 5-262
+
+
+
+.. image:: /getting-started/tutorials/images/sphx_glr_05-layer-norm_001.png
+    :alt: 05 layer norm
+    :class: sphx-glr-single-img
+
+
+.. rst-class:: sphx-glr-script-out
+
+ Out:
+
+ .. code-block:: none
+
+    layer-norm-backward:
+              N      Triton       Torch
+    0    1024.0  311.088617   99.497980
+    1    1536.0  347.773587  133.565214
+    2    2048.0  420.102553  162.217818
+    3    2560.0  455.111129  182.857144
+    4    3072.0  511.999982  191.501303
+    5    3584.0  551.384634  208.271186
+    6    4096.0  568.231237  220.907859
+    7    4608.0  502.690905  232.336141
+    8    5120.0  527.381977  243.326731
+    9    5632.0  540.671974  243.545956
+    10   6144.0  544.118087  249.081070
+    11   6656.0  528.953642  256.410903
+    12   7168.0  507.469040  262.243907
+    13   7680.0  481.253256  261.076480
+    14   8192.0  461.521112  269.326017
+    15   8704.0  417.791980  268.159180
+    16   9216.0  431.157889  273.404206
+    17   9728.0  442.181815  280.953074
+    18  10240.0  448.467168  286.767793
+    19  10752.0  427.231788  246.699797
+    20  11264.0  427.071098  245.313973
+    21  11776.0  420.571432  249.447482
+    22  12288.0  420.102570  254.673582
+    23  12800.0  414.016170  253.674644
+    24  13312.0  410.652963  252.759501
+    25  13824.0  403.620451  257.390218
+    26  14336.0  396.387109  254.862216
+    27  14848.0  382.351933  257.293872
+    28  15360.0  374.253788  257.790220
+    29  15872.0  368.046389  262.890274
+
+
+
+
+
+
+|
+
+.. code-block:: default
+
+
+    import torch
+
+    import triton
+    import triton.language as tl
+
+    try:
+        # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it
+        # should not be added to extras_require in setup.py.
+        import apex
+        HAS_APEX = True
+    except ModuleNotFoundError:
+        HAS_APEX = False
+
+
+    # Forward Pass
+    @triton.jit
+    def _layer_norm_fwd_fused(X, Y, W, B, M, V, stride, N, eps,
+                              BLOCK_SIZE: tl.constexpr):
+        # position of elements processed by this program
+        row = tl.program_id(0)
+        cols = tl.arange(0, BLOCK_SIZE)
+        mask = cols < N
+        # offset data pointers to start at the row of interest
+        X += row * stride
+        Y += row * stride
+        # load data and cast to float32
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        # compute mean
+        mean = tl.sum(x, axis=0) / N
+        # compute std
+        xmean = tl.where(mask, x - mean, 0.)
+        var = tl.sum(xmean * xmean, axis=0) / N
+        rstd = 1 / tl.sqrt(var + eps)
+        xhat = xmean * rstd
+        # write-back mean/rstd
+        tl.store(M + row, mean)
+        tl.store(V + row, rstd)
+        # multiply by weight and add bias
+        w = tl.load(W + cols, mask=mask)
+        b = tl.load(B + cols, mask=mask)
+        y = xhat * w + b
+        # write-back
+        tl.store(Y + cols, y, mask=mask)
+
+
+    # Backward pass (DX + partial DW + partial DB)
+    @triton.jit
+    def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock, stride, N, eps,
+                                 GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
+        # position of elements processed by this program
+        row = tl.program_id(0)
+        cols = tl.arange(0, BLOCK_SIZE_N)
+        mask = cols < N
+        # offset data pointers to start at the row of interest
+        X += row * stride
+        DY += row * stride
+        DX += row * stride
+        # offset locks and weight/bias gradient pointer
+        # each kernel instance accumulates partial sums for
+        # DW and DB into one of GROUP_SIZE_M independent buffers
+        # these buffers stay in the L2, which allow this kernel
+        # to be fast
+        lock_id = row % GROUP_SIZE_M
+        Lock += lock_id
+        Count = Lock + GROUP_SIZE_M
+        DW = DW + lock_id * N + cols
+        DB = DB + lock_id * N + cols
+        # load data to SRAM
+        x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+        dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+        w = tl.load(W + cols, mask=mask).to(tl.float32)
+        mean = tl.load(M + row)
+        rstd = tl.load(V + row)
+        # compute dx
+        xhat = (x - mean) * rstd
+        wdy = w * dy
+        xhat = tl.where(mask, xhat, 0.)
+        wdy = tl.where(mask, wdy, 0.)
+        mean1 = tl.sum(xhat * wdy, axis=0) / N
+        mean2 = tl.sum(wdy, axis=0) / N
+        dx = (wdy - (xhat * mean1 + mean2)) * rstd
+        # write-back dx
+        tl.store(DX + cols, dx, mask=mask)
+        # accumulate partial sums for dw/db
+        partial_dw = (dy * xhat).to(w.dtype)
+        partial_db = (dy).to(w.dtype)
+        while tl.atomic_cas(Lock, 0, 1) == 1:
+            pass
+        count = tl.load(Count)
+        # first store doesn't accumulate
+        if count == 0:
+            tl.atomic_xchg(Count, 1)
+        else:
+            partial_dw += tl.load(DW, mask=mask)
+            partial_db += tl.load(DB, mask=mask)
+        tl.store(DW, partial_dw, mask=mask)
+        tl.store(DB, partial_db, mask=mask)
+        # release lock
+        tl.atomic_xchg(Lock, 0)
+
+    # Backward pass (total DW + total DB)
+
+
+    @triton.jit
+    def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N,
+                             BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
+        pid = tl.program_id(0)
+        cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
+        dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+        db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+        for i in range(0, M, BLOCK_SIZE_M):
+            rows = i + tl.arange(0, BLOCK_SIZE_M)
+            mask = (rows[:, None] < M) & (cols[None, :] < N)
+            offs = rows[:, None] * N + cols[None, :]
+            dw += tl.load(DW + offs, mask=mask, other=0.)
+            db += tl.load(DB + offs, mask=mask, other=0.)
+        sum_dw = tl.sum(dw, axis=0)
+        sum_db = tl.sum(db, axis=0)
+        tl.store(FINAL_DW + cols, sum_dw, mask=cols < N)
+        tl.store(FINAL_DB + cols, sum_db, mask=cols < N)
+
+
+    class LayerNorm(torch.autograd.Function):
+
+        @staticmethod
+        def forward(ctx, x, normalized_shape, weight, bias, eps):
+            # allocate output
+            y = torch.empty_like(x)
+            # reshape input data into 2D tensor
+            x_arg = x.reshape(-1, x.shape[-1])
+            M, N = x_arg.shape
+            mean = torch.empty((M, ), dtype=torch.float32, device='cuda')
+            rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')
+            # Less than 64KB per feature: enqueue fused kernel
+            MAX_FUSED_SIZE = 65536 // x.element_size()
+            BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
+            if N > BLOCK_SIZE:
+                raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+            # heuristics for number of warps
+            num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
+            # enqueue kernel
+            _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,
+                                        x_arg.stride(0), N, eps,
+                                        BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
+            ctx.save_for_backward(x, weight, bias, mean, rstd)
+            ctx.BLOCK_SIZE = BLOCK_SIZE
+            ctx.num_warps = num_warps
+            ctx.eps = eps
+            return y
+
+        @staticmethod
+        def backward(ctx, dy):
+            x, w, b, m, v = ctx.saved_tensors
+            # heuristics for amount of parallel reduction stream for DG/DB
+            N = w.shape[0]
+            GROUP_SIZE_M = 64
+            if N <= 8192: GROUP_SIZE_M = 96
+            if N <= 4096: GROUP_SIZE_M = 128
+            if N <= 1024: GROUP_SIZE_M = 256
+            # allocate output
+            locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda')
+            _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+            _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+            dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+            db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+            dx = torch.empty_like(dy)
+            # enqueue kernel using forward pass heuristics
+            # also compute partial sums for DW and DB
+            x_arg = x.reshape(-1, x.shape[-1])
+            M, N = x_arg.shape
+            _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,
+                                           x_arg.stride(0), N, ctx.eps,
+                                           BLOCK_SIZE_N=ctx.BLOCK_SIZE,
+                                           GROUP_SIZE_M=GROUP_SIZE_M,
+                                           num_warps=ctx.num_warps)
+            grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]
+            # accumulate partial sums in separate kernel
+            _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,
+                                       BLOCK_SIZE_M=32,
+                                       BLOCK_SIZE_N=128)
+            return dx, None, dw, db, None
+
+
+    layer_norm = LayerNorm.apply
+
+
+    def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
+        # create data
+        x_shape = (M, N)
+        w_shape = (x_shape[-1], )
+        weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+        bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+        x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+        dy = .1 * torch.randn_like(x)
+        x.requires_grad_(True)
+        # forward pass
+        y_tri = layer_norm(x, w_shape, weight, bias, eps)
+        y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
+        # backward pass (triton)
+        y_tri.backward(dy, retain_graph=True)
+        dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]
+        x.grad, weight.grad, bias.grad = None, None, None
+        # backward pass (torch)
+        y_ref.backward(dy, retain_graph=True)
+        dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
+        # compare
+        triton.testing.assert_almost_equal(y_tri, y_ref)
+        triton.testing.assert_almost_equal(dx_tri, dx_ref)
+        triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)
+        triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)
+
+
+    @triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=['N'],
+            x_vals=[512 * i for i in range(2, 32)],
+            line_arg='provider',
+            line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []),
+            line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []),
+            styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
+            ylabel='GB/s',
+            plot_name='layer-norm-backward',
+            args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}
+        )
+    )
+    def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):
+        # create data
+        x_shape = (M, N)
+        w_shape = (x_shape[-1], )
+        weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+        bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
+        x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
+        dy = .1 * torch.randn_like(x)
+        x.requires_grad_(True)
+        # utility functions
+        if provider == 'triton':
+            y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)
+        if provider == 'torch':
+            y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)
+        if provider == 'apex':
+            apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)
+            y_fwd = lambda: apex_layer_norm(x)
+        # forward pass
+        if mode == 'forward':
+            gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6
+            ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)
+        # backward pass
+        if mode == 'backward':
+            gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6
+            y = y_fwd()
+            ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),
+                                                         grad_to_none=[x], rep=500)
+        return gbps(ms), gbps(max_ms), gbps(min_ms)
+
+
+    bench_layer_norm.run(save_path='.', print_data=True)
+
+
+.. rst-class:: sphx-glr-timing
+
+   **Total running time of the script:** ( 1 minutes  23.770 seconds)
+
+
+.. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py:
+
+
+.. only :: html
+
+ .. container:: sphx-glr-footer
+    :class: sphx-glr-footer-example
+
+
+
+  .. container:: sphx-glr-download sphx-glr-download-python
+
+     :download:`Download Python source code: 05-layer-norm.py <05-layer-norm.py>`
+
+
+
+  .. container:: sphx-glr-download sphx-glr-download-jupyter
+
+     :download:`Download Jupyter notebook: 05-layer-norm.ipynb <05-layer-norm.ipynb>`
+
+
+.. only:: html
+
+ .. rst-class:: sphx-glr-signature
+
+    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_
diff --git a/_sources/getting-started/tutorials/index.rst.txt b/_sources/getting-started/tutorials/index.rst.txt
index 6b38e843b..c8a39cdaf 100644
--- a/_sources/getting-started/tutorials/index.rst.txt
+++ b/_sources/getting-started/tutorials/index.rst.txt
@@ -7,7 +7,15 @@
 Tutorials
 ==================
 
-Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one. 
+Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.
+
+To install the dependencies for the tutorials:
+
+.. code-block:: bash
+
+    cd triton
+    pip install -e './python[tutorials]'
+
 
 
 .. raw:: html
@@ -93,6 +101,27 @@ Below is a gallery of tutorials for writing various basic operations with Triton
    :hidden:
 
    /getting-started/tutorials/04-low-memory-dropout
+
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="Layer Normalization">
+
+.. only:: html
+
+ .. figure:: /getting-started/tutorials/images/thumb/sphx_glr_05-layer-norm_thumb.png
+     :alt: Layer Normalization
+
+     :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py`
+
+.. raw:: html
+
+    </div>
+
+
+.. toctree::
+   :hidden:
+
+   /getting-started/tutorials/05-layer-norm
 .. raw:: html
 
     <div class="sphx-glr-clear"></div>
diff --git a/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
index e23d45392..69753601d 100644
--- a/_sources/getting-started/tutorials/sg_execution_times.rst.txt
+++ b/_sources/getting-started/tutorials/sg_execution_times.rst.txt
@@ -5,14 +5,16 @@
 
 Computation times
 =================
-**10:59.734** total execution time for **getting-started_tutorials** files:
+**12:42.913** total execution time for **getting-started_tutorials** files:
 
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 05:41.070 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) | 06:05.923 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:26.243 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``)                 | 03:22.431 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:52.411 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``)                       | 01:50.312 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.010 | 0.0 MB |
+| :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``)                       | 01:23.770 | 0.0 MB |
++---------------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``)       | 00:00.477 | 0.0 MB |
 +---------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/getting-started/installation.html b/getting-started/installation.html
index 2a523db99..ab6965d17 100644
--- a/getting-started/installation.html
+++ b/getting-started/installation.html
@@ -208,7 +208,7 @@ pip install -e .
 </div>
 <p>Note that, if llvm-11 is not present on your system, the setup.py script will download the official LLVM11 static libraries link against that.</p>
 <p>You can then test your installation by running the unit tests:</p>
-<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install -r requirements-test.txt
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span>pip install -e <span class="s1">&#39;.[tests]&#39;</span>
 pytest -vs test/unit/
 </pre></div>
 </div>
diff --git a/getting-started/tutorials/01-vector-add.html b/getting-started/tutorials/01-vector-add.html
index 590baf2ac..d5d556357 100644
--- a/getting-started/tutorials/01-vector-add.html
+++ b/getting-started/tutorials/01-vector-add.html
@@ -104,6 +104,7 @@
 <li class="toctree-l2"><a class="reference internal" href="02-fused-softmax.html">Fused Softmax</a></li>
 <li class="toctree-l2"><a class="reference internal" href="03-matrix-multiplication.html">Matrix Multiplication</a></li>
 <li class="toctree-l2"><a class="reference internal" href="04-low-memory-dropout.html">Low-Memory Dropout</a></li>
+<li class="toctree-l2"><a class="reference internal" href="05-layer-norm.html">Layer Normalization</a></li>
 </ul>
 </li>
 </ul>
@@ -202,6 +203,7 @@ to download the full example code</p>
 <div class="section" id="compute-kernel">
 <h2>Compute Kernel<a class="headerlink" href="#compute-kernel" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
+
 <span class="kn">import</span> <span class="nn">triton</span>
 <span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
 
@@ -212,9 +214,9 @@ to download the full example code</p>
     <span class="n">y_ptr</span><span class="p">,</span>  <span class="c1"># *Pointer* to second input vector</span>
     <span class="n">output_ptr</span><span class="p">,</span>  <span class="c1"># *Pointer* to output vector</span>
     <span class="n">n_elements</span><span class="p">,</span>  <span class="c1"># Size of the vector</span>
-    <span class="o">**</span><span class="n">meta</span><span class="p">,</span>  <span class="c1"># Optional meta-parameters for the kernel</span>
+    <span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>  <span class="c1"># Number of elements each program should process</span>
+                 <span class="c1"># NOTE: `constexpr` so it can be used as a shape value</span>
 <span class="p">):</span>
-    <span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE&#39;</span><span class="p">]</span>  <span class="c1"># How many inputs each program should process</span>
     <span class="c1"># There are multiple &#39;program&#39;s processing different data. We identify which program</span>
     <span class="c1"># we are here</span>
     <span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>  <span class="c1"># We use a 1D launch grid so axis is 0</span>
@@ -226,7 +228,7 @@ to download the full example code</p>
     <span class="n">offsets</span> <span class="o">=</span> <span class="n">block_start</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="p">)</span>
     <span class="c1"># Create a mask to guard memory operations against out-of-bounds accesses</span>
     <span class="n">mask</span> <span class="o">=</span> <span class="n">offsets</span> <span class="o">&lt;</span> <span class="n">n_elements</span>
-    <span class="c1"># Load x and y from DRAM, masking out any extar elements in case the input is not a</span>
+    <span class="c1"># Load x and y from DRAM, masking out any extra elements in case the input is not a</span>
     <span class="c1"># multiple of the block size</span>
     <span class="n">x</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">x_ptr</span> <span class="o">+</span> <span class="n">offsets</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
     <span class="n">y</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">y_ptr</span> <span class="o">+</span> <span class="n">offsets</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
@@ -250,7 +252,7 @@ and (2) enqueue the above kernel with appropriate grid/block sizes.</p>
     <span class="c1">#  - each torch.tensor object is implicitly converted into a pointer to its first element.</span>
     <span class="c1">#  - `triton.jit`&#39;ed functions can be index with a launch grid to obtain a callable GPU kernel</span>
     <span class="c1">#  - don&#39;t forget to pass meta-parameters as keywords arguments</span>
-    <span class="n">pgm</span> <span class="o">=</span> <span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
+    <span class="n">add_kernel</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
     <span class="c1"># We return a handle to z but, since `torch.cuda.synchronize()` hasn&#39;t been called, the kernel is still</span>
     <span class="c1"># running asynchronously at this point.</span>
     <span class="k">return</span> <span class="n">output</span>
@@ -326,19 +328,19 @@ for different problem sizes.</p>
 3       32768.0   76.800002   76.800002
 4       65536.0  127.999995  127.999995
 5      131072.0  219.428568  219.428568
-6      262144.0  341.333321  384.000001
+6      262144.0  341.333321  341.333321
 7      524288.0  472.615390  472.615390
 8     1048576.0  614.400016  614.400016
 9     2097152.0  722.823517  722.823517
 10    4194304.0  780.190482  780.190482
 11    8388608.0  812.429770  812.429770
 12   16777216.0  833.084721  833.084721
-13   33554432.0  842.004273  843.811163
+13   33554432.0  842.004273  842.004273
 14   67108864.0  847.448255  848.362445
 15  134217728.0  849.737435  850.656574
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  52.411 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  50.312 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-01-vector-add-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/62d97d49a32414049819dd8bb8378080/01-vector-add.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">01-vector-add.py</span></code></a></p>
diff --git a/getting-started/tutorials/02-fused-softmax.html b/getting-started/tutorials/02-fused-softmax.html
index cd09a25fa..477e63f0b 100644
--- a/getting-started/tutorials/02-fused-softmax.html
+++ b/getting-started/tutorials/02-fused-softmax.html
@@ -107,6 +107,7 @@
 </li>
 <li class="toctree-l2"><a class="reference internal" href="03-matrix-multiplication.html">Matrix Multiplication</a></li>
 <li class="toctree-l2"><a class="reference internal" href="04-low-memory-dropout.html">Low-Memory Dropout</a></li>
+<li class="toctree-l2"><a class="reference internal" href="05-layer-norm.html">Layer Normalization</a></li>
 </ul>
 </li>
 </ul>
@@ -210,6 +211,9 @@ You will learn about:</p>
 Let us consider instead the case of a simple (numerically stabilized) softmax operation:</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
 
+<span class="kn">import</span> <span class="nn">triton</span>
+<span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
+
 
 <span class="nd">@torch</span><span class="o">.</span><span class="n">jit</span><span class="o">.</span><span class="n">script</span>
 <span class="k">def</span> <span class="nf">naive_softmax</span><span class="p">(</span><span class="n">x</span><span class="p">):</span>
@@ -248,17 +252,13 @@ normalizes it and writes back the result to the output Y.
 Note that one important limitation of Triton is that each block must have a
 power-of-two number of elements, so we need to internally “pad” each row and guard the
 memory operations properly if we want to handle any possible input shapes:</p>
-<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">triton</span>
-<span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
-
-
-<span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
 <span class="k">def</span> <span class="nf">softmax_kernel</span><span class="p">(</span>
-    <span class="n">output_ptr</span><span class="p">,</span> <span class="n">input_ptr</span><span class="p">,</span> <span class="n">input_row_stride</span><span class="p">,</span> <span class="n">output_row_stride</span><span class="p">,</span> <span class="n">n_cols</span><span class="p">,</span> <span class="o">**</span><span class="n">meta</span>
+    <span class="n">output_ptr</span><span class="p">,</span> <span class="n">input_ptr</span><span class="p">,</span> <span class="n">input_row_stride</span><span class="p">,</span> <span class="n">output_row_stride</span><span class="p">,</span> <span class="n">n_cols</span><span class="p">,</span>
+    <span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span>
 <span class="p">):</span>
     <span class="c1"># The rows of the softmax are independent, so we parallelize across those</span>
     <span class="n">row_idx</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
-    <span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE&#39;</span><span class="p">]</span>
     <span class="c1"># The stride represents how much we need to increase the pointer to advance 1 row</span>
     <span class="n">row_start_ptr</span> <span class="o">=</span> <span class="n">input_ptr</span> <span class="o">+</span> <span class="n">row_idx</span> <span class="o">*</span> <span class="n">input_row_stride</span>
     <span class="c1"># The block size is the next power of two greater than n_cols, so we can fit each</span>
@@ -318,11 +318,7 @@ This will allow us to verify that our padding mechanism works.</p>
 <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="mi">1823</span><span class="p">,</span> <span class="mi">781</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
 <span class="n">y_triton</span> <span class="o">=</span> <span class="n">softmax</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
 <span class="n">y_torch</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">softmax</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
-<span class="nb">print</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">y_triton</span><span class="p">,</span> <span class="n">y_torch</span><span class="p">))</span>
-</pre></div>
-</div>
-<p class="sphx-glr-script-out">Out:</p>
-<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>True
+<span class="k">assert</span> <span class="n">torch</span><span class="o">.</span><span class="n">allclose</span><span class="p">(</span><span class="n">y_triton</span><span class="p">,</span> <span class="n">y_torch</span><span class="p">),</span> <span class="p">(</span><span class="n">y_triton</span><span class="p">,</span> <span class="n">y_torch</span><span class="p">)</span>
 </pre></div>
 </div>
 <p>As expected, the results are identical.</p>
@@ -373,17 +369,17 @@ We will then compare its performance against (1) <code class="code docutils lite
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>softmax-performance:
           N      Triton  Torch (native)  Torch (jit)
-0     256.0  512.000001      546.133347   188.321838
-1     384.0  585.142862      585.142862   151.703707
-2     512.0  655.360017      606.814814   154.566038
+0     256.0  546.133347      512.000001   190.511628
+1     384.0  614.400016      558.545450   153.600004
+2     512.0  655.360017      585.142849   154.566038
 3     640.0  682.666684      640.000002   160.000000
-4     768.0  702.171410      646.736871   163.839992
+4     768.0  722.823517      664.216187   162.754967
 ..      ...         ...             ...          ...
-93  12160.0  810.666687      405.755985   199.038365
-94  12288.0  812.429770      415.661740   199.197579
-95  12416.0  809.189387      412.149375   198.854847
-96  12544.0  807.661970      412.971190   199.012395
-97  12672.0  807.776923      412.097543   199.167004
+93  12160.0  814.058574      406.179533   198.530610
+94  12288.0  814.111783      415.661740   198.694297
+95  12416.0  814.163950      412.149375   198.457532
+96  12544.0  814.214963      412.546756   198.716830
+97  12672.0  814.265046      412.097543   198.679085
 
 [98 rows x 4 columns]
 </pre></div>
@@ -396,7 +392,7 @@ We will then compare its performance against (1) <code class="code docutils lite
 Note however that the PyTorch <cite>softmax</cite> operation is more general and will works on tensors of any shape.</p></li>
 </ul>
 </div></blockquote>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  26.243 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 3 minutes  22.431 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-02-fused-softmax-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d91442ac2982c4e0cc3ab0f43534afbc/02-fused-softmax.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">02-fused-softmax.py</span></code></a></p>
diff --git a/getting-started/tutorials/03-matrix-multiplication.html b/getting-started/tutorials/03-matrix-multiplication.html
index f1990e3c9..2bc388e30 100644
--- a/getting-started/tutorials/03-matrix-multiplication.html
+++ b/getting-started/tutorials/03-matrix-multiplication.html
@@ -114,6 +114,7 @@
 </ul>
 </li>
 <li class="toctree-l2"><a class="reference internal" href="04-low-memory-dropout.html">Low-Memory Dropout</a></li>
+<li class="toctree-l2"><a class="reference internal" href="05-layer-norm.html">Layer Normalization</a></li>
 </ul>
 </li>
 </ul>
@@ -332,6 +333,7 @@ more than 10% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).</
 <div class="section" id="final-result">
 <h2>Final Result<a class="headerlink" href="#final-result" title="Permalink to this headline">¶</a></h2>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
+
 <span class="kn">import</span> <span class="nn">triton</span>
 <span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
 
@@ -343,23 +345,22 @@ more than 10% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).</
 <span class="c1">#   - An autotuning *key* whose change in values will trigger evaluation of all the</span>
 <span class="c1">#       provided configs</span>
 
+
 <span class="nd">@triton</span><span class="o">.</span><span class="n">autotune</span><span class="p">(</span>
     <span class="n">configs</span><span class="o">=</span><span class="p">[</span>
         <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">8</span><span class="p">),</span>
         <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">3</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">8</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span>  <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">256</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
         <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">32</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">32</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
-        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">32</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span> <span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
+        <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">({</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">:</span> <span class="mi">64</span><span class="p">,</span> <span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">:</span> <span class="mi">32</span><span class="p">,</span> <span class="s1">&#39;GROUP_SIZE_M&#39;</span><span class="p">:</span> <span class="mi">8</span><span class="p">},</span> <span class="n">num_stages</span><span class="o">=</span><span class="mi">5</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">2</span><span class="p">),</span>
     <span class="p">],</span>
     <span class="n">key</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;M&#39;</span><span class="p">,</span> <span class="s1">&#39;N&#39;</span><span class="p">,</span> <span class="s1">&#39;K&#39;</span><span class="p">],</span>
 <span class="p">)</span>
-<span class="c1"># %</span>
-<span class="c1"># We can now define our kernel as normal, using all the techniques presented above</span>
 <span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
 <span class="k">def</span> <span class="nf">matmul_kernel</span><span class="p">(</span>
     <span class="c1"># Pointers to matrices</span>
@@ -373,17 +374,13 @@ more than 10% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).</
     <span class="n">stride_bk</span><span class="p">,</span> <span class="n">stride_bn</span><span class="p">,</span>
     <span class="n">stride_cm</span><span class="p">,</span> <span class="n">stride_cn</span><span class="p">,</span>
     <span class="c1"># Meta-parameters</span>
-    <span class="o">**</span><span class="n">meta</span><span class="p">,</span>
+    <span class="n">BLOCK_SIZE_M</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="n">BLOCK_SIZE_K</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>
+    <span class="n">GROUP_SIZE_M</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>
+    <span class="n">ACTIVATION</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>
 <span class="p">):</span>
     <span class="sd">&quot;&quot;&quot;Kernel for computing the matmul C = A x B.</span>
 <span class="sd">    A has shape (M, K), B has shape (K, N) and C has shape (M, N)</span>
 <span class="sd">    &quot;&quot;&quot;</span>
-    <span class="c1"># extract meta-parameters</span>
-    <span class="n">BLOCK_SIZE_M</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE_M&#39;</span><span class="p">]</span>
-    <span class="n">BLOCK_SIZE_N</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">]</span>
-    <span class="n">BLOCK_SIZE_K</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE_K&#39;</span><span class="p">]</span>
-    <span class="n">GROUP_SIZE_M</span> <span class="o">=</span> <span class="mi">8</span>
-
     <span class="c1"># -----------------------------------------------------------</span>
     <span class="c1"># Map program ids `pid` to the block of C it should compute.</span>
     <span class="c1"># This is done in a grouped ordering to promote L2 data reuse</span>
@@ -408,8 +405,8 @@ more than 10% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).</
     <span class="n">offs_am</span> <span class="o">=</span> <span class="n">pid_m</span> <span class="o">*</span> <span class="n">BLOCK_SIZE_M</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_M</span><span class="p">)</span>
     <span class="n">offs_bn</span> <span class="o">=</span> <span class="n">pid_n</span> <span class="o">*</span> <span class="n">BLOCK_SIZE_N</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">)</span>
     <span class="n">offs_k</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_K</span><span class="p">)</span>
-    <span class="n">a_ptrs</span> <span class="o">=</span> <span class="n">a_ptr</span> <span class="o">+</span> <span class="p">(</span><span class="n">offs_am</span><span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span><span class="o">*</span><span class="n">stride_am</span> <span class="o">+</span> <span class="n">offs_k</span> <span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span><span class="o">*</span><span class="n">stride_ak</span><span class="p">)</span>
-    <span class="n">b_ptrs</span> <span class="o">=</span> <span class="n">b_ptr</span> <span class="o">+</span> <span class="p">(</span><span class="n">offs_k</span> <span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span><span class="o">*</span><span class="n">stride_bk</span> <span class="o">+</span> <span class="n">offs_bn</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span><span class="o">*</span><span class="n">stride_bn</span><span class="p">)</span>
+    <span class="n">a_ptrs</span> <span class="o">=</span> <span class="n">a_ptr</span> <span class="o">+</span> <span class="p">(</span><span class="n">offs_am</span><span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">stride_am</span> <span class="o">+</span> <span class="n">offs_k</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span> <span class="o">*</span> <span class="n">stride_ak</span><span class="p">)</span>
+    <span class="n">b_ptrs</span> <span class="o">=</span> <span class="n">b_ptr</span> <span class="o">+</span> <span class="p">(</span><span class="n">offs_k</span><span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">stride_bk</span> <span class="o">+</span> <span class="n">offs_bn</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span> <span class="o">*</span> <span class="n">stride_bn</span><span class="p">)</span>
 
     <span class="c1"># -----------------------------------------------------------</span>
     <span class="c1"># Iterate to compute a block of the C matrix</span>
@@ -430,9 +427,9 @@ more than 10% on some hardware architecture (e.g., 220 to 245 TFLOPS on A100).</
         <span class="n">a_ptrs</span> <span class="o">+=</span> <span class="n">BLOCK_SIZE_K</span> <span class="o">*</span> <span class="n">stride_ak</span>
         <span class="n">b_ptrs</span> <span class="o">+=</span> <span class="n">BLOCK_SIZE_K</span> <span class="o">*</span> <span class="n">stride_bk</span>
     <span class="c1"># you can fuse arbitrary activation functions here</span>
-    <span class="c1"># while the accumulator is still in FP32 !</span>
-    <span class="k">if</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;ACTIVATION&#39;</span><span class="p">]:</span>
-        <span class="n">accumulator</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;ACTIVATION&#39;</span><span class="p">](</span><span class="n">accumulator</span><span class="p">)</span>
+    <span class="c1"># while the accumulator is still in FP32!</span>
+    <span class="k">if</span> <span class="n">ACTIVATION</span><span class="p">:</span>
+        <span class="n">accumulator</span> <span class="o">=</span> <span class="n">ACTIVATION</span><span class="p">(</span><span class="n">accumulator</span><span class="p">)</span>
     <span class="n">c</span> <span class="o">=</span> <span class="n">accumulator</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">tl</span><span class="o">.</span><span class="n">float16</span><span class="p">)</span>
 
     <span class="c1"># -----------------------------------------------------------</span>
@@ -568,41 +565,41 @@ torch_output=tensor([[  1.1045, -36.9688,  31.4688,  ..., -11.3906,  24.4531, -3
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>matmul-performance:
          M     cuBLAS  ...     Triton  Triton (+ LeakyReLU)
 0    256.0   2.730667  ...   2.978909              2.978909
-1    384.0   7.372800  ...   8.507077              8.507077
-2    512.0  14.563555  ...  16.384000             16.384000
+1    384.0   7.372800  ...   8.507077              7.899428
+2    512.0  14.563555  ...  15.420235             16.384000
 3    640.0  22.260869  ...  24.380953             24.380953
 4    768.0  32.768000  ...  34.028308             34.028308
-5    896.0  37.971025  ...  40.140799             39.025776
-6   1024.0  49.932191  ...  53.773130             52.428801
-7   1152.0  44.566925  ...  46.656000             46.656000
+5    896.0  37.971025  ...  39.025776             39.025776
+6   1024.0  49.932191  ...  52.428801             52.428801
+7   1152.0  45.242181  ...  46.656000             46.656000
 8   1280.0  51.200001  ...  56.888887             56.109587
-9   1408.0  64.138541  ...  67.305878             67.305878
-10  1536.0  80.430545  ...  79.526831             79.526831
-11  1664.0  63.372618  ...  62.492442             62.492442
-12  1792.0  72.983276  ...  72.047592             72.047592
-13  1920.0  69.120002  ...  70.172588             70.172588
-14  2048.0  73.908442  ...  76.959706             76.608294
-15  2176.0  83.500614  ...  86.367588             85.632545
-16  2304.0  68.446623  ...  76.809875             76.809875
-17  2432.0  71.305746  ...  74.918570             85.393507
-18  2560.0  78.019048  ...  80.908642             80.709358
-19  2688.0  83.552988  ...  89.149366             89.464755
-20  2816.0  82.602666  ...  83.074685             83.233226
-21  2944.0  82.646820  ...  82.373605             82.784108
-22  3072.0  81.943708  ...  88.612060             87.516392
-23  3200.0  78.914919  ...  91.822093             93.567248
-24  3328.0  81.530349  ...  84.003845             84.496824
-25  3456.0  82.519518  ...  91.200871             90.943675
-26  3584.0  85.633710  ...  94.847460             96.579370
-27  3712.0  85.528545  ...  85.019017             87.706180
-28  3840.0  81.980725  ...  86.130841             91.247522
-29  3968.0  85.993854  ...  91.747320             86.053553
-30  4096.0  93.727466  ...  88.417474             84.307617
+9   1408.0  64.138541  ...  66.485074             65.684049
+10  1536.0  79.526831  ...  79.526831             78.643199
+11  1664.0  62.929456  ...  62.061463             62.061463
+12  1792.0  72.983276  ...  72.047592             71.588687
+13  1920.0  68.776119  ...  70.172588             69.818184
+14  2048.0  73.262953  ...  76.959706             76.608294
+15  2176.0  83.500614  ...  85.998493             85.269692
+16  2304.0  68.446623  ...  76.319081             75.834511
+17  2432.0  71.125224  ...  82.509438             84.877538
+18  2560.0  77.833728  ...  80.709358             81.108913
+19  2688.0  83.369354  ...  89.676257             89.464755
+20  2816.0  83.233226  ...  82.446516             81.827785
+21  2944.0  82.373605  ...  82.373605             81.298583
+22  3072.0  82.062468  ...  88.060814             88.473602
+23  3200.0  82.368085  ...  89.761569             94.955488
+24  3328.0  80.889094  ...  80.527177             82.939284
+25  3456.0  81.849303  ...  86.783176             91.304157
+26  3584.0  87.042978  ...  98.375705             90.364394
+27  3712.0  79.726532  ...  90.815768             85.820159
+28  3840.0  82.592983  ...  88.191387             91.398346
+29  3968.0  85.873762  ...  90.656713             83.867052
+30  4096.0  92.563952  ...  82.441739             82.291681
 
 [31 rows x 5 columns]
 </pre></div>
 </div>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 5 minutes  41.070 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 6 minutes  5.923 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-03-matrix-multiplication-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/d5fee5b55a64e47f1b5724ec39adf171/03-matrix-multiplication.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">03-matrix-multiplication.py</span></code></a></p>
diff --git a/getting-started/tutorials/04-low-memory-dropout.html b/getting-started/tutorials/04-low-memory-dropout.html
index 8223753fa..64a4c6624 100644
--- a/getting-started/tutorials/04-low-memory-dropout.html
+++ b/getting-started/tutorials/04-low-memory-dropout.html
@@ -47,7 +47,7 @@
     
     <link rel="index" title="Index" href="../../genindex.html" />
     <link rel="search" title="Search" href="../../search.html" />
-    <link rel="next" title="triton" href="../../python-api/triton.html" />
+    <link rel="next" title="Layer Normalization" href="05-layer-norm.html" />
     <link rel="prev" title="Matrix Multiplication" href="03-matrix-multiplication.html" /> 
 </head>
 
@@ -107,6 +107,7 @@
 <li class="toctree-l3"><a class="reference internal" href="#references">References</a></li>
 </ul>
 </li>
+<li class="toctree-l2"><a class="reference internal" href="05-layer-norm.html">Layer Normalization</a></li>
 </ul>
 </li>
 </ul>
@@ -217,19 +218,20 @@ keeps the norm consistent regardless of the dropout probability.</p>
 <p>Let’s first take a look at the baseline implementation.</p>
 <div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">tabulate</span>
 <span class="kn">import</span> <span class="nn">torch</span>
+
 <span class="kn">import</span> <span class="nn">triton</span>
 <span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
 
+
 <span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
 <span class="k">def</span> <span class="nf">_dropout</span><span class="p">(</span>
-        <span class="n">x_ptr</span><span class="p">,</span> <span class="c1"># pointer to the input</span>
-        <span class="n">x_keep_ptr</span><span class="p">,</span> <span class="c1"># pointer to a mask of 0s and 1s</span>
-        <span class="n">output_ptr</span><span class="p">,</span> <span class="c1"># pointer to the output</span>
-        <span class="n">n_elements</span><span class="p">,</span> <span class="c1"># number of elements in the `x` tensor</span>
-        <span class="n">p</span><span class="p">,</span> <span class="c1"># probability that an element of `x` is changed to zero</span>
-        <span class="o">**</span><span class="n">meta</span><span class="p">,</span>
+        <span class="n">x_ptr</span><span class="p">,</span>  <span class="c1"># pointer to the input</span>
+        <span class="n">x_keep_ptr</span><span class="p">,</span>  <span class="c1"># pointer to a mask of 0s and 1s</span>
+        <span class="n">output_ptr</span><span class="p">,</span>  <span class="c1"># pointer to the output</span>
+        <span class="n">n_elements</span><span class="p">,</span>  <span class="c1"># number of elements in the `x` tensor</span>
+        <span class="n">p</span><span class="p">,</span>  <span class="c1"># probability that an element of `x` is changed to zero</span>
+        <span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>
 <span class="p">):</span>
-    <span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE&#39;</span><span class="p">]</span>
     <span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
     <span class="n">block_start</span> <span class="o">=</span> <span class="n">pid</span> <span class="o">*</span> <span class="n">BLOCK_SIZE</span>
     <span class="n">offsets</span> <span class="o">=</span> <span class="n">block_start</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="p">)</span>
@@ -251,6 +253,7 @@ keeps the norm consistent regardless of the dropout probability.</p>
     <span class="n">_dropout</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">x</span><span class="p">,</span> <span class="n">x_keep</span><span class="p">,</span> <span class="n">output</span><span class="p">,</span> <span class="n">n_elements</span><span class="p">,</span> <span class="n">p</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="mi">1024</span><span class="p">)</span>
     <span class="k">return</span> <span class="n">output</span>
 
+
 <span class="c1"># Input tensor</span>
 <span class="n">x</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">size</span><span class="o">=</span><span class="p">(</span><span class="mi">10</span><span class="p">,))</span><span class="o">.</span><span class="n">cuda</span><span class="p">()</span>
 <span class="c1"># Dropout mask</span>
@@ -298,10 +301,9 @@ other <a class="reference internal" href="../../python-api/triton.language.html#
         <span class="n">n_elements</span><span class="p">,</span>
         <span class="n">p</span><span class="p">,</span>
         <span class="n">seed</span><span class="p">,</span>
-        <span class="o">**</span><span class="n">meta</span><span class="p">,</span>
+        <span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span>
 <span class="p">):</span>
     <span class="c1"># compute memory offsets of elements handled by this instance</span>
-    <span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE&#39;</span><span class="p">]</span>
     <span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
     <span class="n">block_start</span> <span class="o">=</span> <span class="n">pid</span> <span class="o">*</span> <span class="n">BLOCK_SIZE</span>
     <span class="n">offsets</span> <span class="o">=</span> <span class="n">block_start</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="p">)</span>
@@ -342,9 +344,9 @@ other <a class="reference internal" href="../../python-api/triton.language.html#
 <p class="sphx-glr-script-out">Out:</p>
 <div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>-------------------  ---------  --------  --------  -------  --------  --------  ---------  ---------  ---------  ---------
 input                -0.952835  0.371721  0.408716  1.42142  0.149397  -0.67086  -0.214186  -0.431969  -0.707878  -0.106434
-output (seed = 123)   0         0.743443  0         2.84284  0.298794  -1.34172   0          0          0          0
-output (seed = 123)   0         0.743443  0         2.84284  0.298794  -1.34172   0          0          0          0
-output (seed = 512)  -1.90567   0.743443  0         2.84284  0.298794  -1.34172   0         -0.863938   0         -0.212868
+output (seed = 123)   0         0.743443  0         0        0         -1.34172   0          0         -1.41576   -0.212868
+output (seed = 123)   0         0.743443  0         0        0         -1.34172   0          0         -1.41576   -0.212868
+output (seed = 512)   0         0         0.817432  2.84284  0         -1.34172  -0.428372   0          0          0
 -------------------  ---------  --------  --------  -------  --------  --------  ---------  ---------  ---------  ---------
 </pre></div>
 </div>
@@ -370,7 +372,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
 <dd><p>Nitish Srivastava and Geoffrey Hinton and Alex Krizhevsky and Ilya Sutskever and Ruslan Salakhutdinov, “Dropout: A Simple Way to Prevent Neural Networks from Overfitting”, JMLR 2014</p>
 </dd>
 </dl>
-<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.010 seconds)</p>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 0 minutes  0.477 seconds)</p>
 <div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-04-low-memory-dropout-py">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/c9aed78977a4c05741d675a38dde3d7d/04-low-memory-dropout.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">04-low-memory-dropout.py</span></code></a></p>
@@ -389,7 +391,7 @@ to explore the <cite>triton/language/random</cite> folder!</p>
           </div>
           <footer>
     <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
-        <a href="../../python-api/triton.html" class="btn btn-neutral float-right" title="triton" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
+        <a href="05-layer-norm.html" class="btn btn-neutral float-right" title="Layer Normalization" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
         <a href="03-matrix-multiplication.html" class="btn btn-neutral float-left" title="Matrix Multiplication" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
     </div>
 
diff --git a/getting-started/tutorials/05-layer-norm.html b/getting-started/tutorials/05-layer-norm.html
new file mode 100644
index 000000000..5ab469402
--- /dev/null
+++ b/getting-started/tutorials/05-layer-norm.html
@@ -0,0 +1,550 @@
+
+
+<!DOCTYPE html>
+<html class="writer-html5" lang="en" >
+<head>
+  <meta charset="utf-8" />
+  
+  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+  
+  <title>Layer Normalization &mdash; Triton  documentation</title>
+  
+
+  
+  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/pygments.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/css/theme.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/gallery.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/gallery-binder.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/gallery-dataframe.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/gallery-rendered-html.css" type="text/css" />
+  <link rel="stylesheet" href="../../_static/css/custom.css" type="text/css" />
+
+  
+  
+
+  
+  
+
+  
+
+  
+  <!--[if lt IE 9]>
+    <script src="../../_static/js/html5shiv.min.js"></script>
+  <![endif]-->
+  
+    
+      <script type="text/javascript" id="documentation_options" data-url_root="../../" src="../../_static/documentation_options.js"></script>
+        <script data-url_root="../../" id="documentation_options" src="../../_static/documentation_options.js"></script>
+        <script src="../../_static/jquery.js"></script>
+        <script src="../../_static/underscore.js"></script>
+        <script src="../../_static/doctools.js"></script>
+    
+    <script type="text/javascript" src="../../_static/js/theme.js"></script>
+
+    
+    <link rel="index" title="Index" href="../../genindex.html" />
+    <link rel="search" title="Search" href="../../search.html" />
+    <link rel="next" title="triton" href="../../python-api/triton.html" />
+    <link rel="prev" title="Low-Memory Dropout" href="04-low-memory-dropout.html" /> 
+</head>
+
+<body class="wy-body-for-nav">
+
+   
+  <div class="wy-grid-for-nav">
+    
+    <nav data-toggle="wy-nav-shift" class="wy-nav-side">
+      <div class="wy-side-scroll">
+        <div class="wy-side-nav-search" >
+          
+
+          
+            <a href="../../index.html" class="icon icon-home"> Triton
+          
+
+          
+          </a>
+
+          
+            
+            
+          
+
+          
+<div role="search">
+  <form id="rtd-search-form" class="wy-form" action="../../search.html" method="get">
+    <input type="text" name="q" placeholder="Search docs" />
+    <input type="hidden" name="check_keywords" value="yes" />
+    <input type="hidden" name="area" value="default" />
+  </form>
+</div>
+
+          
+        </div>
+
+        
+        <div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="main navigation">
+          
+            
+            
+              
+            
+            
+              <p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
+<ul class="current">
+<li class="toctree-l1"><a class="reference internal" href="../installation.html">Installation</a></li>
+<li class="toctree-l1 current"><a class="reference internal" href="index.html">Tutorials</a><ul class="current">
+<li class="toctree-l2"><a class="reference internal" href="01-vector-add.html">Vector Addition</a></li>
+<li class="toctree-l2"><a class="reference internal" href="02-fused-softmax.html">Fused Softmax</a></li>
+<li class="toctree-l2"><a class="reference internal" href="03-matrix-multiplication.html">Matrix Multiplication</a></li>
+<li class="toctree-l2"><a class="reference internal" href="04-low-memory-dropout.html">Low-Memory Dropout</a></li>
+<li class="toctree-l2 current"><a class="current reference internal" href="#">Layer Normalization</a></li>
+</ul>
+</li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Python API</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/triton.html">triton</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/triton.language.html">triton.language</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../python-api/triton.testing.html">triton.testing</a></li>
+</ul>
+<p class="caption" role="heading"><span class="caption-text">Programming Guide</span></p>
+<ul>
+<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/chapter-1/introduction.html">Introduction</a></li>
+<li class="toctree-l1"><a class="reference internal" href="../../programming-guide/chapter-2/related-work.html">Related Work</a></li>
+</ul>
+
+            
+          
+        </div>
+        
+      </div>
+    </nav>
+
+    <section data-toggle="wy-nav-shift" class="wy-nav-content-wrap">
+
+      
+      <nav class="wy-nav-top" aria-label="top navigation">
+        
+          <i data-toggle="wy-nav-top" class="fa fa-bars"></i>
+          <a href="../../index.html">Triton</a>
+        
+      </nav>
+
+
+      <div class="wy-nav-content">
+        
+        <div class="rst-content">
+        
+          
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+<div role="navigation" aria-label="breadcrumbs navigation">
+
+  <ul class="wy-breadcrumbs">
+    
+      <li><a href="../../index.html" class="icon icon-home"></a> &raquo;</li>
+        
+          <li><a href="index.html">Tutorials</a> &raquo;</li>
+        
+      <li>Layer Normalization</li>
+    
+    
+      <li class="wy-breadcrumbs-aside">
+        
+          
+            <a href="../../_sources/getting-started/tutorials/05-layer-norm.rst.txt" rel="nofollow"> View page source</a>
+          
+        
+      </li>
+    
+  </ul>
+
+  
+  <hr/>
+</div>
+          <div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
+           <div itemprop="articleBody">
+            
+  <div class="sphx-glr-download-link-note admonition note">
+<p class="admonition-title">Note</p>
+<p>Click <a class="reference internal" href="#sphx-glr-download-getting-started-tutorials-05-layer-norm-py"><span class="std std-ref">here</span></a>
+to download the full example code</p>
+</div>
+<div class="sphx-glr-example-title section" id="layer-normalization">
+<span id="sphx-glr-getting-started-tutorials-05-layer-norm-py"></span><h1>Layer Normalization<a class="headerlink" href="#layer-normalization" title="Permalink to this headline">¶</a></h1>
+<img alt="05 layer norm" class="sphx-glr-single-img" src="../../_images/sphx_glr_05-layer-norm_001.png" />
+<p class="sphx-glr-script-out">Out:</p>
+<div class="sphx-glr-script-out highlight-none notranslate"><div class="highlight"><pre><span></span>layer-norm-backward:
+          N      Triton       Torch
+0    1024.0  311.088617   99.497980
+1    1536.0  347.773587  133.565214
+2    2048.0  420.102553  162.217818
+3    2560.0  455.111129  182.857144
+4    3072.0  511.999982  191.501303
+5    3584.0  551.384634  208.271186
+6    4096.0  568.231237  220.907859
+7    4608.0  502.690905  232.336141
+8    5120.0  527.381977  243.326731
+9    5632.0  540.671974  243.545956
+10   6144.0  544.118087  249.081070
+11   6656.0  528.953642  256.410903
+12   7168.0  507.469040  262.243907
+13   7680.0  481.253256  261.076480
+14   8192.0  461.521112  269.326017
+15   8704.0  417.791980  268.159180
+16   9216.0  431.157889  273.404206
+17   9728.0  442.181815  280.953074
+18  10240.0  448.467168  286.767793
+19  10752.0  427.231788  246.699797
+20  11264.0  427.071098  245.313973
+21  11776.0  420.571432  249.447482
+22  12288.0  420.102570  254.673582
+23  12800.0  414.016170  253.674644
+24  13312.0  410.652963  252.759501
+25  13824.0  403.620451  257.390218
+26  14336.0  396.387109  254.862216
+27  14848.0  382.351933  257.293872
+28  15360.0  374.253788  257.790220
+29  15872.0  368.046389  262.890274
+</pre></div>
+</div>
+<div class="line-block">
+<div class="line"><br /></div>
+</div>
+<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="kn">import</span> <span class="nn">torch</span>
+
+<span class="kn">import</span> <span class="nn">triton</span>
+<span class="kn">import</span> <span class="nn">triton.language</span> <span class="k">as</span> <span class="nn">tl</span>
+
+<span class="k">try</span><span class="p">:</span>
+    <span class="c1"># This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it</span>
+    <span class="c1"># should not be added to extras_require in setup.py.</span>
+    <span class="kn">import</span> <span class="nn">apex</span>
+    <span class="n">HAS_APEX</span> <span class="o">=</span> <span class="kc">True</span>
+<span class="k">except</span> <span class="ne">ModuleNotFoundError</span><span class="p">:</span>
+    <span class="n">HAS_APEX</span> <span class="o">=</span> <span class="kc">False</span>
+
+
+<span class="c1"># Forward Pass</span>
+<span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
+<span class="k">def</span> <span class="nf">_layer_norm_fwd_fused</span><span class="p">(</span><span class="n">X</span><span class="p">,</span> <span class="n">Y</span><span class="p">,</span> <span class="n">W</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">M</span><span class="p">,</span> <span class="n">V</span><span class="p">,</span> <span class="n">stride</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">eps</span><span class="p">,</span>
+                          <span class="n">BLOCK_SIZE</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">):</span>
+    <span class="c1"># position of elements processed by this program</span>
+    <span class="n">row</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">cols</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE</span><span class="p">)</span>
+    <span class="n">mask</span> <span class="o">=</span> <span class="n">cols</span> <span class="o">&lt;</span> <span class="n">N</span>
+    <span class="c1"># offset data pointers to start at the row of interest</span>
+    <span class="n">X</span> <span class="o">+=</span> <span class="n">row</span> <span class="o">*</span> <span class="n">stride</span>
+    <span class="n">Y</span> <span class="o">+=</span> <span class="n">row</span> <span class="o">*</span> <span class="n">stride</span>
+    <span class="c1"># load data and cast to float32</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">X</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">,</span> <span class="n">other</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="c1"># compute mean</span>
+    <span class="n">mean</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="o">/</span> <span class="n">N</span>
+    <span class="c1"># compute std</span>
+    <span class="n">xmean</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">x</span> <span class="o">-</span> <span class="n">mean</span><span class="p">,</span> <span class="mf">0.</span><span class="p">)</span>
+    <span class="n">var</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">xmean</span> <span class="o">*</span> <span class="n">xmean</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="o">/</span> <span class="n">N</span>
+    <span class="n">rstd</span> <span class="o">=</span> <span class="mi">1</span> <span class="o">/</span> <span class="n">tl</span><span class="o">.</span><span class="n">sqrt</span><span class="p">(</span><span class="n">var</span> <span class="o">+</span> <span class="n">eps</span><span class="p">)</span>
+    <span class="n">xhat</span> <span class="o">=</span> <span class="n">xmean</span> <span class="o">*</span> <span class="n">rstd</span>
+    <span class="c1"># write-back mean/rstd</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">M</span> <span class="o">+</span> <span class="n">row</span><span class="p">,</span> <span class="n">mean</span><span class="p">)</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">V</span> <span class="o">+</span> <span class="n">row</span><span class="p">,</span> <span class="n">rstd</span><span class="p">)</span>
+    <span class="c1"># multiply by weight and add bias</span>
+    <span class="n">w</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">W</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="n">b</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">B</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="n">y</span> <span class="o">=</span> <span class="n">xhat</span> <span class="o">*</span> <span class="n">w</span> <span class="o">+</span> <span class="n">b</span>
+    <span class="c1"># write-back</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">Y</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+
+
+<span class="c1"># Backward pass (DX + partial DW + partial DB)</span>
+<span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
+<span class="k">def</span> <span class="nf">_layer_norm_bwd_dx_fused</span><span class="p">(</span><span class="n">DX</span><span class="p">,</span> <span class="n">DY</span><span class="p">,</span> <span class="n">DW</span><span class="p">,</span> <span class="n">DB</span><span class="p">,</span> <span class="n">X</span><span class="p">,</span> <span class="n">W</span><span class="p">,</span> <span class="n">B</span><span class="p">,</span> <span class="n">M</span><span class="p">,</span> <span class="n">V</span><span class="p">,</span> <span class="n">Lock</span><span class="p">,</span> <span class="n">stride</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">eps</span><span class="p">,</span>
+                             <span class="n">GROUP_SIZE_M</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">):</span>
+    <span class="c1"># position of elements processed by this program</span>
+    <span class="n">row</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">cols</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">)</span>
+    <span class="n">mask</span> <span class="o">=</span> <span class="n">cols</span> <span class="o">&lt;</span> <span class="n">N</span>
+    <span class="c1"># offset data pointers to start at the row of interest</span>
+    <span class="n">X</span> <span class="o">+=</span> <span class="n">row</span> <span class="o">*</span> <span class="n">stride</span>
+    <span class="n">DY</span> <span class="o">+=</span> <span class="n">row</span> <span class="o">*</span> <span class="n">stride</span>
+    <span class="n">DX</span> <span class="o">+=</span> <span class="n">row</span> <span class="o">*</span> <span class="n">stride</span>
+    <span class="c1"># offset locks and weight/bias gradient pointer</span>
+    <span class="c1"># each kernel instance accumulates partial sums for</span>
+    <span class="c1"># DW and DB into one of GROUP_SIZE_M independent buffers</span>
+    <span class="c1"># these buffers stay in the L2, which allow this kernel</span>
+    <span class="c1"># to be fast</span>
+    <span class="n">lock_id</span> <span class="o">=</span> <span class="n">row</span> <span class="o">%</span> <span class="n">GROUP_SIZE_M</span>
+    <span class="n">Lock</span> <span class="o">+=</span> <span class="n">lock_id</span>
+    <span class="n">Count</span> <span class="o">=</span> <span class="n">Lock</span> <span class="o">+</span> <span class="n">GROUP_SIZE_M</span>
+    <span class="n">DW</span> <span class="o">=</span> <span class="n">DW</span> <span class="o">+</span> <span class="n">lock_id</span> <span class="o">*</span> <span class="n">N</span> <span class="o">+</span> <span class="n">cols</span>
+    <span class="n">DB</span> <span class="o">=</span> <span class="n">DB</span> <span class="o">+</span> <span class="n">lock_id</span> <span class="o">*</span> <span class="n">N</span> <span class="o">+</span> <span class="n">cols</span>
+    <span class="c1"># load data to SRAM</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">X</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">,</span> <span class="n">other</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="n">dy</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">DY</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">,</span> <span class="n">other</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="n">w</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">W</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="n">mean</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">M</span> <span class="o">+</span> <span class="n">row</span><span class="p">)</span>
+    <span class="n">rstd</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">V</span> <span class="o">+</span> <span class="n">row</span><span class="p">)</span>
+    <span class="c1"># compute dx</span>
+    <span class="n">xhat</span> <span class="o">=</span> <span class="p">(</span><span class="n">x</span> <span class="o">-</span> <span class="n">mean</span><span class="p">)</span> <span class="o">*</span> <span class="n">rstd</span>
+    <span class="n">wdy</span> <span class="o">=</span> <span class="n">w</span> <span class="o">*</span> <span class="n">dy</span>
+    <span class="n">xhat</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">xhat</span><span class="p">,</span> <span class="mf">0.</span><span class="p">)</span>
+    <span class="n">wdy</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">where</span><span class="p">(</span><span class="n">mask</span><span class="p">,</span> <span class="n">wdy</span><span class="p">,</span> <span class="mf">0.</span><span class="p">)</span>
+    <span class="n">mean1</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">xhat</span> <span class="o">*</span> <span class="n">wdy</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="o">/</span> <span class="n">N</span>
+    <span class="n">mean2</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">wdy</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span> <span class="o">/</span> <span class="n">N</span>
+    <span class="n">dx</span> <span class="o">=</span> <span class="p">(</span><span class="n">wdy</span> <span class="o">-</span> <span class="p">(</span><span class="n">xhat</span> <span class="o">*</span> <span class="n">mean1</span> <span class="o">+</span> <span class="n">mean2</span><span class="p">))</span> <span class="o">*</span> <span class="n">rstd</span>
+    <span class="c1"># write-back dx</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">DX</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">dx</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="c1"># accumulate partial sums for dw/db</span>
+    <span class="n">partial_dw</span> <span class="o">=</span> <span class="p">(</span><span class="n">dy</span> <span class="o">*</span> <span class="n">xhat</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">w</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
+    <span class="n">partial_db</span> <span class="o">=</span> <span class="p">(</span><span class="n">dy</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">w</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
+    <span class="k">while</span> <span class="n">tl</span><span class="o">.</span><span class="n">atomic_cas</span><span class="p">(</span><span class="n">Lock</span><span class="p">,</span> <span class="mi">0</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span> <span class="o">==</span> <span class="mi">1</span><span class="p">:</span>
+        <span class="k">pass</span>
+    <span class="n">count</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">Count</span><span class="p">)</span>
+    <span class="c1"># first store doesn&#39;t accumulate</span>
+    <span class="k">if</span> <span class="n">count</span> <span class="o">==</span> <span class="mi">0</span><span class="p">:</span>
+        <span class="n">tl</span><span class="o">.</span><span class="n">atomic_xchg</span><span class="p">(</span><span class="n">Count</span><span class="p">,</span> <span class="mi">1</span><span class="p">)</span>
+    <span class="k">else</span><span class="p">:</span>
+        <span class="n">partial_dw</span> <span class="o">+=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">DW</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+        <span class="n">partial_db</span> <span class="o">+=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">DB</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">DW</span><span class="p">,</span> <span class="n">partial_dw</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">DB</span><span class="p">,</span> <span class="n">partial_db</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">)</span>
+    <span class="c1"># release lock</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">atomic_xchg</span><span class="p">(</span><span class="n">Lock</span><span class="p">,</span> <span class="mi">0</span><span class="p">)</span>
+
+<span class="c1"># Backward pass (total DW + total DB)</span>
+
+
+<span class="nd">@triton</span><span class="o">.</span><span class="n">jit</span>
+<span class="k">def</span> <span class="nf">_layer_norm_bwd_dwdb</span><span class="p">(</span><span class="n">DW</span><span class="p">,</span> <span class="n">DB</span><span class="p">,</span> <span class="n">FINAL_DW</span><span class="p">,</span> <span class="n">FINAL_DB</span><span class="p">,</span> <span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span>
+                         <span class="n">BLOCK_SIZE_M</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">:</span> <span class="n">tl</span><span class="o">.</span><span class="n">constexpr</span><span class="p">):</span>
+    <span class="n">pid</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">program_id</span><span class="p">(</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">cols</span> <span class="o">=</span> <span class="n">pid</span> <span class="o">*</span> <span class="n">BLOCK_SIZE_N</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">)</span>
+    <span class="n">dw</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">BLOCK_SIZE_M</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="n">db</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">zeros</span><span class="p">((</span><span class="n">BLOCK_SIZE_M</span><span class="p">,</span> <span class="n">BLOCK_SIZE_N</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">tl</span><span class="o">.</span><span class="n">float32</span><span class="p">)</span>
+    <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">M</span><span class="p">,</span> <span class="n">BLOCK_SIZE_M</span><span class="p">):</span>
+        <span class="n">rows</span> <span class="o">=</span> <span class="n">i</span> <span class="o">+</span> <span class="n">tl</span><span class="o">.</span><span class="n">arange</span><span class="p">(</span><span class="mi">0</span><span class="p">,</span> <span class="n">BLOCK_SIZE_M</span><span class="p">)</span>
+        <span class="n">mask</span> <span class="o">=</span> <span class="p">(</span><span class="n">rows</span><span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">&lt;</span> <span class="n">M</span><span class="p">)</span> <span class="o">&amp;</span> <span class="p">(</span><span class="n">cols</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span> <span class="o">&lt;</span> <span class="n">N</span><span class="p">)</span>
+        <span class="n">offs</span> <span class="o">=</span> <span class="n">rows</span><span class="p">[:,</span> <span class="kc">None</span><span class="p">]</span> <span class="o">*</span> <span class="n">N</span> <span class="o">+</span> <span class="n">cols</span><span class="p">[</span><span class="kc">None</span><span class="p">,</span> <span class="p">:]</span>
+        <span class="n">dw</span> <span class="o">+=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">DW</span> <span class="o">+</span> <span class="n">offs</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">,</span> <span class="n">other</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
+        <span class="n">db</span> <span class="o">+=</span> <span class="n">tl</span><span class="o">.</span><span class="n">load</span><span class="p">(</span><span class="n">DB</span> <span class="o">+</span> <span class="n">offs</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">mask</span><span class="p">,</span> <span class="n">other</span><span class="o">=</span><span class="mf">0.</span><span class="p">)</span>
+    <span class="n">sum_dw</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">dw</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">sum_db</span> <span class="o">=</span> <span class="n">tl</span><span class="o">.</span><span class="n">sum</span><span class="p">(</span><span class="n">db</span><span class="p">,</span> <span class="n">axis</span><span class="o">=</span><span class="mi">0</span><span class="p">)</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">FINAL_DW</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">sum_dw</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">cols</span> <span class="o">&lt;</span> <span class="n">N</span><span class="p">)</span>
+    <span class="n">tl</span><span class="o">.</span><span class="n">store</span><span class="p">(</span><span class="n">FINAL_DB</span> <span class="o">+</span> <span class="n">cols</span><span class="p">,</span> <span class="n">sum_db</span><span class="p">,</span> <span class="n">mask</span><span class="o">=</span><span class="n">cols</span> <span class="o">&lt;</span> <span class="n">N</span><span class="p">)</span>
+
+
+<span class="k">class</span> <span class="nc">LayerNorm</span><span class="p">(</span><span class="n">torch</span><span class="o">.</span><span class="n">autograd</span><span class="o">.</span><span class="n">Function</span><span class="p">):</span>
+
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span> <span class="nf">forward</span><span class="p">(</span><span class="n">ctx</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">normalized_shape</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">eps</span><span class="p">):</span>
+        <span class="c1"># allocate output</span>
+        <span class="n">y</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+        <span class="c1"># reshape input data into 2D tensor</span>
+        <span class="n">x_arg</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
+        <span class="n">M</span><span class="p">,</span> <span class="n">N</span> <span class="o">=</span> <span class="n">x_arg</span><span class="o">.</span><span class="n">shape</span>
+        <span class="n">mean</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">M</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+        <span class="n">rstd</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">M</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">float32</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+        <span class="c1"># Less than 64KB per feature: enqueue fused kernel</span>
+        <span class="n">MAX_FUSED_SIZE</span> <span class="o">=</span> <span class="mi">65536</span> <span class="o">//</span> <span class="n">x</span><span class="o">.</span><span class="n">element_size</span><span class="p">()</span>
+        <span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="n">MAX_FUSED_SIZE</span><span class="p">,</span> <span class="n">triton</span><span class="o">.</span><span class="n">next_power_of_2</span><span class="p">(</span><span class="n">N</span><span class="p">))</span>
+        <span class="k">if</span> <span class="n">N</span> <span class="o">&gt;</span> <span class="n">BLOCK_SIZE</span><span class="p">:</span>
+            <span class="k">raise</span> <span class="ne">RuntimeError</span><span class="p">(</span><span class="s2">&quot;This layer norm doesn&#39;t support feature dim &gt;= 64KB.&quot;</span><span class="p">)</span>
+        <span class="c1"># heuristics for number of warps</span>
+        <span class="n">num_warps</span> <span class="o">=</span> <span class="nb">min</span><span class="p">(</span><span class="nb">max</span><span class="p">(</span><span class="n">BLOCK_SIZE</span> <span class="o">//</span> <span class="mi">256</span><span class="p">,</span> <span class="mi">1</span><span class="p">),</span> <span class="mi">8</span><span class="p">)</span>
+        <span class="c1"># enqueue kernel</span>
+        <span class="n">_layer_norm_fwd_fused</span><span class="p">[(</span><span class="n">M</span><span class="p">,)](</span><span class="n">x_arg</span><span class="p">,</span> <span class="n">y</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="n">rstd</span><span class="p">,</span>
+                                    <span class="n">x_arg</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">N</span><span class="p">,</span> <span class="n">eps</span><span class="p">,</span>
+                                    <span class="n">BLOCK_SIZE</span><span class="o">=</span><span class="n">BLOCK_SIZE</span><span class="p">,</span> <span class="n">num_warps</span><span class="o">=</span><span class="n">num_warps</span><span class="p">)</span>
+        <span class="n">ctx</span><span class="o">.</span><span class="n">save_for_backward</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">mean</span><span class="p">,</span> <span class="n">rstd</span><span class="p">)</span>
+        <span class="n">ctx</span><span class="o">.</span><span class="n">BLOCK_SIZE</span> <span class="o">=</span> <span class="n">BLOCK_SIZE</span>
+        <span class="n">ctx</span><span class="o">.</span><span class="n">num_warps</span> <span class="o">=</span> <span class="n">num_warps</span>
+        <span class="n">ctx</span><span class="o">.</span><span class="n">eps</span> <span class="o">=</span> <span class="n">eps</span>
+        <span class="k">return</span> <span class="n">y</span>
+
+    <span class="nd">@staticmethod</span>
+    <span class="k">def</span> <span class="nf">backward</span><span class="p">(</span><span class="n">ctx</span><span class="p">,</span> <span class="n">dy</span><span class="p">):</span>
+        <span class="n">x</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="n">v</span> <span class="o">=</span> <span class="n">ctx</span><span class="o">.</span><span class="n">saved_tensors</span>
+        <span class="c1"># heuristics for amount of parallel reduction stream for DG/DB</span>
+        <span class="n">N</span> <span class="o">=</span> <span class="n">w</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
+        <span class="n">GROUP_SIZE_M</span> <span class="o">=</span> <span class="mi">64</span>
+        <span class="k">if</span> <span class="n">N</span> <span class="o">&lt;=</span> <span class="mi">8192</span><span class="p">:</span> <span class="n">GROUP_SIZE_M</span> <span class="o">=</span> <span class="mi">96</span>
+        <span class="k">if</span> <span class="n">N</span> <span class="o">&lt;=</span> <span class="mi">4096</span><span class="p">:</span> <span class="n">GROUP_SIZE_M</span> <span class="o">=</span> <span class="mi">128</span>
+        <span class="k">if</span> <span class="n">N</span> <span class="o">&lt;=</span> <span class="mi">1024</span><span class="p">:</span> <span class="n">GROUP_SIZE_M</span> <span class="o">=</span> <span class="mi">256</span>
+        <span class="c1"># allocate output</span>
+        <span class="n">locks</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">zeros</span><span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="n">GROUP_SIZE_M</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">torch</span><span class="o">.</span><span class="n">int32</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+        <span class="n">_dw</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">GROUP_SIZE_M</span><span class="p">,</span> <span class="n">w</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+        <span class="n">_db</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">GROUP_SIZE_M</span><span class="p">,</span> <span class="n">w</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">]),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+        <span class="n">dw</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">w</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+        <span class="n">db</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty</span><span class="p">((</span><span class="n">w</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="n">w</span><span class="o">.</span><span class="n">device</span><span class="p">)</span>
+        <span class="n">dx</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">empty_like</span><span class="p">(</span><span class="n">dy</span><span class="p">)</span>
+        <span class="c1"># enqueue kernel using forward pass heuristics</span>
+        <span class="c1"># also compute partial sums for DW and DB</span>
+        <span class="n">x_arg</span> <span class="o">=</span> <span class="n">x</span><span class="o">.</span><span class="n">reshape</span><span class="p">(</span><span class="o">-</span><span class="mi">1</span><span class="p">,</span> <span class="n">x</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">])</span>
+        <span class="n">M</span><span class="p">,</span> <span class="n">N</span> <span class="o">=</span> <span class="n">x_arg</span><span class="o">.</span><span class="n">shape</span>
+        <span class="n">_layer_norm_bwd_dx_fused</span><span class="p">[(</span><span class="n">M</span><span class="p">,)](</span><span class="n">dx</span><span class="p">,</span> <span class="n">dy</span><span class="p">,</span> <span class="n">_dw</span><span class="p">,</span> <span class="n">_db</span><span class="p">,</span> <span class="n">x</span><span class="p">,</span> <span class="n">w</span><span class="p">,</span> <span class="n">b</span><span class="p">,</span> <span class="n">m</span><span class="p">,</span> <span class="n">v</span><span class="p">,</span> <span class="n">locks</span><span class="p">,</span>
+                                       <span class="n">x_arg</span><span class="o">.</span><span class="n">stride</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">N</span><span class="p">,</span> <span class="n">ctx</span><span class="o">.</span><span class="n">eps</span><span class="p">,</span>
+                                       <span class="n">BLOCK_SIZE_N</span><span class="o">=</span><span class="n">ctx</span><span class="o">.</span><span class="n">BLOCK_SIZE</span><span class="p">,</span>
+                                       <span class="n">GROUP_SIZE_M</span><span class="o">=</span><span class="n">GROUP_SIZE_M</span><span class="p">,</span>
+                                       <span class="n">num_warps</span><span class="o">=</span><span class="n">ctx</span><span class="o">.</span><span class="n">num_warps</span><span class="p">)</span>
+        <span class="n">grid</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">meta</span><span class="p">:</span> <span class="p">[</span><span class="n">triton</span><span class="o">.</span><span class="n">cdiv</span><span class="p">(</span><span class="n">N</span><span class="p">,</span> <span class="n">meta</span><span class="p">[</span><span class="s1">&#39;BLOCK_SIZE_N&#39;</span><span class="p">])]</span>
+        <span class="c1"># accumulate partial sums in separate kernel</span>
+        <span class="n">_layer_norm_bwd_dwdb</span><span class="p">[</span><span class="n">grid</span><span class="p">](</span><span class="n">_dw</span><span class="p">,</span> <span class="n">_db</span><span class="p">,</span> <span class="n">dw</span><span class="p">,</span> <span class="n">db</span><span class="p">,</span> <span class="n">GROUP_SIZE_M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span>
+                                   <span class="n">BLOCK_SIZE_M</span><span class="o">=</span><span class="mi">32</span><span class="p">,</span>
+                                   <span class="n">BLOCK_SIZE_N</span><span class="o">=</span><span class="mi">128</span><span class="p">)</span>
+        <span class="k">return</span> <span class="n">dx</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="n">dw</span><span class="p">,</span> <span class="n">db</span><span class="p">,</span> <span class="kc">None</span>
+
+
+<span class="n">layer_norm</span> <span class="o">=</span> <span class="n">LayerNorm</span><span class="o">.</span><span class="n">apply</span>
+
+
+<span class="k">def</span> <span class="nf">test_layer_norm</span><span class="p">(</span><span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="mf">1e-5</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">):</span>
+    <span class="c1"># create data</span>
+    <span class="n">x_shape</span> <span class="o">=</span> <span class="p">(</span><span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
+    <span class="n">w_shape</span> <span class="o">=</span> <span class="p">(</span><span class="n">x_shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span> <span class="p">)</span>
+    <span class="n">weight</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">w_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">w_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="o">-</span><span class="mf">2.3</span> <span class="o">+</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">x_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+    <span class="n">dy</span> <span class="o">=</span> <span class="mf">.1</span> <span class="o">*</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+    <span class="n">x</span><span class="o">.</span><span class="n">requires_grad_</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
+    <span class="c1"># forward pass</span>
+    <span class="n">y_tri</span> <span class="o">=</span> <span class="n">layer_norm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">w_shape</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">eps</span><span class="p">)</span>
+    <span class="n">y_ref</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">layer_norm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">w_shape</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">eps</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">dtype</span><span class="p">)</span>
+    <span class="c1"># backward pass (triton)</span>
+    <span class="n">y_tri</span><span class="o">.</span><span class="n">backward</span><span class="p">(</span><span class="n">dy</span><span class="p">,</span> <span class="n">retain_graph</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">dx_tri</span><span class="p">,</span> <span class="n">dw_tri</span><span class="p">,</span> <span class="n">db_tri</span> <span class="o">=</span> <span class="p">[</span><span class="n">_</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="p">[</span><span class="n">x</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">]]</span>
+    <span class="n">x</span><span class="o">.</span><span class="n">grad</span><span class="p">,</span> <span class="n">weight</span><span class="o">.</span><span class="n">grad</span><span class="p">,</span> <span class="n">bias</span><span class="o">.</span><span class="n">grad</span> <span class="o">=</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span><span class="p">,</span> <span class="kc">None</span>
+    <span class="c1"># backward pass (torch)</span>
+    <span class="n">y_ref</span><span class="o">.</span><span class="n">backward</span><span class="p">(</span><span class="n">dy</span><span class="p">,</span> <span class="n">retain_graph</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">dx_ref</span><span class="p">,</span> <span class="n">dw_ref</span><span class="p">,</span> <span class="n">db_ref</span> <span class="o">=</span> <span class="p">[</span><span class="n">_</span><span class="o">.</span><span class="n">grad</span><span class="o">.</span><span class="n">clone</span><span class="p">()</span> <span class="k">for</span> <span class="n">_</span> <span class="ow">in</span> <span class="p">[</span><span class="n">x</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">]]</span>
+    <span class="c1"># compare</span>
+    <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_almost_equal</span><span class="p">(</span><span class="n">y_tri</span><span class="p">,</span> <span class="n">y_ref</span><span class="p">)</span>
+    <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_almost_equal</span><span class="p">(</span><span class="n">dx_tri</span><span class="p">,</span> <span class="n">dx_ref</span><span class="p">)</span>
+    <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_almost_equal</span><span class="p">(</span><span class="n">db_tri</span><span class="p">,</span> <span class="n">db_ref</span><span class="p">,</span> <span class="n">decimal</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
+    <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">assert_almost_equal</span><span class="p">(</span><span class="n">dw_tri</span><span class="p">,</span> <span class="n">dw_ref</span><span class="p">,</span> <span class="n">decimal</span><span class="o">=</span><span class="mi">1</span><span class="p">)</span>
+
+
+<span class="nd">@triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">perf_report</span><span class="p">(</span>
+    <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">Benchmark</span><span class="p">(</span>
+        <span class="n">x_names</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;N&#39;</span><span class="p">],</span>
+        <span class="n">x_vals</span><span class="o">=</span><span class="p">[</span><span class="mi">512</span> <span class="o">*</span> <span class="n">i</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">2</span><span class="p">,</span> <span class="mi">32</span><span class="p">)],</span>
+        <span class="n">line_arg</span><span class="o">=</span><span class="s1">&#39;provider&#39;</span><span class="p">,</span>
+        <span class="n">line_vals</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;triton&#39;</span><span class="p">,</span> <span class="s1">&#39;torch&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="p">([</span><span class="s1">&#39;apex&#39;</span><span class="p">]</span> <span class="k">if</span> <span class="n">HAS_APEX</span> <span class="k">else</span> <span class="p">[]),</span>
+        <span class="n">line_names</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Triton&#39;</span><span class="p">,</span> <span class="s1">&#39;Torch&#39;</span><span class="p">]</span> <span class="o">+</span> <span class="p">([</span><span class="s1">&#39;Apex&#39;</span><span class="p">]</span> <span class="k">if</span> <span class="n">HAS_APEX</span> <span class="k">else</span> <span class="p">[]),</span>
+        <span class="n">styles</span><span class="o">=</span><span class="p">[(</span><span class="s1">&#39;blue&#39;</span><span class="p">,</span> <span class="s1">&#39;-&#39;</span><span class="p">),</span> <span class="p">(</span><span class="s1">&#39;green&#39;</span><span class="p">,</span> <span class="s1">&#39;-&#39;</span><span class="p">),</span> <span class="p">(</span><span class="s1">&#39;orange&#39;</span><span class="p">,</span> <span class="s1">&#39;-&#39;</span><span class="p">)],</span>
+        <span class="n">ylabel</span><span class="o">=</span><span class="s1">&#39;GB/s&#39;</span><span class="p">,</span>
+        <span class="n">plot_name</span><span class="o">=</span><span class="s1">&#39;layer-norm-backward&#39;</span><span class="p">,</span>
+        <span class="n">args</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;M&#39;</span><span class="p">:</span> <span class="mi">4096</span><span class="p">,</span> <span class="s1">&#39;dtype&#39;</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">float16</span><span class="p">,</span> <span class="s1">&#39;mode&#39;</span><span class="p">:</span> <span class="s1">&#39;backward&#39;</span><span class="p">}</span>
+    <span class="p">)</span>
+<span class="p">)</span>
+<span class="k">def</span> <span class="nf">bench_layer_norm</span><span class="p">(</span><span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">,</span> <span class="n">dtype</span><span class="p">,</span> <span class="n">provider</span><span class="p">,</span> <span class="n">mode</span><span class="o">=</span><span class="s1">&#39;backward&#39;</span><span class="p">,</span> <span class="n">eps</span><span class="o">=</span><span class="mf">1e-5</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">):</span>
+    <span class="c1"># create data</span>
+    <span class="n">x_shape</span> <span class="o">=</span> <span class="p">(</span><span class="n">M</span><span class="p">,</span> <span class="n">N</span><span class="p">)</span>
+    <span class="n">w_shape</span> <span class="o">=</span> <span class="p">(</span><span class="n">x_shape</span><span class="p">[</span><span class="o">-</span><span class="mi">1</span><span class="p">],</span> <span class="p">)</span>
+    <span class="n">weight</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">w_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">bias</span> <span class="o">=</span> <span class="n">torch</span><span class="o">.</span><span class="n">rand</span><span class="p">(</span><span class="n">w_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">,</span> <span class="n">requires_grad</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+    <span class="n">x</span> <span class="o">=</span> <span class="o">-</span><span class="mf">2.3</span> <span class="o">+</span> <span class="mf">0.5</span> <span class="o">*</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn</span><span class="p">(</span><span class="n">x_shape</span><span class="p">,</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">,</span> <span class="n">device</span><span class="o">=</span><span class="s1">&#39;cuda&#39;</span><span class="p">)</span>
+    <span class="n">dy</span> <span class="o">=</span> <span class="mf">.1</span> <span class="o">*</span> <span class="n">torch</span><span class="o">.</span><span class="n">randn_like</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+    <span class="n">x</span><span class="o">.</span><span class="n">requires_grad_</span><span class="p">(</span><span class="kc">True</span><span class="p">)</span>
+    <span class="c1"># utility functions</span>
+    <span class="k">if</span> <span class="n">provider</span> <span class="o">==</span> <span class="s1">&#39;triton&#39;</span><span class="p">:</span>
+        <span class="n">y_fwd</span> <span class="o">=</span> <span class="k">lambda</span><span class="p">:</span> <span class="n">layer_norm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">w_shape</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">eps</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">provider</span> <span class="o">==</span> <span class="s1">&#39;torch&#39;</span><span class="p">:</span>
+        <span class="n">y_fwd</span> <span class="o">=</span> <span class="k">lambda</span><span class="p">:</span> <span class="n">torch</span><span class="o">.</span><span class="n">nn</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">layer_norm</span><span class="p">(</span><span class="n">x</span><span class="p">,</span> <span class="n">w_shape</span><span class="p">,</span> <span class="n">weight</span><span class="p">,</span> <span class="n">bias</span><span class="p">,</span> <span class="n">eps</span><span class="p">)</span>
+    <span class="k">if</span> <span class="n">provider</span> <span class="o">==</span> <span class="s1">&#39;apex&#39;</span><span class="p">:</span>
+        <span class="n">apex_layer_norm</span> <span class="o">=</span> <span class="n">apex</span><span class="o">.</span><span class="n">normalization</span><span class="o">.</span><span class="n">FusedLayerNorm</span><span class="p">(</span><span class="n">w_shape</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">device</span><span class="p">)</span><span class="o">.</span><span class="n">to</span><span class="p">(</span><span class="n">x</span><span class="o">.</span><span class="n">dtype</span><span class="p">)</span>
+        <span class="n">y_fwd</span> <span class="o">=</span> <span class="k">lambda</span><span class="p">:</span> <span class="n">apex_layer_norm</span><span class="p">(</span><span class="n">x</span><span class="p">)</span>
+    <span class="c1"># forward pass</span>
+    <span class="k">if</span> <span class="n">mode</span> <span class="o">==</span> <span class="s1">&#39;forward&#39;</span><span class="p">:</span>
+        <span class="n">gbps</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">ms</span><span class="p">:</span> <span class="mi">2</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">numel</span><span class="p">()</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">element_size</span><span class="p">()</span> <span class="o">/</span> <span class="n">ms</span> <span class="o">*</span> <span class="mf">1e-6</span>
+        <span class="n">ms</span><span class="p">,</span> <span class="n">min_ms</span><span class="p">,</span> <span class="n">max_ms</span> <span class="o">=</span> <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">do_bench</span><span class="p">(</span><span class="n">y_fwd</span><span class="p">,</span> <span class="n">rep</span><span class="o">=</span><span class="mi">500</span><span class="p">)</span>
+    <span class="c1"># backward pass</span>
+    <span class="k">if</span> <span class="n">mode</span> <span class="o">==</span> <span class="s1">&#39;backward&#39;</span><span class="p">:</span>
+        <span class="n">gbps</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">ms</span><span class="p">:</span> <span class="mi">3</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">numel</span><span class="p">()</span> <span class="o">*</span> <span class="n">x</span><span class="o">.</span><span class="n">element_size</span><span class="p">()</span> <span class="o">/</span> <span class="n">ms</span> <span class="o">*</span> <span class="mf">1e-6</span>
+        <span class="n">y</span> <span class="o">=</span> <span class="n">y_fwd</span><span class="p">()</span>
+        <span class="n">ms</span><span class="p">,</span> <span class="n">min_ms</span><span class="p">,</span> <span class="n">max_ms</span> <span class="o">=</span> <span class="n">triton</span><span class="o">.</span><span class="n">testing</span><span class="o">.</span><span class="n">do_bench</span><span class="p">(</span><span class="k">lambda</span><span class="p">:</span> <span class="n">y</span><span class="o">.</span><span class="n">backward</span><span class="p">(</span><span class="n">dy</span><span class="p">,</span> <span class="n">retain_graph</span><span class="o">=</span><span class="kc">True</span><span class="p">),</span>
+                                                     <span class="n">grad_to_none</span><span class="o">=</span><span class="p">[</span><span class="n">x</span><span class="p">],</span> <span class="n">rep</span><span class="o">=</span><span class="mi">500</span><span class="p">)</span>
+    <span class="k">return</span> <span class="n">gbps</span><span class="p">(</span><span class="n">ms</span><span class="p">),</span> <span class="n">gbps</span><span class="p">(</span><span class="n">max_ms</span><span class="p">),</span> <span class="n">gbps</span><span class="p">(</span><span class="n">min_ms</span><span class="p">)</span>
+
+
+<span class="n">bench_layer_norm</span><span class="o">.</span><span class="n">run</span><span class="p">(</span><span class="n">save_path</span><span class="o">=</span><span class="s1">&#39;.&#39;</span><span class="p">,</span> <span class="n">print_data</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
+</pre></div>
+</div>
+<p class="sphx-glr-timing"><strong>Total running time of the script:</strong> ( 1 minutes  23.770 seconds)</p>
+<div class="sphx-glr-footer class sphx-glr-footer-example docutils container" id="sphx-glr-download-getting-started-tutorials-05-layer-norm-py">
+<div class="sphx-glr-download sphx-glr-download-python docutils container">
+<p><a class="reference download internal" download="" href="../../_downloads/935c0dd0fbeb4b2e69588471cbb2d4b2/05-layer-norm.py"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">05-layer-norm.py</span></code></a></p>
+</div>
+<div class="sphx-glr-download sphx-glr-download-jupyter docutils container">
+<p><a class="reference download internal" download="" href="../../_downloads/ae7fff29e1b574187bc930ed94bcc353/05-layer-norm.ipynb"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">Jupyter</span> <span class="pre">notebook:</span> <span class="pre">05-layer-norm.ipynb</span></code></a></p>
+</div>
+</div>
+<p class="sphx-glr-signature"><a class="reference external" href="https://sphinx-gallery.github.io">Gallery generated by Sphinx-Gallery</a></p>
+</div>
+
+
+           </div>
+           
+          </div>
+          <footer>
+    <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
+        <a href="../../python-api/triton.html" class="btn btn-neutral float-right" title="triton" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
+        <a href="04-low-memory-dropout.html" class="btn btn-neutral float-left" title="Low-Memory Dropout" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
+    </div>
+
+  <hr/>
+
+  <div role="contentinfo">
+    <p>
+        &#169; Copyright 2020, Philippe Tillet.
+
+    </p>
+  </div>
+    
+    
+    
+    Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
+    
+    <a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
+    
+    provided by <a href="https://readthedocs.org">Read the Docs</a>. 
+
+</footer>
+        </div>
+      </div>
+
+    </section>
+
+  </div>
+  
+
+  <script type="text/javascript">
+      jQuery(function () {
+          SphinxRtdTheme.Navigation.enable(true);
+      });
+  </script>
+
+  
+  
+    
+   
+
+</body>
+</html>
\ No newline at end of file
diff --git a/getting-started/tutorials/index.html b/getting-started/tutorials/index.html
index ca57053ea..a2208871b 100644
--- a/getting-started/tutorials/index.html
+++ b/getting-started/tutorials/index.html
@@ -100,6 +100,7 @@
 <li class="toctree-l2"><a class="reference internal" href="02-fused-softmax.html">Fused Softmax</a></li>
 <li class="toctree-l2"><a class="reference internal" href="03-matrix-multiplication.html">Matrix Multiplication</a></li>
 <li class="toctree-l2"><a class="reference internal" href="04-low-memory-dropout.html">Low-Memory Dropout</a></li>
+<li class="toctree-l2"><a class="reference internal" href="05-layer-norm.html">Layer Normalization</a></li>
 </ul>
 </li>
 </ul>
@@ -183,6 +184,11 @@
   <div class="section" id="tutorials">
 <span id="sphx-glr-getting-started-tutorials"></span><h1>Tutorials<a class="headerlink" href="#tutorials" title="Permalink to this headline">¶</a></h1>
 <p>Below is a gallery of tutorials for writing various basic operations with Triton. It is recommended that you read through the tutorials in order, starting with the simplest one.</p>
+<p>To install the dependencies for the tutorials:</p>
+<div class="highlight-bash notranslate"><div class="highlight"><pre><span></span><span class="nb">cd</span> triton
+pip install -e <span class="s1">&#39;./python[tutorials]&#39;</span>
+</pre></div>
+</div>
 <div class="sphx-glr-thumbcontainer" tooltip="- The basic programming model of Triton - The triton.jit decorator, which is used to define Tri..."><div class="figure align-default" id="id1">
 <img alt="Vector Addition" src="../../_images/sphx_glr_01-vector-add_thumb.png" />
 <p class="caption"><span class="caption-text"><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a></span><a class="headerlink" href="#id1" title="Permalink to this image">¶</a></p>
@@ -207,6 +213,12 @@
 </div>
 </div><div class="toctree-wrapper compound">
 </div>
+<div class="sphx-glr-thumbcontainer" tooltip="Layer Normalization"><div class="figure align-default" id="id5">
+<img alt="Layer Normalization" src="../../_images/sphx_glr_05-layer-norm_thumb.png" />
+<p class="caption"><span class="caption-text"><a class="reference internal" href="05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py"><span class="std std-ref">Layer Normalization</span></a></span><a class="headerlink" href="#id5" title="Permalink to this image">¶</a></p>
+</div>
+</div><div class="toctree-wrapper compound">
+</div>
 <div class="sphx-glr-clear"></div><div class="sphx-glr-footer class sphx-glr-footer-gallery docutils container">
 <div class="sphx-glr-download sphx-glr-download-python docutils container">
 <p><a class="reference download internal" download="" href="../../_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip"><code class="xref download docutils literal notranslate"><span class="pre">Download</span> <span class="pre">all</span> <span class="pre">examples</span> <span class="pre">in</span> <span class="pre">Python</span> <span class="pre">source</span> <span class="pre">code:</span> <span class="pre">tutorials_python.zip</span></code></a></p>
diff --git a/getting-started/tutorials/sg_execution_times.html b/getting-started/tutorials/sg_execution_times.html
index 6b033de63..68d52b76d 100644
--- a/getting-started/tutorials/sg_execution_times.html
+++ b/getting-started/tutorials/sg_execution_times.html
@@ -174,7 +174,7 @@
             
   <div class="section" id="computation-times">
 <span id="sphx-glr-getting-started-tutorials-sg-execution-times"></span><h1>Computation times<a class="headerlink" href="#computation-times" title="Permalink to this headline">¶</a></h1>
-<p><strong>10:59.734</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
+<p><strong>12:42.913</strong> total execution time for <strong>getting-started_tutorials</strong> files:</p>
 <table class="docutils align-default">
 <colgroup>
 <col style="width: 85%" />
@@ -183,19 +183,23 @@
 </colgroup>
 <tbody>
 <tr class="row-odd"><td><p><a class="reference internal" href="03-matrix-multiplication.html#sphx-glr-getting-started-tutorials-03-matrix-multiplication-py"><span class="std std-ref">Matrix Multiplication</span></a> (<code class="docutils literal notranslate"><span class="pre">03-matrix-multiplication.py</span></code>)</p></td>
-<td><p>05:41.070</p></td>
+<td><p>06:05.923</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-even"><td><p><a class="reference internal" href="02-fused-softmax.html#sphx-glr-getting-started-tutorials-02-fused-softmax-py"><span class="std std-ref">Fused Softmax</span></a> (<code class="docutils literal notranslate"><span class="pre">02-fused-softmax.py</span></code>)</p></td>
-<td><p>03:26.243</p></td>
+<td><p>03:22.431</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 <tr class="row-odd"><td><p><a class="reference internal" href="01-vector-add.html#sphx-glr-getting-started-tutorials-01-vector-add-py"><span class="std std-ref">Vector Addition</span></a> (<code class="docutils literal notranslate"><span class="pre">01-vector-add.py</span></code>)</p></td>
-<td><p>01:52.411</p></td>
+<td><p>01:50.312</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
-<tr class="row-even"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
-<td><p>00:00.010</p></td>
+<tr class="row-even"><td><p><a class="reference internal" href="05-layer-norm.html#sphx-glr-getting-started-tutorials-05-layer-norm-py"><span class="std std-ref">Layer Normalization</span></a> (<code class="docutils literal notranslate"><span class="pre">05-layer-norm.py</span></code>)</p></td>
+<td><p>01:23.770</p></td>
+<td><p>0.0 MB</p></td>
+</tr>
+<tr class="row-odd"><td><p><a class="reference internal" href="04-low-memory-dropout.html#sphx-glr-getting-started-tutorials-04-low-memory-dropout-py"><span class="std std-ref">Low-Memory Dropout</span></a> (<code class="docutils literal notranslate"><span class="pre">04-low-memory-dropout.py</span></code>)</p></td>
+<td><p>00:00.477</p></td>
 <td><p>0.0 MB</p></td>
 </tr>
 </tbody>
diff --git a/objects.inv b/objects.inv
index f3bd9cd94..90aa24b7a 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/python-api/generated/triton.Config.html b/python-api/generated/triton.Config.html
index 45d91a582..a40bba081 100644
--- a/python-api/generated/triton.Config.html
+++ b/python-api/generated/triton.Config.html
@@ -186,7 +186,7 @@
 <h1>triton.Config<a class="headerlink" href="#triton-config" title="Permalink to this headline">¶</a></h1>
 <dl class="py class">
 <dt class="sig sig-object py" id="triton.Config">
-<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">triton.</span></span><span class="sig-name descname"><span class="pre">Config</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">meta</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_warps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">4</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_stages</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.Config" title="Permalink to this definition">¶</a></dt>
+<em class="property"><span class="pre">class</span> </em><span class="sig-prename descclassname"><span class="pre">triton.</span></span><span class="sig-name descname"><span class="pre">Config</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kwargs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_warps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">4</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_stages</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_hook</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.Config" title="Permalink to this definition">¶</a></dt>
 <dd><p>An object that represents a possible kernel configuration for the auto-tuner to try.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Variables</dt>
@@ -197,12 +197,14 @@
 cooperatively execute using <cite>8 * 32 = 256</cite> threads.</p></li>
 <li><p><strong>num_stages</strong> – the number of stages that the compiler should use when software-pipelining loops.
 Mostly useful for matrix multiplication workloads on SM80+ GPUs.</p></li>
+<li><p><strong>pre_hook</strong> – a function that will be called before the kernel is called. Parameters of this
+function are args.</p></li>
 </ul>
 </dd>
 </dl>
 <dl class="py method">
 <dt class="sig sig-object py" id="triton.Config.__init__">
-<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">meta</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_warps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">4</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_stages</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.Config.__init__" title="Permalink to this definition">¶</a></dt>
+<span class="sig-name descname"><span class="pre">__init__</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">self</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">kwargs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_warps</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">4</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">num_stages</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">2</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">pre_hook</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.Config.__init__" title="Permalink to this definition">¶</a></dt>
 <dd></dd></dl>
 
 <p class="rubric">Methods</p>
@@ -212,7 +214,7 @@ Mostly useful for matrix multiplication workloads on SM80+ GPUs.</p></li>
 <col style="width: 90%" />
 </colgroup>
 <tbody>
-<tr class="row-odd"><td><p><a class="reference internal" href="#triton.Config.__init__" title="triton.Config.__init__"><code class="xref py py-obj docutils literal notranslate"><span class="pre">__init__</span></code></a>(self, meta[, num_warps, num_stages])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal" href="#triton.Config.__init__" title="triton.Config.__init__"><code class="xref py py-obj docutils literal notranslate"><span class="pre">__init__</span></code></a>(self, kwargs[, num_warps, …])</p></td>
 <td><p></p></td>
 </tr>
 </tbody>
diff --git a/python-api/generated/triton.autotune.html b/python-api/generated/triton.autotune.html
index 2706e100b..7425396e1 100644
--- a/python-api/generated/triton.autotune.html
+++ b/python-api/generated/triton.autotune.html
@@ -186,7 +186,7 @@
 <h1>triton.autotune<a class="headerlink" href="#triton-autotune" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.autotune">
-<span class="sig-prename descclassname"><span class="pre">triton.</span></span><span class="sig-name descname"><span class="pre">autotune</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">configs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">key</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">reset_to_zero</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.autotune" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.</span></span><span class="sig-name descname"><span class="pre">autotune</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">configs</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">key</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">prune_configs_by</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">reset_to_zero</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.autotune" title="Permalink to this definition">¶</a></dt>
 <dd><p>Decorator for auto-tuning a <code class="code docutils literal notranslate"><span class="pre">triton.jit</span></code>’d function.</p>
 <div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="nd">@triton</span><span class="o">.</span><span class="n">autotune</span><span class="p">(</span><span class="n">configs</span><span class="o">=</span><span class="p">[</span>
     <span class="n">triton</span><span class="o">.</span><span class="n">Config</span><span class="p">(</span><span class="n">meta</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;BLOCK_SIZE&#39;</span><span class="p">:</span> <span class="mi">128</span><span class="p">},</span> <span class="n">num_warps</span><span class="o">=</span><span class="mi">4</span><span class="p">),</span>
@@ -211,6 +211,10 @@ reset the value of the provided tensor to <cite>zero</cite> before running any c
 <dd class="field-even"><ul class="simple">
 <li><p><strong>configs</strong> (<em>list</em><em>[</em><a class="reference internal" href="triton.Config.html#triton.Config" title="triton.Config"><em>triton.Config</em></a><em>]</em>) – a list of <code class="code docutils literal notranslate"><span class="pre">triton.Config</span></code> objects</p></li>
 <li><p><strong>key</strong> (<em>list</em><em>[</em><em>str</em><em>]</em>) – a list of argument names whose change in value will trigger the evaluation of all provided configs.</p></li>
+<li><p><strong>prune_configs_by</strong> – a dict of functions that are used to prune configs, fields:
+‘perf_model’: performance model used to predicate running time with different configs, returns running time
+‘top_k’: number of configs to bench
+‘prune_num_stages_by’(optional): a function used to prune num_stages. It take configs:List[Config] as its input, and returns pruned configs.</p></li>
 <li><p><strong>reset_to_zero</strong> (<em>list</em><em>[</em><em>str</em><em>]</em>) – a list of argument names whose value will be reset to zero before evaluating any configs.</p></li>
 </ul>
 </dd>
diff --git a/python-api/generated/triton.language.dot.html b/python-api/generated/triton.language.dot.html
index a6e46443d..1efc841d7 100644
--- a/python-api/generated/triton.language.dot.html
+++ b/python-api/generated/triton.language.dot.html
@@ -197,14 +197,14 @@
 <h1>triton.language.dot<a class="headerlink" href="#triton-language-dot" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.dot">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">dot</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.dot" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">dot</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">input</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">allow_tf32</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">True</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.dot" title="Permalink to this definition">¶</a></dt>
 <dd><p>Returns the matrix product of two blocks.</p>
 <p>The two blocks must be two dimensionals and have compatible inner dimensions.</p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
 <dd class="field-odd"><ul class="simple">
-<li><p><strong>input</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) – The first block to be multiplied.</p></li>
-<li><p><strong>other</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) – The second block to be multiplied.</p></li>
+<li><p><strong>input</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) – The first block to be multiplied.</p></li>
+<li><p><strong>other</strong> (2D block of scalar-type in {<code class="code docutils literal notranslate"><span class="pre">float16</span></code>, <code class="code docutils literal notranslate"><span class="pre">bfloat16</span></code>, <code class="code docutils literal notranslate"><span class="pre">float32</span></code>}) – The second block to be multiplied.</p></li>
 </ul>
 </dd>
 </dl>
diff --git a/python-api/generated/triton.language.load.html b/python-api/generated/triton.language.load.html
index 899f654e2..ad5c55ba1 100644
--- a/python-api/generated/triton.language.load.html
+++ b/python-api/generated/triton.language.load.html
@@ -200,7 +200,7 @@
 <h1>triton.language.load<a class="headerlink" href="#triton-language-load" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.load">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.load" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">load</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">pointer</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">mask</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">other</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">cache_modifier</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">''</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">volatile</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.load" title="Permalink to this definition">¶</a></dt>
 <dd><p>Return a block of data whose values are, elementwise, loaded from memory at location defined by <code class="code docutils literal notranslate"><span class="pre">pointer</span></code>.</p>
 <p><code class="code docutils literal notranslate"><span class="pre">mask</span></code> and <code class="code docutils literal notranslate"><span class="pre">other</span></code> are implicitly broadcast to <code class="code docutils literal notranslate"><span class="pre">pointer.shape</span></code>.</p>
 <p><code class="code docutils literal notranslate"><span class="pre">other</span></code> is implicitly typecast to <code class="code docutils literal notranslate"><span class="pre">pointer.dtype.element_ty</span></code>.</p>
@@ -210,9 +210,11 @@
 <li><p><strong>pointer</strong> (<em>Block of dtype=triton.PointerDType</em>) – Pointers to the data to be loaded.</p></li>
 <li><p><strong>mask</strong> (<em>Block of triton.int1</em><em>, </em><em>optional</em>) – if mask[idx] is false, do not load the data at address <code class="code docutils literal notranslate"><span class="pre">pointer[idx]</span></code>.</p></li>
 <li><p><strong>other</strong> (<em>Block</em><em>, </em><em>optional</em>) – if mask[idx] is false, return other[idx]</p></li>
+<li><p><strong>cache_modifier</strong> – changes cache option in nvidia ptx</p></li>
 </ul>
 </dd>
 </dl>
+<p>‘type cache_modifier: str, optional</p>
 </dd></dl>
 
 </div>
diff --git a/python-api/generated/triton.language.rand.html b/python-api/generated/triton.language.rand.html
index a992fc228..c30beb23a 100644
--- a/python-api/generated/triton.language.rand.html
+++ b/python-api/generated/triton.language.rand.html
@@ -201,7 +201,7 @@
 <h1>triton.language.rand<a class="headerlink" href="#triton-language-rand" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.rand">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">rand</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.rand" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">rand</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_rounds</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">10</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.rand" title="Permalink to this definition">¶</a></dt>
 <dd><p>Given a <code class="code docutils literal notranslate"><span class="pre">seed</span></code> scalar and an <code class="code docutils literal notranslate"><span class="pre">offset</span></code> block,
 returns a block of random <code class="code docutils literal notranslate"><span class="pre">float32</span></code> in <span class="math notranslate nohighlight">\(U(0, 1)\)</span></p>
 <dl class="field-list simple">
diff --git a/python-api/generated/triton.language.randint.html b/python-api/generated/triton.language.randint.html
index 729710d1e..30cf8cf1c 100644
--- a/python-api/generated/triton.language.randint.html
+++ b/python-api/generated/triton.language.randint.html
@@ -200,7 +200,7 @@
 <h1>triton.language.randint<a class="headerlink" href="#triton-language-randint" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.randint">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randint</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randint" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randint</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_rounds</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">10</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randint" title="Permalink to this definition">¶</a></dt>
 <dd><p>Given a <code class="code docutils literal notranslate"><span class="pre">seed</span></code> scalar and an <code class="code docutils literal notranslate"><span class="pre">offset</span></code> block, returns a single
 block of random <code class="code docutils literal notranslate"><span class="pre">int32</span></code>.</p>
 <p>If you need multiple streams of random numbers,
diff --git a/python-api/generated/triton.language.randint4x.html b/python-api/generated/triton.language.randint4x.html
index 42957430a..0c4dfa6ae 100644
--- a/python-api/generated/triton.language.randint4x.html
+++ b/python-api/generated/triton.language.randint4x.html
@@ -200,7 +200,7 @@
 <h1>triton.language.randint4x<a class="headerlink" href="#triton-language-randint4x" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.randint4x">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randint4x</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randint4x" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randint4x</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_rounds</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">10</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randint4x" title="Permalink to this definition">¶</a></dt>
 <dd><p>Given a <code class="code docutils literal notranslate"><span class="pre">seed</span></code> scalar and an <code class="code docutils literal notranslate"><span class="pre">offset</span></code> block, returns four
 blocks of random <code class="code docutils literal notranslate"><span class="pre">int32</span></code>.</p>
 <p>This is the maximally efficient entry point
diff --git a/python-api/generated/triton.language.randn.html b/python-api/generated/triton.language.randn.html
index c70d6b19c..b4c0f76b5 100644
--- a/python-api/generated/triton.language.randn.html
+++ b/python-api/generated/triton.language.randn.html
@@ -201,7 +201,7 @@
 <h1>triton.language.randn<a class="headerlink" href="#triton-language-randn" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.randn">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randn</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randn" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">randn</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">seed</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">offset</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">n_rounds</span></span><span class="p"><span class="pre">:</span></span> <span class="n"><span class="pre">triton.language.core.constexpr</span></span> <span class="o"><span class="pre">=</span></span> <span class="default_value"><span class="pre">10</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.randn" title="Permalink to this definition">¶</a></dt>
 <dd><p>Given a <code class="code docutils literal notranslate"><span class="pre">seed</span></code> scalar and an <code class="code docutils literal notranslate"><span class="pre">offset</span></code> block,
 returns a block of random <code class="code docutils literal notranslate"><span class="pre">float32</span></code> in <span class="math notranslate nohighlight">\(\mathcal{N}(0, 1)\)</span></p>
 <dl class="field-list simple">
diff --git a/python-api/generated/triton.language.softmax.html b/python-api/generated/triton.language.softmax.html
index 5927e1dcc..d888b687f 100644
--- a/python-api/generated/triton.language.softmax.html
+++ b/python-api/generated/triton.language.softmax.html
@@ -203,7 +203,7 @@
 <h1>triton.language.softmax<a class="headerlink" href="#triton-language-softmax" title="Permalink to this headline">¶</a></h1>
 <dl class="py function">
 <dt class="sig sig-object py" id="triton.language.softmax">
-<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition">¶</a></dt>
+<span class="sig-prename descclassname"><span class="pre">triton.language.</span></span><span class="sig-name descname"><span class="pre">softmax</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">x</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">ieee_rounding</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#triton.language.softmax" title="Permalink to this definition">¶</a></dt>
 <dd><p>Computes the element-wise softmax of <code class="code docutils literal notranslate"><span class="pre">x</span></code></p>
 <dl class="field-list simple">
 <dt class="field-odd">Parameters</dt>
diff --git a/python-api/triton.html b/python-api/triton.html
index 9ea31e6a2..cf0c32e8b 100644
--- a/python-api/triton.html
+++ b/python-api/triton.html
@@ -47,7 +47,7 @@
     <link rel="index" title="Index" href="../genindex.html" />
     <link rel="search" title="Search" href="../search.html" />
     <link rel="next" title="triton.jit" href="generated/triton.jit.html" />
-    <link rel="prev" title="Low-Memory Dropout" href="../getting-started/tutorials/04-low-memory-dropout.html" /> 
+    <link rel="prev" title="Layer Normalization" href="../getting-started/tutorials/05-layer-norm.html" /> 
 </head>
 
 <body class="wy-body-for-nav">
@@ -211,7 +211,7 @@
           <footer>
     <div class="rst-footer-buttons" role="navigation" aria-label="footer navigation">
         <a href="generated/triton.jit.html" class="btn btn-neutral float-right" title="triton.jit" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
-        <a href="../getting-started/tutorials/04-low-memory-dropout.html" class="btn btn-neutral float-left" title="Low-Memory Dropout" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
+        <a href="../getting-started/tutorials/05-layer-norm.html" class="btn btn-neutral float-left" title="Layer Normalization" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
     </div>
 
   <hr/>
diff --git a/searchindex.js b/searchindex.js
index 626d78851..9e05a6b96 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[10,1,1,""]},"triton.language":{arange:[14,2,1,""],atomic_add:[15,2,1,""],atomic_cas:[16,2,1,""],atomic_max:[17,2,1,""],atomic_min:[18,2,1,""],atomic_xchg:[19,2,1,""],broadcast_to:[20,2,1,""],cos:[21,2,1,""],dot:[22,2,1,""],exp:[23,2,1,""],load:[24,2,1,""],log:[25,2,1,""],max:[26,2,1,""],maximum:[27,2,1,""],min:[28,2,1,""],minimum:[29,2,1,""],multiple_of:[30,2,1,""],num_programs:[31,2,1,""],program_id:[32,2,1,""],rand:[33,2,1,""],randint4x:[35,2,1,""],randint:[34,2,1,""],randn:[36,2,1,""],ravel:[37,2,1,""],reshape:[38,2,1,""],sigmoid:[39,2,1,""],sin:[40,2,1,""],softmax:[41,2,1,""],sqrt:[42,2,1,""],store:[43,2,1,""],sum:[44,2,1,""],where:[45,2,1,""],zeros:[46,2,1,""]},"triton.testing":{Benchmark:[47,0,1,""],do_bench:[48,2,1,""],perf_report:[49,2,1,""]},"triton.testing.Benchmark":{__init__:[47,1,1,""]},triton:{Config:[10,0,1,""],autotune:[11,2,1,""],heuristics:[12,2,1,""],jit:[13,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,6,8,9,31,32,33,36,46,48],"00":6,"0000":3,"000000":2,"000001":[1,2],"000002":2,"003845":3,"004273":1,"01":[1,3,6],"010":[4,6],"012395":2,"019017":3,"019048":3,"02":[2,6],"025776":3,"028308":3,"03":[3,6],"038365":2,"04":[4,6],"047592":3,"05":6,"053553":3,"0625":3,"070":[3,6],"074685":3,"08199":4,"08452":4,"084721":1,"0938":3,"097543":2,"0f":9,"0s":4,"1":[1,2,3,4,7,9,12,31,32,33,36],"10":[1,3,4,6],"100":[2,48],"1024":[1,3,4,11],"1045":3,"1048576":1,"106434":4,"109587":3,"11":[0,1,3],"1152":3,"12":[1,3],"120002":3,"12160":2,"12288":2,"123":4,"12416":2,"12544":2,"12672":2,"127":1,"128":[1,2,3,11],"1280":3,"13":[1,3],"130841":3,"131072":1,"1328":3,"133347":2,"134217728":1,"13686":4,"138541":3,"14":[1,3],"140799":3,"1408":3,"142862":2,"149366":3,"149375":2,"149397":4,"15":[1,3],"151":2,"1536":3,"154":2,"16":[2,3,9,46],"160":2,"163":2,"16384":1,"1664":3,"167004":2,"16777216":1,"17":3,"171410":2,"172588":3,"17879":4,"1792":3,"18":3,"1823":2,"188":2,"189387":2,"19":[1,3],"190482":1,"192":1,"1920":3,"197579":2,"198":2,"1982":9,"1983":8,"1984":9,"1989":9,"199":2,"1991":[8,9],"1999":9,"1d":[1,2,3],"1e":[1,2,3],"1s":4,"2":[1,2,3,4,7,9,10,12,31,32,48],"20":[3,48],"200000":1,"200001":3,"2004":9,"2006":9,"200871":3,"2011":4,"2012":9,"2013":8,"2014":[4,8],"2016":[8,9],"2017":8,"2018":[8,9],"2019":9,"2021":[8,9],"2048":[2,3],"2097152":1,"21":3,"212868":4,"2141":1,"214186":4,"2176":3,"219":1,"22":3,"220":3,"23":3,"2304":3,"233226":3,"24":3,"243":[2,6],"2432":3,"245":3,"247522":3,"25":[3,48],"256":[1,2,3,10],"2560":3,"26":[2,3,6],"260869":3,"262144":1,"2656":3,"2688":3,"27":3,"28":[1,3],"2812":3,"2816":3,"2891":3,"29":3,"293429":4,"2944":3,"298794":4,"2d":[3,22],"2m":2,"2mn":2,"3":[0,1,2,3,4,9],"30":3,"305746":3,"305878":3,"3072":3,"3076":1,"307617":3,"31":3,"3125":3,"32":[3,10],"3200":3,"321838":2,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"3477":3,"3516":3,"3555":3,"3584":3,"36":3,"360017":2,"362445":1,"367588":3,"37":3,"3712":3,"3713":1,"371721":4,"372618":3,"372800":3,"373605":3,"38":1,"380953":3,"384":[1,2,3],"3840":3,"384000":3,"39":3,"3906":3,"393507":3,"3968":3,"3984":3,"3986":4,"3d":[31,32],"3mn":2,"4":[1,2,3,9,10,11,34],"40":3,"400001":1,"400016":1,"4023":3,"403344":4,"403347":4,"405":2,"4062":3,"408716":4,"4096":[1,2,3],"41":[3,6],"411":[1,6],"412":2,"415":2,"417474":3,"4194304":1,"42142":4,"428568":1,"428801":3,"429770":[1,2],"430545":3,"431969":4,"44":3,"446623":3,"448255":1,"4492":3,"4531":3,"46":3,"4609":3,"464755":3,"4688":3,"472":1,"49":3,"492442":3,"4940":1,"496824":3,"4m":2,"4x":2,"5":[1,3,4,9,48],"5000":3,"500614":3,"507077":3,"51":3,"512":[2,3,4],"516392":3,"519518":3,"52":[1,3,6],"524288":1,"526831":3,"528545":3,"53":3,"530349":3,"5312":3,"54":3,"541":4,"546":2,"552988":3,"56":3,"563555":3,"566038":2,"566925":3,"567248":3,"568431":4,"579370":3,"585":2,"5859":3,"586858":4,"5898":3,"59":6,"5mn":2,"6":[0,1,3],"600000":1,"602666":3,"606":2,"608294":3,"6094":3,"612060":3,"614":1,"615390":1,"62":3,"63":3,"632545":3,"633710":3,"64":[1,3],"640":[2,3],"646":2,"646820":3,"655":2,"65536":1,"656000":3,"656574":1,"661740":2,"661970":2,"666684":2,"666687":2,"67":3,"67086":4,"67108864":1,"6724":1,"68":3,"682":2,"69":3,"6953":3,"7":[0,1,3,9],"70":3,"702":2,"7031":3,"703707":2,"706180":3,"7070":3,"707878":4,"709358":3,"71":3,"719258":4,"72":3,"722":1,"727466":3,"73":3,"730667":3,"734":6,"736871":2,"737435":1,"74":3,"743443":4,"747320":3,"7500":3,"755985":2,"76":[1,3],"768":[2,3],"768000":3,"773130":3,"776923":2,"78":3,"780":1,"781":2,"784108":3,"79":3,"79719":4,"8":[1,2,3,9,10,11,46,48],"80":[3,48],"800002":1,"806694":4,"807":2,"809":2,"809875":3,"81":3,"810":2,"811163":1,"812":[1,2],"814814":2,"8192":1,"82":3,"822093":3,"823517":1,"83":3,"833":1,"838026":4,"8388608":1,"839992":2,"84":3,"842":1,"84284":4,"843":1,"847":1,"847460":3,"848":1,"849":1,"85":3,"850":1,"854847":2,"86":3,"863938":4,"87":3,"88":3,"8828":3,"8867":3,"888887":3,"89":3,"8906":3,"8945":3,"896":3,"8mn":2,"9":[0,1,2,3,4],"90":3,"90567":4,"908442":3,"908642":3,"91":3,"914919":3,"918570":3,"9219":3,"93":[2,3],"932191":3,"9375":3,"94":[2,3],"943675":3,"943708":3,"9492":3,"95":2,"952835":4,"9531":3,"959706":3,"96":[2,3],"9688":3,"97":2,"971025":3,"971190":2,"9733":1,"978909":3,"98":2,"9805":3,"980725":3,"983276":3,"98432":1,"9844":3,"993854":3,"999995":1,"abstract":[8,9],"break":9,"byte":2,"case":[1,2,8,9,12,15,16,17,18,19],"class":[2,8,9,10,47],"default":48,"do":[2,3,8,9,24,43],"float":[2,8,9,48],"function":[1,2,3,4,9,11,12,13,47,48,49],"import":[1,2,3,4,8,9],"int":[1,8,9,12,14,20,31,32,38,46,48],"new":[20,38,46],"return":[1,2,3,4,14,15,16,17,18,19,22,24,26,28,31,32,33,34,35,36,37,44,45,46,48,49],"static":[0,8,9],"super":3,"switch":3,"true":[1,2,3,45],"try":[3,10],"var":9,"voil\u00e0":4,"while":[3,8],A:[3,4,8,9],And:[0,3],As:[2,3,4,8,9],At:[4,9],But:4,By:48,For:[3,8,9,10],If:[4,9,34,43,45,47],In:[1,2,3,4,9],It:[1,3,4,5,7,9,13],Of:8,On:9,One:3,The:[1,2,3,4,8,9,15,16,17,18,19,20,22,31,32,33,34,35,36,38,43,45,49],There:1,These:9,To:[1,4,8,9,11],__expf:2,__init__:[10,47],_dropout:4,_matmul:3,_seeded_dropout:4,a100:[3,9],a_ptr:3,ab:1,abl:9,about:[1,2,3,4,7],abov:[1,2,3,4,9,11],academ:8,acc:[3,8,9],acceler:8,access:[1,3,8,9,13],accomod:3,accordingli:9,account:9,accumul:[3,9],accuraci:[3,8],achiev:[3,8,9],across:[2,4,8,9],activ:3,actual:[3,8,9],add:[1,4,6,15],add_kernel:1,addit:[2,5,6,8,48],addition:9,address:[8,24],adopt:9,advanc:[2,3,8],advoc:9,affect:3,affin:9,after:3,against:[0,1,2,3,7],aggress:[8,9],agnost:[8,9],ahead:9,aim:[2,7],al:[8,9],alex:4,algebra:9,algorithm:[3,4,8,9],alia:9,all:[2,3,4,5,8,9,11,26,28,30,44,47],allclos:[2,3],allen1984:9,allen:9,alloc:[1,2,3,8],allow:[1,2,8,9],along:[1,3,26,28,31,32,44,48],also:[1,2,3,4,8,9],altern:4,alwai:[9,45],amd:8,amen:9,amount:8,ampl:9,an:[1,2,3,4,8,9,10,15,16,17,18,19,33,34,35,36],analog:1,analysi:[8,9],analyz:9,ancourt1991:9,ancourt:9,ani:[1,2,3,9,11,12,47],anoth:[2,9],anytim:11,apart:9,api:47,appear:47,appli:[3,4,8,9],applic:[4,9,12],approach:[8,9],appropri:1,approxim:2,ar:[0,1,2,3,4,8,9,11,13,24,30,43,45,47],arang:[1,2,3,4],arbitrari:3,architectur:[3,8],area:9,arg:[1,2,3,12,13,47],argument:[1,2,3,10,11,12,13,45,47],arrai:[9,46],arrang:3,art:[8,9],artifici:4,arxiv:[8,9],ask:2,aspect:9,asplo:8,assert:[1,3,4],assum:[2,47],asynchron:[1,8],atom:[15,16,17,18,19],auguin1983:8,auguin:8,auto:[2,3,9,10,11,12],autom:8,automat:[2,3,8,9,10],autotun:[3,9],avail:[0,4,8,9],avoid:[2,11,45],awar:8,awkward:4,axi:[1,2,3,4,26,28,31,32,44,47],b:[3,8,9],b_ptr:3,back:[1,2,3,4],backpropag:4,bad:4,baghdadi2021:[8,9],baghdadi:[8,9],balanc:9,bandwidth:2,base:[4,7,8,9],basic:[1,5,9],becom:8,been:[1,8,9],befor:[3,11,15,16,17,18,19],begin:9,behavior:[9,11],being:[2,4],believ:9,below:[4,5,9],bench:0,benchmark:[0,48,49],benefit:[2,8,9],best:[1,8],between:[1,8],bit:4,block:[1,2,3,4,8,9,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,33,34,35,36,37,38,39,40,41,42,43,44,45,46],block_siz:[1,2,4,9,11,12],block_size_k:3,block_size_m:3,block_size_n:3,block_start:[1,4],blue:[1,2,3],boil:9,bool:[45,47],both:[9,45],bound:[1,2,3,9],branch:9,broad:8,broadcast:[20,24,43,45],build:[0,3],built:[1,9],c:[3,8,9],c_mask:3,c_ptr:3,cach:[8,9],call:[1,3,9,13,34],callabl:[1,12,13,48],can:[0,1,2,3,4,8,9,11,49],cannot:[3,8,9],capabl:[7,8],cd:0,cdiv:[1,3,4],ceil:12,certain:12,cgo:[8,9],challeng:4,chang:[3,4,11],chapter:7,characterist:9,cheap:8,check:[3,7],checkpoint:4,chen2018:8,chen:8,chip:2,choic:7,click:[1,2,3,4],clone:0,close:9,cmake:0,cmp:[15,16,17,18,19],coalesc:8,code:[1,2,3,4,5,8,9],col:[3,9],col_offset:2,color:47,column:[2,3],com:0,combin:8,come:[2,3,9],command:0,common:9,commonli:9,compar:[2,3,4,7,9,15,16,17,18,19],compat:22,compil:[2,3,7,8,10,13,30],complet:9,complex:9,compos:[4,8],composit:9,comprehens:[8,9],comput:[4,7,8,9,12,21,23,25,27,29,39,40,41,42],computation:[8,9],concern:9,concis:[1,47],condit:[9,45],config:[3,11],configur:[3,10,11,49],confirm:2,connectom:8,consecut:9,consequ:8,consid:2,consist:4,constraint:[3,9],construct:8,constructor:47,consum:3,contain:[9,15,16,17,18,19,47],contextu:9,contigu:[3,14,37],control:[8,9],conveni:3,convert:[1,3,13],convolut:8,cooper:10,copi:[4,8,15,16,17,18,19],core:[8,9],correct:1,correspond:[1,2,3,47],cosin:21,cost:9,could:[2,9],cours:8,cpython:0,creat:[1,2,3,8],crucial:4,csv:1,cubla:[3,8],cuda:[1,2,3,4,8],cudnn:8,current:32,custom:[1,2,3,7],cut:3,cvpr:8,d:[2,4,11,13],dart:9,darte1999:9,data:[1,3,4,8,9,15,16,17,18,19,24,45,46],data_ptr:13,dataflow:9,david:4,deal:4,decad:8,declar:1,decompos:9,decor:[1,3,11,12,13],decreas:4,dedic:3,deep:[3,4,8,9],def:[1,2,3,4,11,12],defin:[1,2,3,9,24],definit:9,denomin:2,denot:1,dens:9,depend:[0,9,45],deploi:8,describ:[4,9],design:9,desir:[20,38],detail:[3,9],detect:8,develop:[8,9],devic:[1,2,3],dialect:9,dict:12,dictionari:[10,12],diesel:9,differ:[1,2,3,4,8,9,47],difficult:9,difficulti:[3,8],dijkstra82:9,dijkstra:9,dim:[2,9],dimens:[3,22,26,28,44],dimension:[3,9,22],dir:0,direct:3,disjoint:9,disk:1,dissert:9,distribut:[2,4,9],divis:3,dnn:[7,8,9],do_bench:[1,2,3],doc:4,doe:[1,2,3,9],doesn:9,domain:[8,9],don:[1,2,3],done:[3,8,26,28,44],dot:3,doubli:3,doubt:9,down:[3,9],download:[0,1,2,3,4,5],dram:[1,2],dropout:[5,6],dror:4,dsl:[7,8,9],dtype:[1,2,3,15,16,17,18,19,24,43,46],e:[0,2,3,4,8,9,46],each:[1,2,3,4,8,9,10,12],eas:9,easi:[3,4],easier:[1,2,8],easili:3,ed:[1,3],education:2,effect:9,effici:[3,4,8,35],effort:9,either:[1,31,32,45],elango2018:9,elango:9,element:[1,2,3,4,21,23,25,26,27,28,29,39,40,41,42,43,44,45,47],element_s:2,element_ti:[15,16,17,18,19,24,43],elementwis:[2,24],els:3,emerg:8,empti:3,empty_lik:[1,2,4],enabl:9,encod:9,encourag:4,end:[8,9,14],enforc:9,engin:9,enqueu:[1,2],ensur:9,entir:9,entri:35,environ:7,equal:9,error:3,especi:8,et:[4,8,9],euromicro:8,evalu:[3,4,11,45],even:[4,9],evidenc:8,evolv:8,exampl:[1,2,3,4,5,8,9,10],exchang:19,execut:[6,8,9,10,49],exist:[8,9],exp:2,expect:[2,15,16,17,18,19],expens:[8,9,12],explor:[4,8],exponenti:[2,23],express:[8,9],extar:1,extend:[3,4],extract:3,extrem:9,f:[1,2,3,9],facilit:[8,9],fact:9,fairli:3,fals:[24,43,45,47,48],far:2,fast:[2,8,9],faster:[2,34],fastest:9,feel:3,fetch:8,few:9,field:8,figur:9,file:[1,2,3,6],fill:46,fine:4,first:[1,3,4,7,9,22,27,29],first_pid_m:3,firstli:4,fit:2,fix:47,flag:2,flatten:37,flexibl:8,float16:[3,22,46],float32:[1,2,3,4,22,33,36],flow:[8,9],fly:4,fn:[13,48],focu:[3,9],folder:4,follow:[0,2,3,7,8,9],footprint:4,forc:4,forget:1,formal:9,format:9,found:[15,16,17,18,19],foundat:9,four:35,fp16:3,fp32:3,frac:4,framework:[8,9],free:3,from:[1,2,3,4,8,9,24,45],full:[1,2,3,4],fulli:9,func:9,fundament:9,further:[4,9],fuse:[3,5,6],fusion:[2,9],g:[3,4,8,9,46],galleri:[1,2,3,4,5],gb:[1,2],gbp:[1,2],gener:[1,2,3,4,5,8,9,33,34,35,36,47],geoffrei:4,geq:9,get:[1,2,3,4,6],girbal2006:9,girbal:9,git:0,github:0,give:8,given:[2,3,4,20,31,32,33,34,35,36,38,46],global:9,go:[1,3,9],good:[1,9],gpgpu:8,gpu:[1,2,4,7,8,9,10,13],grad_to_non:48,gradient:48,grammat:9,graphic:8,greater:2,green:[1,2,3],grid:[1,2,3,4,31,32],grid_m:3,grid_n:3,grosser2012:9,grosser:9,group:3,group_id:3,group_m:3,group_size_m:3,grow:9,guard:[1,2],guid:8,ha:[1,3,4,8,9,31,32],had:1,halid:[8,9],hand:9,handl:[1,2,4,9],handwritten:8,hard:3,harder:9,hardwar:[3,7,9],hasn:1,have:[2,4,8,9,13,22,45,47],heavi:8,helper:[1,2],henc:3,here:[1,2,3,4],heurist:2,hierarch:8,hierarchi:9,high:[3,8,9],higher:3,highli:8,highlight:9,hint:9,hinton:4,hit:3,how:[1,2,3,7,8,12],howev:[2,9],html:4,http:[0,4],i:[1,2,3,4,8,9],id:[3,32],idea:8,ideal:2,ident:2,identifi:1,idx:[24,43],ilya:4,imag:[8,9],implement:[1,2,3,4,8,9],implicitli:[1,13,24,43],importantli:9,impos:9,improv:[3,4],incompat:[3,9],incorrect:3,increas:[1,2,3,4],incred:8,increment:9,inde:9,independ:[2,9],index:1,indic:[9,45],induc:9,industri:8,inequ:9,inf:2,inform:9,infrastructur:9,initi:[1,3],inner:[3,22],inplac:3,input:[1,2,3,4,9,12,20,21,22,23,25,26,27,28,29,30,37,38,39,40,41,42,44],input_ptr:2,input_row_strid:2,instal:7,instanc:[1,2,3,4,8,10,31,32],instanti:4,instead:[2,45],instruct:[7,8],int1:[24,43],int32:[4,34,35],integ:9,interchang:9,interest:[8,9],intermedi:9,intern:[2,9],interv:14,intrins:9,introduc:4,introduct:7,invari:[2,9],invoc:4,ipynb:[1,2,3,4],ir:9,irregular:[2,9],is_contigu:[3,4],is_cuda:1,isn:3,issu:[8,9],iter:[3,8,9],its:[1,2,3,9],j:[3,8,9],jit:[1,2,3,4,11,12],jmlr:4,john:4,johnson:4,journal:9,jrk2013:8,jupyt:[1,2,3,4,5],just:[3,9,12],k:[3,4,8,9],kb:8,keep:4,kei:[3,8,11],kellei:8,kernel:[4,7,8,10,11,12],keyword:[1,10],ki:9,kind:2,know:30,known:9,krizhevski:4,kwarg:13,label:[1,2,3,47],lam1991:8,lam:8,lambda:[1,2,3,4,12],languag:[1,2,3,4,7,8,13],larg:[8,9],last:3,later:[2,9],latest:0,lattner2004:9,lattner2019:9,lattner:9,launch:[1,2,3,31,32],law:9,layer:[8,9],lead:[4,8,9],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,7,8,9],least:9,lee2017:8,lee:8,left:9,legal:9,length:1,less:[4,8,9],let:[1,2,4,30],letter:9,level:[3,8,9],li:8,librari:[0,3,8,9],lifelong:9,like:[1,4,8,9,34],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,9,47],line_arg:[1,2,3,47],line_nam:[1,2,3,47],line_v:[1,2,3,47],linear:[8,9],link:0,list:[1,3,11,12,47,48,49],litteratur:9,ll:4,llvm11:0,llvm:[0,9],load:[1,2,3,4,9,45],local:[8,9],locat:[3,15,16,17,18,19,24,43],log2:12,log:47,logarithm:[1,25],look:[4,7,8],loop:[3,9,10],low:[5,6,9],m:[0,2,3,8],machin:[8,9],machineri:[8,9],made:8,mai:[2,9,12],main:[3,8,9],maintain:[2,9],major:[3,9],make:[1,2,8,9],manag:[4,8],mani:[1,8,9],manual:[2,9],manual_se:[1,2,3],map:3,mapl:9,mark:[4,49],markedli:8,mask:[1,2,3,4,15,17,18,19,24,43,45],match:[3,15,16,17,18,19],math:12,mathbb:9,mathbf:9,mathcal:[9,36],mathemat:9,matmul:[3,9],matmul_kernel:3,matric:[2,3],matrix:[2,4,5,6,8,9,10,22],matrix_s:9,matter:[3,8,9],max:[1,2,17],max_m:[1,2,3],maxim:[7,9,35],maximum:[1,2,26],mb:[6,8],mean:[3,9,11],mechan:[2,9],median:48,memori:[1,2,3,5,6,8,9,15,16,17,18,19,24,43,45],mention:3,meta:[1,2,3,4,10,11,12],metaparamet:1,method:[9,10,13,47,49],methodolog:9,micro:8,min:[3,18],min_m:[1,2,3],minimum:28,minut:[1,2,3,4],miss:9,mitig:9,ml:8,mlir:9,mn:2,model:[1,8,9],modern:[3,7,8,9],modular:9,moor:9,mora:4,more:[2,3,4,7,8,9,47],most:[3,9],mostli:10,move:3,movement:4,ms:[1,2,3,48],much:[2,3],mullapudi2016:9,mullapudi:9,multi:[3,8,9],multipl:[1,4,5,6,8,9,10,11,30,34],multipli:[3,4,9,22],must:[2,3,14,22,45],n:[2,3,8,36],n_col:2,n_element:[1,4],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,11,12,47],nativ:[1,2,3],natur:[2,8,25],nb:8,necessari:2,need:[1,2,3,4,34],nelement:2,nest:[3,9],net:9,network:[4,8,9],neural:[4,8,9],neurosci:8,never:4,next:[2,3],next_power_of_2:2,nightli:0,nip:8,nitish:4,nn:3,non:8,none:[2,3,11,15,17,18,19,24,43,47,48],nonzero:45,norm:4,normal:[2,3],note:[0,1,2,3,4,9,11,13,45],notebook:[1,2,3,4,5],notic:[2,9],notori:[3,8],novel:8,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,10],num_warp:[2,3,10,11],number:[1,2,3,4,9,10,31,33,34,35,36],numel:[1,4],numer:[2,8],nvidia:8,o:[2,4],object:[1,3,8,10,11,13,15,16,17,18,19],obtain:1,obvious:2,occur:9,offer:8,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,33,34,35,36],often:3,omega:9,onc:[2,8,9],one:[2,3,4,5,8,9,47],onli:[2,3,4,8,9,13],op:[1,2],open:14,openai:0,opencl:8,oper:[1,2,3,4,5,8,15,16,17,18,19,45],opportun:8,opsila:8,optim:[8,9],option:[1,3,24,43,47,48],order:[2,3,5,9],org:4,origin:9,osdi:8,other:[2,3,4,7,9,13,22,24,27,29],otherwis:[4,45],our:[1,2,3,8],out:[1,2,3,4,7,9],outlin:9,output2:4,output3:4,output:[1,2,3,4],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,8,9],overfit:4,overflow:2,own:3,p:[4,9],pa:3,packag:13,pact:9,pad:2,par:3,paradigm:[8,9],paragraph:4,parallel:[1,2,3,4,7,8,9,10],paralleliz:8,param:12,paramet:[1,3,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49],parametr:8,part:[3,4,9],particular:[2,3],particularli:[8,9],partit:8,pass:[1,9,10],past:[8,9],path:1,pattern:8,pb:3,peak:9,per:[2,4],percentil:48,perf:3,perf_report:[1,2,3,47],perform:[1,2,4,8,9,15,16,17,18,19,48],persist:4,person:9,perspect:9,pgm:1,phase:9,philosophi:9,philox:[4,35],pid:[1,3,4],pid_m:3,pid_n:3,pip:0,pipelin:[8,9,10],platform:[7,9],pldi:8,plot:[0,1,2,3,47],plot_nam:[1,2,3,47],pmatrix:9,point:[1,9,35],pointer:[1,2,4,13,15,16,17,18,19,24,43],pointerdtyp:[15,16,17,18,19,24,43],polli:9,polyhedr:8,polyhedra:9,popular:9,portabl:[8,9],pose:8,posit:12,possibl:[1,2,3,9,10],power:[2,4,9,12,14],ppopp:9,practic:[1,2,3,8],pragma:8,pre:[0,8],prealloc:1,predict:9,prefer:2,premis:8,present:[0,3],preserv:9,preserve_rng_st:4,prevent:[4,9],primer:9,primit:[8,13],principl:9,print:[1,2,3,4],print_data:[1,2,3],prng:4,probabl:[4,9],problem:1,problemat:9,procedur:9,process:[1,8,9],processor:8,produc:[3,4],product:[7,9,22],program:[1,2,3,4,7,8,31,32],program_id:[1,2,3,4],programm:[8,9],prohibitev:12,project:[4,8],promot:[3,9],properli:2,properti:9,propos:8,proprietari:3,provid:[1,2,3,4,7,9,11,26,28,44,48],prune:4,pseudo:[3,4,35],pseudorandom:4,ptr:3,purpos:[8,9],push:9,put:4,py:[0,1,2,3,4,6],pypi:0,pytest:0,python:[1,2,3,4,5,13],pytorch:[1,2,4],qquad:9,r:[0,2],ragan:8,rand:[1,4],randint4x:34,randn:[2,3,4],random:[4,33,34,35,36],randomli:4,rang:[1,2,3,8,9],rapidli:[8,9],rate:3,rather:8,raw:1,rdom:9,re:[1,3],read:[2,3,5],reader:9,real:8,reason:9,recent:8,recommend:5,recomput:[4,8],record_clock:48,rectifi:8,redmon2016:8,redmon:8,reduct:[2,26,28,44],refer:1,regardless:[4,45],regim:4,regrett:8,regular:[4,9],rel:[1,9],relat:7,releas:[0,8],reli:9,relu:3,remain:[8,47],rememb:3,reorder:9,rep:48,repetit:48,repres:[2,3,9,10],requir:[0,2,4,9],research:[8,9],reset:[11,48],reset_to_zero:11,resolut:9,resourc:8,resp:9,respect:9,restrict:9,result:[0,1,2,8,9],ret:2,retriev:9,reus:3,revisit:8,right:9,rise:9,role:9,ron:4,root:42,roughli:3,row:[2,3,4],row_idx:2,row_minus_max:2,row_start_ptr:2,run:[0,1,2,3,4,7,9,11,13,49],runtim:[9,48],ruslan:4,rvar:9,s:[1,2,4,9,35],said:9,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,8,47],sato2019:9,sato:9,save:[1,2,3],save_path:1,sc:9,scalabl:9,scalar:[4,8,22,33,34,35,36,46],scale:47,scan:9,schedul:8,scienc:9,scientif:9,scop:9,scope:9,script:[0,1,2,3,4],second:[1,2,3,4,9,22,27,29],secondli:4,section:[3,9],see:[1,2,3,4,9],seed:[33,34,35,36],seeded_dropout:4,seem:[1,9],select:[8,9,45],self:[10,47],semant:9,semi:9,sens:[1,8,9],separ:9,sequenc:8,set:[1,4,9],setup:0,sever:[8,9],shall:9,shape:[2,3,4,9,20,24,38,43,45,46],share:8,shaw:4,shift:2,should:[1,3,8,9,10,26,28,44,47],show_plot:[1,2,3],shown:9,side:9,sight:9,signal:8,significantli:2,sigplan:9,simd:8,simpl:[1,2,3,4],simplest:5,simpli:9,simplic:3,simplifi:4,sinc:[1,2,3],sine:40,singl:[2,4,8,34],size:[1,2,4,9],slower:[8,9],slowest:9,sm80:10,sm:9,smaller:[3,4],smallest:[2,12],snemi3d:8,so:[1,2,3,4,9],softmax:[4,5,6],softmax_kernel:2,softmax_output:2,softwar:10,solid:9,solut:3,solv:9,some:3,sometim:9,sourc:[1,2,3,4,5,9],space:[8,9],spars:[4,8,9],spatial:9,speak:3,special:8,specif:[3,8],specifi:[9,12,15,16,17,18,19,43],speed:2,sphinx:[1,2,3,4,5],split:9,spmd:[1,8,9],squar:42,sram:[2,3],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:10,standard:9,start:[5,14],started_tutori:6,state:[4,8,9],statement:9,step:9,still:[1,2,3,9],stop:14,store:[1,2,3,4,15,16,17,18,19,45],str:[11,12,47],straightforward:3,strategi:[4,9],stream:34,strength:8,stride:[2,3,4],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[8,9],style:[1,2,3,47],subscript:9,substanti:8,substract:2,subtract:2,successfulli:9,suffer:9,suit:8,sum:[1,2],superhuman:8,support:[4,9],sure:2,surprisingli:8,surround:9,suspicion:2,sutskev:[4,8],sutskever2014:8,swap:[15,16,17,18,19],swizzl:8,synchron:[1,8],system:[0,3,8,9],t:[1,2,3,9],t_:9,tabul:4,taco:9,take:[3,4,7,12],taken:9,target:8,techniqu:[3,8,9],temperatur:4,tempor:9,tend:9,tension:8,tensor:[1,2,3,4,8,9,11,13,48],tensorrt:8,test:[0,1,7],text:9,tflop:3,th:48,than:[2,3,8,9,34,47],thei:[3,8,9],them:1,themselv:3,theoret:2,therebi:9,therefor:3,theta:9,theta_:9,thi:[1,2,3,4,8,9,11,12,13,35,47],thing:[1,4],think:2,those:2,though:[8,9],thought:9,thread:[2,8,10],through:[5,9],throughout:[9,47],throughput:7,tile:9,time:[0,1,2,3,4,8,9,11,34,48],tiramisu:[8,9],tl:[1,2,3,4,46],tmp:0,tog:9,togeth:4,tolist:4,topic:9,torch:[1,2,3,4,13,48],torch_output:3,torch_relu:3,total:[1,2,3,4,6],tradit:[4,8,9],transform:[4,9],travers:9,trend:8,tri:[20,38],trick:2,tricki:4,trigger:[3,11],triton:[0,1,2,3,4,5,8,9],triton_output:3,trivial:8,tune:[2,3,9,11,12],tuner:10,tupl:[1,20,38,46],tutori:[1,2,3,4,7],tutorials_jupyt:5,tutorials_python:5,tvm:[8,9],two:[1,2,3,9,11,12,14,22],txt:0,type:[12,22,45,46],typecast:[24,43],typic:9,u:[0,33],un:9,uncommon:9,underneath:9,understand:2,undesir:11,unfortun:[3,9],unifi:8,uniformli:4,unint:45,unit:[0,8],univers:9,unrol:9,up:2,updat:[3,9,11],us:[1,2,3,4,8,9,10,11,12,13,34,45,47,49],util:1,v100:9,val:[15,16,17,18,19],valid:1,valu:[1,2,3,4,11,12,14,15,16,17,18,19,21,23,24,25,26,28,30,39,40,41,42,43,44,45,46,47,49],valuabl:2,variabl:[3,10],variant:8,variou:5,vasilach:[8,9],vasilache2018:[8,9],vast:9,vec:9,vector:[4,5,6,8,9],vendor:3,veri:[2,4,9],verif:9,verifi:[2,9],via:9,view:37,visibl:9,vision:8,vs:0,w:9,wa:4,wai:[2,3,4],want:[2,4,45],warmup:48,warp:[2,10],wast:2,we:[1,2,3,4,8,9],well:[4,8,9],whatev:11,wheel:0,when:[2,3,4,8,9,10,11,13,45],where:[1,3,4,9,12,43],whether:[8,47],which:[1,2,3,4,8,9,11,26,28,44,47],whose:[1,2,3,4,9,11,24],wide:9,wise:[1,2,21,23,25,27,29,39,40,41,42,43],wish:[3,9],within:[3,13,14],without:9,wolf:9,wolfe1989:9,won:2,word:9,work:[2,4,7,8],workload:[3,10],wors:[3,8,9],would:[1,2,4],wouldn:9,wrapper:3,write:[1,2,3,4,5,7,9],wrote:2,x:[1,2,3,4,9,21,23,25,27,29,37,39,40,41,42,45,47],x_keep:4,x_keep_ptr:4,x_log:[1,47],x_max:2,x_name:[1,2,3,47],x_ptr:[1,4,11,12],x_size:[11,12],x_val:[1,2,3,47],xi:9,xii:9,xlabel:47,xo:9,y:[1,2,3,9,27,29,45,47],y_log:47,y_name:[1,2],y_ptr:1,y_torch:2,y_triton:2,year:9,yet:[8,9],yi:9,yield:45,yii:9,ylabel:[1,2,3,47],yo:9,you:[0,1,2,3,4,5,8,11,34,45],your:[0,1,7],yourself:[2,3],z:[1,2,9],zero:[3,4,11],zip:5},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:9,algebra:51,api:7,arang:14,arithmet:3,atom:51,atomic_add:15,atomic_ca:16,atomic_max:17,atomic_min:18,atomic_xchg:19,autotun:11,baselin:4,benchmark:[1,2,3,47],binari:0,broadcast_to:20,cach:3,challeng:8,co:21,comparison:51,compil:[9,51],comput:[1,2,3,6],config:10,creation:51,distribut:0,do_bench:48,document:7,dot:22,dropout:4,exercis:4,exp:23,from:0,further:7,fuse:2,gener:51,get:7,go:7,heurist:12,hint:51,index:51,instal:0,introduct:8,jit:13,kernel:[1,2,3],l2:3,languag:[9,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,51],limit:9,linear:51,load:24,log:25,low:4,manipul:51,math:51,matrix:3,max:26,maximum:27,memori:[4,51],min:28,minimum:29,model:51,motiv:[2,3,8],multipl:3,multiple_of:30,num_program:31,number:51,op:51,optim:3,packag:0,perf_report:49,perform:3,pointer:3,polyhedr:9,program:[9,51],program_id:32,python:[0,7],rand:33,randint4x:35,randint:34,randn:36,random:51,ravel:37,reduct:51,refer:[4,8,9],relat:9,represent:9,reshap:38,result:3,s:7,schedul:9,seed:4,shape:51,sigmoid:39,sin:40,softmax:[2,41],sourc:0,sqrt:42,squar:3,start:7,store:43,sum:44,test:[2,3,47,48,49,52],time:6,triton:[7,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52],tutori:5,unit:[2,3],vector:1,welcom:7,where:45,work:9,zero:46}})
\ No newline at end of file
+Search.setIndex({docnames:["getting-started/installation","getting-started/tutorials/01-vector-add","getting-started/tutorials/02-fused-softmax","getting-started/tutorials/03-matrix-multiplication","getting-started/tutorials/04-low-memory-dropout","getting-started/tutorials/05-layer-norm","getting-started/tutorials/index","getting-started/tutorials/sg_execution_times","index","programming-guide/chapter-1/introduction","programming-guide/chapter-2/related-work","python-api/generated/triton.Config","python-api/generated/triton.autotune","python-api/generated/triton.heuristics","python-api/generated/triton.jit","python-api/generated/triton.language.arange","python-api/generated/triton.language.atomic_add","python-api/generated/triton.language.atomic_cas","python-api/generated/triton.language.atomic_max","python-api/generated/triton.language.atomic_min","python-api/generated/triton.language.atomic_xchg","python-api/generated/triton.language.broadcast_to","python-api/generated/triton.language.cos","python-api/generated/triton.language.dot","python-api/generated/triton.language.exp","python-api/generated/triton.language.load","python-api/generated/triton.language.log","python-api/generated/triton.language.max","python-api/generated/triton.language.maximum","python-api/generated/triton.language.min","python-api/generated/triton.language.minimum","python-api/generated/triton.language.multiple_of","python-api/generated/triton.language.num_programs","python-api/generated/triton.language.program_id","python-api/generated/triton.language.rand","python-api/generated/triton.language.randint","python-api/generated/triton.language.randint4x","python-api/generated/triton.language.randn","python-api/generated/triton.language.ravel","python-api/generated/triton.language.reshape","python-api/generated/triton.language.sigmoid","python-api/generated/triton.language.sin","python-api/generated/triton.language.softmax","python-api/generated/triton.language.sqrt","python-api/generated/triton.language.store","python-api/generated/triton.language.sum","python-api/generated/triton.language.where","python-api/generated/triton.language.zeros","python-api/generated/triton.testing.Benchmark","python-api/generated/triton.testing.do_bench","python-api/generated/triton.testing.perf_report","python-api/triton","python-api/triton.language","python-api/triton.testing"],envversion:{"sphinx.domains.c":2,"sphinx.domains.changeset":1,"sphinx.domains.citation":1,"sphinx.domains.cpp":4,"sphinx.domains.index":1,"sphinx.domains.javascript":2,"sphinx.domains.math":2,"sphinx.domains.python":3,"sphinx.domains.rst":2,"sphinx.domains.std":2,"sphinx.ext.intersphinx":1,sphinx:56},filenames:["getting-started/installation.rst","getting-started/tutorials/01-vector-add.rst","getting-started/tutorials/02-fused-softmax.rst","getting-started/tutorials/03-matrix-multiplication.rst","getting-started/tutorials/04-low-memory-dropout.rst","getting-started/tutorials/05-layer-norm.rst","getting-started/tutorials/index.rst","getting-started/tutorials/sg_execution_times.rst","index.rst","programming-guide/chapter-1/introduction.rst","programming-guide/chapter-2/related-work.rst","python-api/generated/triton.Config.rst","python-api/generated/triton.autotune.rst","python-api/generated/triton.heuristics.rst","python-api/generated/triton.jit.rst","python-api/generated/triton.language.arange.rst","python-api/generated/triton.language.atomic_add.rst","python-api/generated/triton.language.atomic_cas.rst","python-api/generated/triton.language.atomic_max.rst","python-api/generated/triton.language.atomic_min.rst","python-api/generated/triton.language.atomic_xchg.rst","python-api/generated/triton.language.broadcast_to.rst","python-api/generated/triton.language.cos.rst","python-api/generated/triton.language.dot.rst","python-api/generated/triton.language.exp.rst","python-api/generated/triton.language.load.rst","python-api/generated/triton.language.log.rst","python-api/generated/triton.language.max.rst","python-api/generated/triton.language.maximum.rst","python-api/generated/triton.language.min.rst","python-api/generated/triton.language.minimum.rst","python-api/generated/triton.language.multiple_of.rst","python-api/generated/triton.language.num_programs.rst","python-api/generated/triton.language.program_id.rst","python-api/generated/triton.language.rand.rst","python-api/generated/triton.language.randint.rst","python-api/generated/triton.language.randint4x.rst","python-api/generated/triton.language.randn.rst","python-api/generated/triton.language.ravel.rst","python-api/generated/triton.language.reshape.rst","python-api/generated/triton.language.sigmoid.rst","python-api/generated/triton.language.sin.rst","python-api/generated/triton.language.softmax.rst","python-api/generated/triton.language.sqrt.rst","python-api/generated/triton.language.store.rst","python-api/generated/triton.language.sum.rst","python-api/generated/triton.language.where.rst","python-api/generated/triton.language.zeros.rst","python-api/generated/triton.testing.Benchmark.rst","python-api/generated/triton.testing.do_bench.rst","python-api/generated/triton.testing.perf_report.rst","python-api/triton.rst","python-api/triton.language.rst","python-api/triton.testing.rst"],objects:{"triton.Config":{__init__:[11,1,1,""]},"triton.language":{arange:[15,2,1,""],atomic_add:[16,2,1,""],atomic_cas:[17,2,1,""],atomic_max:[18,2,1,""],atomic_min:[19,2,1,""],atomic_xchg:[20,2,1,""],broadcast_to:[21,2,1,""],cos:[22,2,1,""],dot:[23,2,1,""],exp:[24,2,1,""],load:[25,2,1,""],log:[26,2,1,""],max:[27,2,1,""],maximum:[28,2,1,""],min:[29,2,1,""],minimum:[30,2,1,""],multiple_of:[31,2,1,""],num_programs:[32,2,1,""],program_id:[33,2,1,""],rand:[34,2,1,""],randint4x:[36,2,1,""],randint:[35,2,1,""],randn:[37,2,1,""],ravel:[38,2,1,""],reshape:[39,2,1,""],sigmoid:[40,2,1,""],sin:[41,2,1,""],softmax:[42,2,1,""],sqrt:[43,2,1,""],store:[44,2,1,""],sum:[45,2,1,""],where:[46,2,1,""],zeros:[47,2,1,""]},"triton.testing":{Benchmark:[48,0,1,""],do_bench:[49,2,1,""],perf_report:[50,2,1,""]},"triton.testing.Benchmark":{__init__:[48,1,1,""]},triton:{Config:[11,0,1,""],autotune:[12,2,1,""],heuristics:[13,2,1,""],jit:[14,2,1,""]}},objnames:{"0":["py","class","Python class"],"1":["py","method","Python method"],"2":["py","function","Python function"]},objtypes:{"0":"py:class","1":"py:method","2":"py:function"},terms:{"0":[1,2,3,4,5,7,9,10,32,33,34,37,47,49],"00":7,"0000":3,"000000":2,"000001":2,"000002":2,"004273":1,"01":[1,3,7],"016170":5,"02":[2,7],"025776":3,"028308":3,"03":[3,7],"04":[4,7],"042978":3,"046389":5,"047592":3,"05":[5,7],"058574":2,"06":7,"060814":3,"061463":3,"062468":3,"0625":3,"071098":5,"076480":5,"081070":5,"08199":4,"08452":4,"084721":1,"088617":5,"0938":3,"097543":2,"0f":10,"0s":4,"1":[1,2,3,4,5,8,10,13,32,33,34,37],"10":[1,3,4,5,34,35,36,37],"100":[2,49],"1024":[1,3,4,5,12],"10240":5,"102553":5,"102570":5,"1045":3,"1048576":1,"106434":4,"10752":5,"108913":3,"109587":3,"11":[0,1,3,5],"111129":5,"111783":2,"11264":5,"1152":3,"11776":5,"118087":5,"12":[1,3,5,7],"12160":2,"12288":[2,5],"123":4,"12416":2,"125224":3,"12544":2,"12672":2,"127":1,"128":[1,2,3,5,12],"1280":3,"12800":5,"13":[1,3,5],"131072":1,"1328":3,"133":5,"13312":5,"133347":2,"134217728":1,"13686":4,"13824":5,"138541":3,"14":[1,3,5],"1408":3,"142849":2,"14336":5,"14848":5,"149375":2,"149397":4,"15":[1,3,5],"153":2,"1536":[3,5],"15360":5,"154":2,"157889":5,"15872":5,"159180":5,"16":[2,3,5,10,47],"160":2,"162":[2,5],"16384":1,"163950":2,"1664":3,"16777216":1,"17":[3,5],"172588":3,"17879":4,"1792":3,"179533":2,"18":[3,5],"181815":5,"182":5,"1823":2,"19":[1,3,5],"190":2,"190482":1,"191":5,"191387":3,"192":1,"1920":3,"198":2,"1982":10,"1983":9,"1984":10,"1989":10,"1991":[9,10],"1999":10,"1d":[1,2,3],"1e":[1,2,3,5],"1s":4,"2":[1,2,3,4,5,8,10,11,13,32,33,49],"20":[3,5,49],"200000":1,"200001":3,"2004":10,"2006":10,"2011":4,"2012":10,"2013":9,"2014":[4,9],"2016":[9,10],"2017":9,"2018":[9,10],"2019":10,"2021":[9,10],"2048":[2,3,5],"208":5,"2097152":1,"21":[3,5],"212868":4,"2141":1,"214186":4,"214963":2,"216187":2,"2176":3,"217818":5,"219":1,"22":[2,3,5,7],"220":[3,5],"23":[3,5,7],"2304":3,"231237":5,"231788":5,"232":5,"233226":3,"24":[3,5],"242181":3,"243":5,"2432":3,"243907":5,"245":[3,5],"246":5,"249":5,"25":[3,5,49],"252":5,"253":5,"253256":5,"253788":5,"254":5,"256":[1,2,3,5,11],"2560":[3,5],"257":5,"26":[3,5],"260869":3,"261":5,"262":5,"262144":1,"262953":3,"265046":2,"2656":3,"268":5,"2688":3,"269":5,"269692":3,"27":[3,5],"271186":5,"273":5,"28":[1,3,5],"280":5,"2812":3,"2816":3,"286":5,"2891":3,"29":[3,5],"291681":3,"293429":4,"293872":5,"2944":3,"298583":3,"2d":[3,5,23],"2m":2,"2mn":2,"3":[0,1,2,3,4,5,10],"30":3,"304157":3,"3072":[3,5],"3076":1,"31":3,"311":5,"312":[1,7],"3125":3,"313973":5,"319081":3,"32":[3,5,11],"3200":3,"326017":5,"326731":5,"32768":1,"3281":3,"33":3,"3328":3,"333321":1,"33554432":1,"336141":5,"34":3,"341":1,"34172":4,"3438":3,"3456":3,"347":5,"3477":3,"3516":3,"351933":5,"3555":3,"3584":[3,5],"36":3,"360017":2,"362445":1,"364394":3,"368":5,"368085":3,"369354":3,"37":3,"3712":3,"3713":1,"371721":4,"372800":3,"373605":3,"374":5,"375705":3,"38":1,"380953":3,"381977":5,"382":5,"384":[2,3],"3840":3,"384000":3,"384634":5,"387109":5,"39":3,"390218":5,"3906":3,"396":5,"3968":3,"398346":3,"3984":3,"3986":4,"3d":[32,33],"3mn":2,"4":[1,2,3,5,10,11,12,35],"40":3,"400001":1,"400016":[1,2],"4023":3,"403":5,"403344":4,"403347":4,"404206":5,"406":2,"4062":3,"408716":4,"4096":[1,2,3,5],"410":5,"410903":5,"412":2,"414":5,"415":2,"41576":4,"417":5,"4194304":1,"42":7,"420":5,"420235":3,"42142":4,"427":5,"428372":4,"428568":1,"428801":3,"429770":1,"431":[2,5,7],"431969":4,"441739":3,"442":5,"446516":3,"446623":3,"447482":5,"448":5,"448255":1,"4492":3,"45":3,"4531":3,"455":5,"457532":2,"46":3,"4608":5,"4609":3,"461":5,"464755":3,"467168":5,"4688":3,"469040":5,"472":1,"473602":3,"477":[4,7],"481":5,"485074":3,"49":3,"4940":1,"497980":5,"4m":2,"4x":2,"5":[1,3,4,5,10,49],"50":[1,7],"500":5,"5000":3,"500614":3,"501303":5,"502":5,"507":5,"507077":3,"509438":3,"51":3,"511":5,"511628":2,"512":[2,3,4,5],"5120":5,"52":3,"521112":5,"524288":1,"526831":3,"527":5,"527177":3,"528":5,"530610":2,"5312":3,"54":3,"540":5,"541":4,"544":5,"545450":2,"545956":5,"546":2,"546756":2,"551":5,"558":2,"56":3,"5632":5,"563555":3,"563952":3,"565214":5,"566038":2,"568":5,"568431":4,"571432":5,"585":2,"5859":3,"586858":4,"588687":3,"5898":3,"592983":3,"5mn":2,"6":[0,1,3,5],"600000":1,"600004":2,"608294":3,"6094":3,"614":[1,2],"6144":5,"615390":1,"62":3,"620451":5,"64":[1,3,5],"640":[2,3],"643199":3,"64kb":5,"65":3,"652963":5,"655":2,"65536":[1,5],"656000":3,"656574":1,"656713":3,"66":3,"661740":2,"664":2,"6656":5,"666684":2,"67086":4,"67108864":1,"671974":5,"6724":1,"673582":5,"674644":5,"676257":3,"679085":2,"68":3,"682":2,"684049":3,"69":3,"690905":5,"694297":2,"6953":3,"699797":5,"7":[0,1,3,5,10],"70":3,"7031":3,"7070":3,"707878":4,"709358":3,"71":3,"7168":5,"716830":2,"719258":4,"72":3,"722":[1,2],"726532":3,"73":3,"730667":3,"737435":1,"743443":4,"75":3,"7500":3,"754967":2,"759501":5,"76":[1,3],"761569":3,"767793":5,"768":[2,3],"7680":5,"768000":3,"77":3,"770":[5,7],"773587":5,"776119":3,"78":3,"780":1,"781":2,"783176":3,"79":3,"790220":5,"791980":5,"79719":4,"8":[1,2,3,5,10,11,12,47,49],"80":[3,49],"800002":1,"806694":4,"81":3,"812":1,"814":2,"815768":3,"817432":4,"818184":3,"8192":[1,5],"82":3,"820159":3,"823517":[1,2],"827785":3,"83":3,"833":1,"833728":3,"834511":3,"838026":4,"8388608":1,"84":3,"842":1,"84284":4,"847":1,"848":1,"849":1,"849303":3,"85":3,"850":1,"857144":5,"86":3,"862216":5,"867052":3,"87":3,"8704":5,"873762":3,"877538":3,"88":3,"8828":3,"8867":3,"888887":3,"889094":3,"89":3,"890274":5,"8906":3,"8945":3,"896":3,"899428":3,"8mn":2,"9":[0,1,2,3,4,5],"90":3,"907859":5,"91":3,"913":7,"92":3,"9216":5,"9219":3,"923":[3,7],"929456":3,"93":2,"932191":3,"9375":3,"939284":3,"94":[2,3],"9492":3,"95":2,"952835":4,"953074":5,"9531":3,"953642":5,"955488":3,"959706":3,"96":[2,5],"9688":3,"97":2,"971025":3,"9728":5,"9733":1,"978909":3,"98":[2,3],"9805":3,"983276":3,"98432":1,"9844":3,"99":5,"998493":3,"999982":5,"999995":1,"abstract":[9,10],"break":10,"byte":2,"case":[1,2,9,10,13,16,17,18,19,20],"class":[2,5,9,10,11,48],"default":49,"do":[2,3,9,10,25,44],"float":[2,9,10,49],"function":[1,2,3,4,5,10,11,12,13,14,48,49,50],"import":[1,2,3,4,5,9,10],"int":[1,9,10,13,15,21,32,33,39,47,49],"new":[21,39,47],"return":[1,2,3,4,5,12,15,16,17,18,19,20,23,25,27,29,32,33,34,35,36,37,38,45,46,47,49,50],"static":[0,9,10],"super":3,"switch":3,"true":[1,2,3,5,23,46],"try":[3,5,11],"var":[5,10],"voil\u00e0":4,"while":[3,5,9],A:[3,4,9,10],And:[0,3],As:[2,3,4,9,10],At:[4,10],But:4,By:49,For:[3,9,10,11],If:[4,10,35,44,46,48],In:[1,2,3,4,10],It:[1,3,4,6,8,10,12,14],NOT:5,Of:9,On:10,One:3,The:[1,2,3,4,9,10,16,17,18,19,20,21,23,32,33,34,35,36,37,39,44,46,50],There:1,These:10,To:[1,4,6,9,10,12],_:5,__expf:2,__init__:[11,48],_db:5,_dropout:4,_dw:5,_layer_norm_bwd_dwdb:5,_layer_norm_bwd_dx_fus:5,_layer_norm_fwd_fus:5,_matmul:3,_seeded_dropout:4,a100:[3,10],a_ptr:3,ab:1,abl:10,about:[1,2,3,4,8],abov:[1,2,3,4,10,12],academ:9,acc:[3,9,10],acceler:9,access:[1,3,9,10,14],accomod:3,accordingli:10,account:10,accumul:[3,5,10],accuraci:[3,9],achiev:[3,9,10],across:[2,4,9,10],activ:3,actual:[3,9,10],ad:5,add:[1,4,5,7,16],add_kernel:1,addit:[2,6,7,9,49],addition:10,address:[9,25],adopt:10,advanc:[2,3,9],advoc:10,affect:3,affin:10,after:3,against:[0,1,2,3,8],aggress:[9,10],agnost:[9,10],ahead:10,aim:[2,8],al:[9,10],alex:4,algebra:10,algorithm:[3,4,9,10],alia:10,all:[2,3,4,6,9,10,12,27,29,31,45,48],allclos:[2,3],allen1984:10,allen:10,alloc:[1,2,3,5,9],allow:[1,2,5,9,10],allow_tf32:23,along:[1,3,27,29,32,33,45,49],also:[1,2,3,4,5,9,10],altern:4,alwai:[10,46],amd:9,amen:10,amount:[5,9],ampl:10,an:[1,2,3,4,9,10,11,16,17,18,19,20,34,35,36,37],analog:1,analysi:[9,10],analyz:10,ancourt1991:10,ancourt:10,ani:[1,2,3,10,12,13,48],anoth:[2,10],anytim:12,apart:10,apex:5,apex_layer_norm:5,api:48,appear:48,appli:[3,4,5,9,10],applic:[4,10,13],approach:[9,10],appropri:1,approxim:2,ar:[0,1,2,3,4,9,10,11,12,14,25,31,44,46,48],arang:[1,2,3,4,5],arbitrari:3,architectur:[3,9],area:10,arg:[1,2,3,5,11,13,14,48],argument:[1,2,3,11,12,13,14,46,48],arrai:[10,47],arrang:3,art:[9,10],artifici:4,arxiv:[9,10],ask:2,aspect:10,asplo:9,assert:[1,2,3,4],assert_almost_equ:5,assum:[2,48],asynchron:[1,9],atom:[16,17,18,19,20],atomic_ca:5,atomic_xchg:5,auguin1983:9,auguin:9,auto:[2,3,10,11,12,13],autograd:5,autom:9,automat:[2,3,9,10,11],autotun:[3,10],avail:[0,4,9,10],avoid:[2,12,46],awar:9,awkward:4,axi:[1,2,3,4,5,27,29,32,33,45,48],b:[3,5,9,10],b_ptr:3,back:[1,2,3,4,5],backpropag:4,backward:5,bad:4,baghdadi2021:[9,10],baghdadi:[9,10],balanc:10,bandwidth:2,base:[4,8,9,10],basic:[1,6,10],becom:9,been:[1,9,10],befor:[3,11,12,16,17,18,19,20],begin:10,behavior:[10,12],being:[2,4],believ:10,below:[4,6,10],bench:[0,12],bench_layer_norm:5,benchmark:[0,5,49,50],benefit:[2,9,10],best:[1,9],between:[1,9],bfloat16:23,bia:5,bit:4,block:[1,2,3,4,9,10,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,34,35,36,37,38,39,40,41,42,43,44,45,46,47],block_siz:[1,2,4,5,10,12,13],block_size_k:3,block_size_m:[3,5],block_size_n:[3,5],block_start:[1,4],blue:[1,2,3,5],boil:10,bool:[46,48],both:[10,46],bound:[1,2,3,10],branch:10,broad:9,broadcast:[21,25,44,46],buffer:5,build:[0,3],built:[1,10],c:[3,9,10],c_mask:3,c_ptr:3,cach:[9,10,25],cache_modifi:25,call:[1,3,10,11,14,35],callabl:[1,13,14,49],can:[0,1,2,3,4,9,10,12,50],cannot:[3,9,10],capabl:[8,9],cast:5,cd:[0,6],cdiv:[1,3,4,5],ceil:13,certain:13,cgo:[9,10],challeng:4,chang:[3,4,12,25],chapter:8,characterist:10,cheap:9,check:[3,8],checkpoint:4,chen2018:9,chen:9,chip:2,choic:8,click:[1,2,3,4,5],clone:[0,5],close:10,cmake:0,cmp:[16,17,18,19,20],coalesc:9,code:[1,2,3,4,5,6,9,10],col:[3,5,10],col_offset:2,color:48,column:[2,3],com:[0,5],combin:9,come:[2,3,10],command:0,common:10,commonli:10,compar:[2,3,4,5,8,10,16,17,18,19,20],compat:23,compil:[2,3,8,9,11,14,31],complet:10,complex:10,compos:[4,9],composit:10,comprehens:[9,10],comput:[4,5,8,9,10,13,22,24,26,28,30,40,41,42,43],computation:[9,10],concern:10,concis:[1,48],condit:[10,46],config:[3,12],configur:[3,11,12,50],confirm:2,connectom:9,consecut:10,consequ:9,consid:2,consist:4,constexpr:[1,2,3,4,5,34,35,36,37],constraint:[3,10],construct:9,constructor:48,consum:3,contain:[10,16,17,18,19,20,48],contextu:10,contigu:[3,15,38],control:[9,10],conveni:3,convert:[1,3,14],convolut:9,cooper:11,copi:[4,9,16,17,18,19,20],core:[9,10,34,35,36,37],correct:1,correspond:[1,2,3,48],cosin:22,cost:10,could:[2,10],count:5,cours:9,cpython:0,creat:[1,2,3,5,9],crucial:4,csv:1,ctx:5,cubla:[3,9],cuda:[1,2,3,4,5,9],cudnn:9,current:33,custom:[1,2,3,8],cut:3,cvpr:9,d:[2,4,12,14],dart:10,darte1999:10,data:[1,3,4,5,9,10,16,17,18,19,20,25,46,47],data_ptr:14,dataflow:10,david:4,db:5,db_ref:5,db_tri:5,deal:4,decad:9,decim:5,declar:1,decompos:10,decor:[1,3,12,13,14],decreas:4,dedic:3,deep:[3,4,9,10],def:[1,2,3,4,5,12,13],defin:[1,2,3,10,25],definit:10,denomin:2,denot:1,dens:10,depend:[0,6,10,46],deploi:9,describ:[4,10],design:10,desir:[21,39],detail:[3,10],detect:9,develop:[9,10],devic:[1,2,3,5],dg:5,dialect:10,dict:[12,13],dictionari:[11,13],diesel:10,differ:[1,2,3,4,9,10,12,48],difficult:10,difficulti:[3,9],dijkstra82:10,dijkstra:10,dim:[2,5,10],dimens:[3,23,27,29,45],dimension:[3,10,23],dir:0,direct:3,disjoint:10,disk:1,dissert:10,distribut:[2,4,10],divis:3,dnn:[8,9,10],do_bench:[1,2,3,5],doc:4,doe:[1,2,3,10],doesn:[5,10],domain:[9,10],don:[1,2,3],done:[3,9,27,29,45],dot:3,doubli:3,doubt:10,down:[3,10],download:[0,1,2,3,4,5,6],dram:[1,2],dropout:[6,7],dror:4,dsl:[8,9,10],dtype:[1,2,3,5,16,17,18,19,20,25,44,47],dw:5,dw_ref:5,dw_tri:5,dx:5,dx_ref:5,dx_tri:5,dy:5,e:[0,2,3,4,6,9,10,47],each:[1,2,3,4,5,9,10,11,13],eas:10,easi:[3,4],easier:[1,2,9],easili:3,ed:[1,3],education:2,effect:10,effici:[3,4,9,36],effort:10,either:[1,32,33,46],elango2018:10,elango:10,element:[1,2,3,4,5,22,24,26,27,28,29,30,40,41,42,43,44,45,46,48],element_s:[2,5],element_ti:[16,17,18,19,20,25,44],elementwis:[2,25],els:[3,5],emerg:9,empti:[3,5],empty_lik:[1,2,4,5],enabl:10,encod:10,encourag:4,end:[9,10,15],enforc:10,engin:10,enqueu:[1,2,5],ensur:10,entir:10,entri:36,environ:8,ep:5,equal:10,error:3,especi:9,et:[4,9,10],euromicro:9,evalu:[3,4,12,46],even:[4,10],evidenc:9,evolv:9,exampl:[1,2,3,4,5,6,9,10,11],except:5,exchang:20,execut:[7,9,10,11,50],exist:[9,10],exp:2,expect:[2,16,17,18,19,20],expens:[9,10,13],explor:[4,9],exponenti:[2,24],express:[9,10],extend:[3,4],extra:1,extras_requir:5,extrem:10,f:[1,2,3,10],facilit:[9,10],fact:10,fairli:3,fals:[5,25,42,44,46,48,49],far:2,fast:[2,5,9,10],faster:[2,35],fastest:10,featur:5,feel:3,fetch:9,few:10,field:[9,12],figur:10,file:[1,2,3,7],fill:47,final_db:5,final_dw:5,fine:4,first:[1,3,4,5,8,10,23,28,30],first_pid_m:3,firstli:4,fit:2,fix:48,flag:2,flatten:38,flexibl:9,float16:[3,5,23,47],float32:[1,2,3,4,5,23,34,37],flow:[9,10],fly:4,fn:[14,49],focu:[3,10],folder:4,follow:[0,2,3,8,9,10],footprint:4,forc:4,forget:1,formal:10,format:10,forward:5,found:[16,17,18,19,20],foundat:10,four:36,fp16:3,fp32:3,frac:4,framework:[9,10],free:3,from:[1,2,3,4,9,10,25,46],full:[1,2,3,4,5],fulli:10,func:10,fundament:10,further:[4,10],fuse:[3,5,6,7],fusedlayernorm:5,fusion:[2,10],g:[3,4,9,10,47],galleri:[1,2,3,4,5,6],gb:[1,2,5],gbp:[1,2,5],gener:[1,2,3,4,5,6,9,10,34,35,36,37,48],geoffrei:4,geq:10,get:[1,2,3,4,7],girbal2006:10,girbal:10,git:0,github:[0,5],give:9,given:[2,3,4,21,32,33,34,35,36,37,39,47],global:10,go:[1,3,10],good:[1,10],gpgpu:9,gpu:[1,2,4,8,9,10,11,14],grad:5,grad_to_non:[5,49],gradient:[5,49],grammat:10,graphic:9,greater:2,green:[1,2,3,5],grid:[1,2,3,4,5,32,33],grid_m:3,grid_n:3,grosser2012:10,grosser:10,group:3,group_id:3,group_m:3,group_size_m:[3,5],grow:10,guard:[1,2],guid:9,ha:[1,3,4,9,10,32,33],had:1,halid:[9,10],hand:10,handl:[1,2,4,10],handwritten:9,hard:3,harder:10,hardwar:[3,8,10],has_apex:5,hasn:1,have:[2,4,9,10,14,23,46,48],heavi:9,helper:[1,2],henc:3,here:[1,2,3,4,5],heurist:[2,5],hierarch:9,hierarchi:10,high:[3,9,10],higher:3,highli:9,highlight:10,hint:10,hinton:4,hit:3,how:[1,2,3,8,9,13],howev:[2,10],html:4,http:[0,4,5],i:[1,2,3,4,5,9,10],id:[3,33],idea:9,ideal:2,ident:2,identifi:1,idx:[25,44],ieee_round:42,ilya:4,imag:[9,10],implement:[1,2,3,4,9,10],implicitli:[1,14,25,44],importantli:10,impos:10,improv:[3,4],incompat:[3,10],incorrect:3,increas:[1,2,3,4],incred:9,increment:10,inde:10,independ:[2,5,10],index:1,indic:[10,46],induc:10,industri:9,inequ:10,inf:2,inform:10,infrastructur:10,initi:[1,3],inner:[3,23],inplac:3,input:[1,2,3,4,5,10,12,13,21,22,23,24,26,27,28,29,30,31,38,39,40,41,42,43,45],input_ptr:2,input_row_strid:2,instal:[6,8],instanc:[1,2,3,4,5,9,11,32,33],instanti:4,instead:[2,46],instruct:[8,9],int1:[25,44],int32:[4,5,35,36],integ:10,interchang:10,interest:[5,9,10],intermedi:10,intern:[2,10],interv:15,intrins:10,introduc:4,introduct:8,invari:[2,10],invoc:4,ipynb:[1,2,3,4,5],ir:10,irregular:[2,10],is_contigu:[3,4],is_cuda:1,isn:3,issu:[9,10],iter:[3,9,10],its:[1,2,3,10,12],j:[3,9,10],jit:[1,2,3,4,5,12,13],jmlr:4,john:4,johnson:4,journal:10,jrk2013:9,jupyt:[1,2,3,4,5,6],just:[3,10,13],k:[3,4,9,10],kb:9,keep:4,kei:[3,9,12],kellei:9,kernel:[4,5,8,9,11,12,13],keyword:[1,11],ki:10,kind:2,know:31,known:10,krizhevski:4,kwarg:[11,14],l2:5,label:[1,2,3,48],lam1991:9,lam:9,lambda:[1,2,3,4,5,13],languag:[1,2,3,4,5,8,9,14],larg:[9,10],last:3,later:[2,10],latest:0,lattner2004:10,lattner2019:10,lattner:10,launch:[1,2,3,32,33],law:10,layer:[6,7,9,10],layer_norm:5,layernorm:5,lead:[4,9,10],leaky_relu:3,leakyrelu:3,learn:[1,2,3,4,8,9,10],least:10,lee2017:9,lee:9,left:10,legal:10,length:1,less:[4,5,9,10],let:[1,2,4,31],letter:10,level:[3,9,10],li:9,librari:[0,3,9,10],lifelong:10,like:[1,4,9,10,35],limit:[2,4],lindenstrauss:4,line:[1,2,3,4,10,48],line_arg:[1,2,3,5,48],line_nam:[1,2,3,5,48],line_v:[1,2,3,5,48],linear:[9,10],link:0,list:[1,3,12,13,48,49,50],litteratur:10,ll:4,llvm11:0,llvm:[0,10],load:[1,2,3,4,5,10,46],local:[9,10],locat:[3,16,17,18,19,20,25,44],lock:5,lock_id:5,log2:13,log:48,logarithm:[1,26],look:[4,8,9],loop:[3,10,11],low:[6,7,10],m:[0,2,3,5,9],machin:[9,10],machineri:[9,10],made:9,mai:[2,10,13],main:[3,9,10],maintain:[2,10],major:[3,10],make:[1,2,9,10],manag:[4,9],mani:[9,10],manual:[2,10],manual_se:[1,2,3],map:3,mapl:10,mark:[4,50],markedli:9,mask:[1,2,3,4,5,16,18,19,20,25,44,46],match:[3,16,17,18,19,20],math:13,mathbb:10,mathbf:10,mathcal:[10,37],mathemat:10,matmul:[3,10],matmul_kernel:3,matric:[2,3],matrix:[2,4,6,7,9,10,11,23],matrix_s:10,matter:[3,9,10],max:[1,2,5,18],max_fused_s:5,max_m:[1,2,3,5],maxim:[8,10,36],maximum:[1,2,27],mb:[7,9],mean1:5,mean2:5,mean:[3,5,10,12],mechan:[2,10],median:49,memori:[1,2,3,6,7,9,10,16,17,18,19,20,25,44,46],mention:3,meta:[1,2,3,4,5,11,12,13],metaparamet:1,method:[10,11,14,48,50],methodolog:10,micro:9,min:[3,5,19],min_m:[1,2,3,5],minimum:29,minut:[1,2,3,4,5],miss:10,mitig:10,ml:9,mlir:10,mn:2,mode:5,model:[1,9,10,12],modern:[3,8,9,10],modular:10,modulenotfounderror:5,moor:10,mora:4,more:[2,3,4,8,9,10,48],most:[3,10],mostli:11,move:3,movement:4,ms:[1,2,3,5,49],much:[2,3],mullapudi2016:10,mullapudi:10,multi:[3,9,10],multipl:[1,4,6,7,9,10,11,12,31,35],multipli:[3,4,5,10,23],must:[2,3,15,23,46],n:[2,3,5,9,37],n_col:2,n_element:[1,4],n_round:[34,35,36,37],n_row:2,naiv:[2,4],naive_softmax:2,name:[1,2,3,12,13,48],nativ:[1,2,3],natur:[2,9,26],nb:9,necessari:2,need:[1,2,3,4,35],nelement:2,nest:[3,10],net:10,network:[4,9,10],neural:[4,9,10],neurosci:9,never:4,next:[2,3],next_power_of_2:[2,5],nightli:0,nip:9,nitish:4,nn:[3,5],non:9,none:[2,3,5,11,12,16,18,19,20,25,44,48,49],nonzero:46,norm:[4,5,7],normal:[2,6,7],normalized_shap:5,note:[0,1,2,3,4,10,12,14,46],notebook:[1,2,3,4,5,6],notic:[2,10],notori:[3,9],novel:9,now:[1,3],num_pid_in_group:3,num_pid_m:3,num_pid_n:3,num_stag:[3,11,12],num_warp:[2,3,5,11,12],number:[1,2,3,4,5,10,11,12,32,34,35,36,37],numel:[1,4,5],numer:[2,9],nvidia:[5,9,25],o:[2,4],object:[1,3,9,11,12,14,16,17,18,19,20],obtain:1,obvious:2,occur:10,off:5,offer:9,offici:0,offs_am:3,offs_bn:3,offs_cm:3,offs_cn:3,offs_k:3,offset:[1,4,5,34,35,36,37],often:3,omega:10,onc:[2,9,10],one:[2,3,4,5,6,9,10,48],onli:[2,3,4,9,10,14],op:[1,2],open:15,openai:0,opencl:9,oper:[1,2,3,4,6,9,16,17,18,19,20,46],opportun:9,opsila:9,optim:[9,10],option:[3,12,25,44,48,49],orang:5,order:[2,3,6,10],org:4,origin:10,osdi:9,other:[2,3,4,5,8,10,14,23,25,28,30],otherwis:[4,46],our:[1,2,3,9],out:[1,2,3,4,5,8,10],outlin:10,output2:4,output3:4,output:[1,2,3,4,5],output_ptr:[1,2,4],output_row_start_ptr:2,output_row_strid:2,output_torch:1,output_triton:1,over:[2,4,9,10],overfit:4,overflow:2,own:3,p:[4,10],pa:3,packag:14,pact:10,pad:2,par:3,paradigm:[9,10],paragraph:4,parallel:[1,2,3,4,5,8,9,10,11],paralleliz:9,param:13,paramet:[1,3,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50],parametr:9,part:[3,4,10],partial:5,partial_db:5,partial_dw:5,particular:[2,3],particularli:[9,10],partit:9,pass:[1,5,10,11],past:[9,10],path:1,pattern:9,pb:3,peak:10,per:[2,4,5],percentil:49,perf:3,perf_model:12,perf_report:[1,2,3,5,48],perform:[1,2,4,9,10,12,16,17,18,19,20,49],persist:4,person:10,perspect:10,phase:10,philosophi:10,philox:[4,36],pid:[1,3,4,5],pid_m:3,pid_n:3,pip:[0,6],pipelin:[9,10,11],platform:[8,10],pldi:9,plot:[0,1,2,3,48],plot_nam:[1,2,3,5,48],pmatrix:10,point:[1,10,36],pointer:[1,2,4,5,14,16,17,18,19,20,25,44],pointerdtyp:[16,17,18,19,20,25,44],polli:10,polyhedr:9,polyhedra:10,popular:10,portabl:[9,10],pose:9,posit:[5,13],possibl:[1,2,3,10,11],power:[2,4,10,13,15],ppopp:10,practic:[1,2,3,9],pragma:9,pre:[0,9],pre_hook:11,prealloc:1,predic:12,predict:10,prefer:2,premis:9,present:0,preserv:10,preserve_rng_st:4,prevent:[4,10],primer:10,primit:[9,14],principl:10,print:[1,3,4],print_data:[1,2,3,5],prng:4,probabl:[4,10],problem:1,problemat:10,procedur:10,process:[1,5,9,10],processor:9,produc:[3,4],product:[8,10,23],program:[1,2,3,4,5,8,9,32,33],program_id:[1,2,3,4,5],programm:[9,10],prohibitev:13,project:[4,9],promot:[3,10],properli:2,properti:10,propos:9,proprietari:3,provid:[1,2,3,4,5,8,10,12,27,29,45,49],prune:[4,12],prune_configs_bi:12,prune_num_stages_bi:12,pseudo:[3,4,36],pseudorandom:4,ptr:3,ptx:25,purpos:[9,10],push:10,put:4,py:[0,1,2,3,4,5,7],pypi:[0,5],pytest:0,python:[1,2,3,4,5,6,14],pytorch:[1,2,4],qquad:10,r:2,ragan:9,rais:5,rand:[1,4,5],randint4x:35,randn:[2,3,4,5],randn_lik:5,random:[4,34,35,36,37],randomli:4,rang:[1,2,3,5,9,10],rapidli:[9,10],rate:3,rather:9,raw:1,rdom:10,re:[1,3],read:[2,3,6],reader:10,real:9,reason:10,recent:9,recommend:6,recomput:[4,9],record_clock:49,rectifi:9,redmon2016:9,redmon:9,reduct:[2,5,27,29,45],refer:1,regardless:[4,46],regim:4,regrett:9,regular:[4,10],rel:[1,10],relat:8,releas:[0,5,9],reli:10,relu:3,remain:[9,48],rememb:3,reorder:10,rep:[5,49],repetit:49,repres:[2,3,10,11],requir:[2,4,10],requires_grad:5,requires_grad_:5,research:[9,10],reset:[12,49],reset_to_zero:12,reshap:5,resolut:10,resourc:9,resp:10,respect:10,restrict:10,result:[0,1,2,9,10],ret:2,retain_graph:5,retriev:10,reus:3,revisit:9,right:10,rise:10,role:10,ron:4,root:43,roughli:3,row:[2,3,4,5],row_idx:2,row_minus_max:2,row_start_ptr:2,rstd:5,run:[0,1,2,3,4,5,8,10,12,14,50],runtim:[10,49],runtimeerror:5,ruslan:4,rvar:10,s:[1,2,4,5,10,36],said:10,salakhutdinov:4,salmon2011:4,salmon:4,same:[4,9,48],sato2019:10,sato:10,save:[1,2,3],save_for_backward:5,save_path:[1,5],saved_tensor:5,sc:10,scalabl:10,scalar:[4,9,23,34,35,36,37,47],scale:48,scan:10,schedul:9,scienc:10,scientif:10,scop:10,scope:10,script:[0,1,2,3,4,5],second:[1,2,3,4,5,10,23,28,30],secondli:4,section:[3,10],see:[1,2,3,4,10],seed:[34,35,36,37],seeded_dropout:4,seem:[1,10],select:[9,10,46],self:[11,48],semant:10,semi:10,sens:[1,9,10],separ:[5,10],sequenc:9,set:[1,4,10],setup:[0,5],sever:[9,10],shall:10,shape:[1,2,3,4,5,10,21,25,39,44,46,47],share:9,shaw:4,shift:2,should:[1,3,5,9,10,11,27,29,45,48],show_plot:[1,2,3],shown:10,side:10,sight:10,signal:9,significantli:2,sigplan:10,simd:9,simpl:[1,2,3,4],simplest:6,simpli:10,simplic:3,simplifi:4,sinc:[1,2,3],sine:41,singl:[2,4,9,35],size:[1,2,4,10],slower:[9,10],slowest:10,sm80:11,sm:10,smaller:[3,4],smallest:[2,13],snemi3d:9,so:[1,2,3,4,5,10],softmax:[4,6,7],softmax_kernel:2,softmax_output:2,softwar:11,solid:10,solut:3,solv:10,some:3,sometim:10,sourc:[1,2,3,4,5,6,10],space:[9,10],spars:[4,9,10],spatial:10,speak:3,special:9,specif:[3,9],specifi:[10,13,16,17,18,19,20,44],speed:2,sphinx:[1,2,3,4,5,6],split:10,spmd:[1,9,10],sqrt:5,squar:43,sram:[2,3,5],srivastava2014:4,srivastava:4,stabil:2,stabl:0,stage:11,stai:5,standard:10,start:[5,6,15],started_tutori:7,state:[4,9,10],statement:10,staticmethod:5,std:5,step:10,still:[1,2,3,10],stop:15,store:[1,2,3,4,5,16,17,18,19,20,46],str:[12,13,25,48],straightforward:3,strategi:[4,10],stream:[5,35],strength:9,stride:[2,3,4,5],stride_ak:3,stride_am:3,stride_bk:3,stride_bn:3,stride_cm:3,stride_cn:3,stride_xi:3,stride_xj:3,structur:[9,10],style:[1,2,3,5,48],subscript:10,substanti:9,substract:2,subtract:2,successfulli:10,suffer:10,suit:9,sum:[1,2,5],sum_db:5,sum_dw:5,superhuman:9,support:[4,5,10],sure:2,surprisingli:9,surround:10,suspicion:2,sutskev:[4,9],sutskever2014:9,swap:[16,17,18,19,20],swizzl:9,synchron:[1,9],system:[0,3,9,10],t:[1,2,3,5,10],t_:10,tabul:4,taco:10,take:[3,4,8,12,13],taken:10,target:9,techniqu:[9,10],temperatur:4,tempor:10,tend:10,tension:9,tensor:[1,2,3,4,5,9,10,12,14,49],tensorrt:9,test:[0,1,5,8],test_layer_norm:5,text:10,tflop:3,th:49,than:[2,3,5,9,10,35,48],thei:[3,9,10],them:1,themselv:3,theoret:2,therebi:10,therefor:3,theta:10,theta_:10,thi:[1,2,3,4,5,9,10,11,12,13,14,36,48],thing:[1,4],think:2,those:2,though:[9,10],thought:10,thread:[2,9,11],through:[6,10],throughout:[10,48],throughput:8,tile:10,time:[0,1,2,3,4,5,9,10,12,35,49],tiramisu:[9,10],tl:[1,2,3,4,5,47],tmp:0,tog:10,togeth:4,tolist:4,top_k:12,topic:10,torch:[1,2,3,4,5,14,49],torch_output:3,torch_relu:3,total:[1,2,3,4,5,7],tradit:[4,9,10],transform:[4,10],travers:10,trend:9,tri:[21,39],trick:2,tricki:4,trigger:[3,12],triton:[0,1,2,3,4,5,6,9,10],triton_output:3,trivial:9,tune:[2,3,10,12,13],tuner:11,tupl:[1,21,39,47],tutori:[1,2,3,4,8],tutorials_jupyt:6,tutorials_python:6,tvm:[9,10],two:[1,2,3,10,12,13,15,23],type:[13,23,25,46,47],typecast:[25,44],typic:10,u:[0,34],un:10,uncommon:10,underneath:10,understand:2,undesir:12,unfortun:[3,10],unifi:9,uniformli:4,unint:46,unit:[0,9],univers:10,unrol:10,up:2,updat:[3,10,12],us:[1,2,3,4,5,9,10,11,12,13,14,35,46,48,50],util:[1,5],v100:10,v:5,val:[16,17,18,19,20],valid:1,valu:[1,2,3,4,12,13,15,16,17,18,19,20,22,24,25,26,27,29,31,40,41,42,43,44,45,46,47,48,50],valuabl:2,variabl:[3,11],variant:9,variou:6,vasilach:[9,10],vasilache2018:[9,10],vast:10,vec:10,vector:[4,6,7,9,10],vendor:3,veri:[2,4,10],verif:10,verifi:[2,10],via:10,view:38,visibl:10,vision:9,volatil:25,vs:0,w:[5,10],w_shape:5,wa:4,wai:[2,3,4],want:[2,4,46],warmup:49,warp:[2,5,11],wast:2,wdy:5,we:[1,2,3,4,9,10],weight:5,well:[4,9,10],whatev:12,wheel:0,when:[2,3,4,9,10,11,12,14,46],where:[1,3,4,5,10,13,44],whether:[9,48],which:[1,2,3,4,5,9,10,12,27,29,45,48],whose:[1,2,3,4,10,12,25],wide:10,wise:[1,2,22,24,26,28,30,40,41,42,43,44],wish:[3,10],within:[3,14,15],without:10,wolf:10,wolfe1989:10,won:2,word:10,work:[2,4,8,9],workload:[3,11],wors:[3,9,10],would:[1,2,4],wouldn:10,wrapper:3,write:[1,2,3,4,5,6,8,10],wrote:2,x:[1,2,3,4,5,10,22,24,26,28,30,38,40,41,42,43,46,48],x_arg:5,x_keep:4,x_keep_ptr:4,x_log:[1,48],x_max:2,x_name:[1,2,3,5,48],x_ptr:[1,4,12,13],x_shape:5,x_size:[12,13],x_val:[1,2,3,5,48],xhat:5,xi:10,xii:10,xlabel:48,xmean:5,xo:10,y:[1,2,3,5,10,28,30,46,48],y_fwd:5,y_log:48,y_name:[1,2],y_ptr:1,y_ref:5,y_torch:2,y_tri:5,y_triton:2,year:10,yet:[9,10],yi:10,yield:46,yii:10,ylabel:[1,2,3,5,48],yo:10,you:[0,1,2,3,4,6,9,12,35,46],your:[0,1,8],yourself:[2,3],z:[1,2,10],zero:[3,4,5,12],zip:6},titles:["Installation","Vector Addition","Fused Softmax","Matrix Multiplication","Low-Memory Dropout","Layer Normalization","Tutorials","Computation times","Welcome to Triton\u2019s documentation!","Introduction","Related Work","triton.Config","triton.autotune","triton.heuristics","triton.jit","triton.language.arange","triton.language.atomic_add","triton.language.atomic_cas","triton.language.atomic_max","triton.language.atomic_min","triton.language.atomic_xchg","triton.language.broadcast_to","triton.language.cos","triton.language.dot","triton.language.exp","triton.language.load","triton.language.log","triton.language.max","triton.language.maximum","triton.language.min","triton.language.minimum","triton.language.multiple_of","triton.language.num_programs","triton.language.program_id","triton.language.rand","triton.language.randint","triton.language.randint4x","triton.language.randn","triton.language.ravel","triton.language.reshape","triton.language.sigmoid","triton.language.sin","triton.language.softmax","triton.language.sqrt","triton.language.store","triton.language.sum","triton.language.where","triton.language.zeros","triton.testing.Benchmark","triton.testing.do_bench","triton.testing.perf_report","triton","triton.language","triton.testing"],titleterms:{"final":3,addit:1,advantag:10,algebra:52,api:8,arang:15,arithmet:3,atom:52,atomic_add:16,atomic_ca:17,atomic_max:18,atomic_min:19,atomic_xchg:20,autotun:12,baselin:4,benchmark:[1,2,3,48],binari:0,broadcast_to:21,cach:3,challeng:9,co:22,comparison:52,compil:[10,52],comput:[1,2,3,7],config:11,creation:52,distribut:0,do_bench:49,document:8,dot:23,dropout:4,exercis:4,exp:24,from:0,further:8,fuse:2,gener:52,get:8,go:8,heurist:13,hint:52,index:52,instal:0,introduct:9,jit:14,kernel:[1,2,3],l2:3,languag:[10,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,52],layer:5,limit:10,linear:52,load:25,log:26,low:4,manipul:52,math:52,matrix:3,max:27,maximum:28,memori:[4,52],min:29,minimum:30,model:52,motiv:[2,3,9],multipl:3,multiple_of:31,normal:5,num_program:32,number:52,op:52,optim:3,packag:0,perf_report:50,perform:3,pointer:3,polyhedr:10,program:[10,52],program_id:33,python:[0,8],rand:34,randint4x:36,randint:35,randn:37,random:52,ravel:38,reduct:52,refer:[4,9,10],relat:10,represent:10,reshap:39,result:3,s:8,schedul:10,seed:4,shape:52,sigmoid:40,sin:41,softmax:[2,42],sourc:0,sqrt:43,squar:3,start:8,store:44,sum:45,test:[2,3,48,49,50,53],time:7,triton:[8,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53],tutori:6,unit:[2,3],vector:1,welcom:8,where:46,work:10,zero:47}})
\ No newline at end of file