Merge triton-mlir branch - Complete rewrite of the backend from scratch (#1004)

This PR merges the `triton-mlir` branch, in which we have been quietly rewriting the Triton backend from scratch to increase maintainability, stability and ultimately performance. Changes to the runtime are minimal, and this new version aims to remain backward-compatible with the previous commit. The legacy backend is now officially deprecated, but can still be accessed via the `legacy-backend` tag.

Co-authored-by: Keren Zhou <kerenzhou@openai.com>
Co-authored-by: Yan Chunwei <yanchunwei@outlook.com>
Co-authored-by: goostavz <109190422+goostavz@users.noreply.github.com>
Co-authored-by: Shintaro Iwasaki <siwasaki@fb.com>
Co-authored-by: Yan Da <dyanab@connect.ust.hk>
Co-authored-by: Jun Yang <yangjunpro@gmail.com>
Co-authored-by: Ian Bearman <ianb@microsoft.com>
Co-authored-by: Jason Ansel <jansel@jansel.net>
Co-authored-by: Qingyi Liu <qingyil@nvidia.com>
Co-authored-by: ben-zhang-609 <110140741+ben-zhang-609@users.noreply.github.com>
Co-authored-by: Chenggang Zhao <lyricz@yeah.net>
Co-authored-by: ben-zhang-609 <benzh609@gmail.com>
Co-authored-by: dongdongl <dongdongl@nvidia.com>
2022-12-21 01:30:50 -08:00
parent 8650b4d1cb
commit 20100a7254
285 changed files with 26312 additions and 50143 deletions
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -80,7 +80,7 @@ def softmax_kernel(
    row = tl.load(input_ptrs, mask=col_offsets < n_cols, other=-float('inf'))
    # Subtract maximum for numerical stability
    row_minus_max = row - tl.max(row, axis=0)
-    # Note that exponentials in Triton are fast but approximate (i.e., think __expf in CUDA)
+    # Note that exponentiation in Triton is fast but approximate (i.e., think __expf in CUDA)
    numerator = tl.exp(row_minus_max)
    denominator = tl.sum(numerator, axis=0)
    softmax_output = numerator / denominator
@@ -188,4 +188,4 @@ benchmark.run(show_plots=True, print_data=True)
 #
 #  - Triton is 4x faster than the Torch JIT. This confirms our suspicions that the Torch JIT does not do any fusion here.
 #  - Triton is noticeably faster than :code:`torch.softmax` -- in addition to being **easier to read, understand and maintain**.
-#    Note however that the PyTorch `softmax` operation is more general and will works on tensors of any shape.
+#    Note however that the PyTorch `softmax` operation is more general and will work on tensors of any shape.
--- a/python/tutorials/03-matrix-multiplication.py
+++ b/python/tutorials/03-matrix-multiplication.py
@@ -156,16 +156,7 @@ import triton.language as tl

@triton.autotune(
    configs=[
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
-        triton.Config({'BLOCK_SIZE_M': 256, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 128, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=4, num_warps=4),
-        triton.Config({'BLOCK_SIZE_M': 64, 'BLOCK_SIZE_N': 32, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
-        triton.Config({'BLOCK_SIZE_M': 32, 'BLOCK_SIZE_N': 64, 'BLOCK_SIZE_K': 32, 'GROUP_SIZE_M': 8}, num_stages=5, num_warps=2),
+        triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3, num_warps=8),
    ],
    key=['M', 'N', 'K'],
 )
@@ -236,8 +227,8 @@ def matmul_kernel(
        b_ptrs += BLOCK_SIZE_K * stride_bk
    # you can fuse arbitrary activation functions here
    # while the accumulator is still in FP32!
-    if ACTIVATION == "leaky_relu":
-        accumulator = leaky_relu(accumulator)
+    if ACTIVATION:
+        accumulator = ACTIVATION(accumulator)
    c = accumulator.to(tl.float16)

    # -----------------------------------------------------------
@@ -252,7 +243,6 @@ def matmul_kernel(
 # we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`
@triton.jit
 def leaky_relu(x):
-    x = x + 1
    return tl.where(x >= 0, x, 0.01 * x)


@@ -261,7 +251,7 @@ def leaky_relu(x):
 # and (1) checks any shape constraint; (2) allocates the output; (3) launches the above kernel


-def matmul(a, b, activation=""):
+def matmul(a, b, activation=None):
    # checks constraints
    assert a.shape[1] == b.shape[0], "incompatible dimensions"
    assert a.is_contiguous(), "matrix A must be contiguous"
@@ -297,7 +287,7 @@ def matmul(a, b, activation=""):
 torch.manual_seed(0)
 a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
 b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
-triton_output = matmul(a, b)
+triton_output = matmul(a, b, activation=None)
 torch_output = torch.matmul(a, b)
 print(f"triton_output={triton_output}")
 print(f"torch_output={torch_output}")
@@ -319,13 +309,13 @@ else:
    triton.testing.Benchmark(
        x_names=['M', 'N', 'K'],  # argument names to use as an x-axis for the plot
        x_vals=[
-            128 * i for i in range(2, 33)
+            8192
        ],  # different possible values for `x_name`
        line_arg='provider',  # argument name whose value corresponds to a different line in the plot
        # possible values for `line_arg``
-        line_vals=['cublas', 'cublas + relu', 'triton', 'triton + relu'],
+        line_vals=['cublas', 'triton'],
        # label name for the lines
-        line_names=["cuBLAS", "cuBLAS (+ torch.nn.LeakyReLU)", "Triton", "Triton (+ LeakyReLU)"],
+        line_names=["cuBLAS", "Triton"],
        # line styles
        styles=[('green', '-'), ('green', '--'), ('blue', '-'), ('blue', '--')],
        ylabel="TFLOPS",  # label name for the y-axis
@@ -337,18 +327,9 @@ def benchmark(M, N, K, provider):
    a = torch.randn((M, K), device='cuda', dtype=torch.float16)
    b = torch.randn((K, N), device='cuda', dtype=torch.float16)
    if provider == 'cublas':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b))
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: torch.matmul(a, b), rep=100)
    if provider == 'triton':
-        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b))
-    if provider == 'cublas + relu':
-        torch_relu = torch.nn.ReLU(inplace=True)
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: torch_relu(torch.matmul(a, b))
-        )
-    if provider == 'triton + relu':
-        ms, min_ms, max_ms = triton.testing.do_bench(
-            lambda: matmul(a, b, activation="leaky_relu")
-        )
+        ms, min_ms, max_ms = triton.testing.do_bench(lambda: matmul(a, b), rep=100)
    perf = lambda ms: 2 * M * N * K * 1e-12 / (ms * 1e-3)
    return perf(ms), perf(max_ms), perf(min_ms)

--- a/python/tutorials/05-layer-norm.py
+++ b/python/tutorials/05-layer-norm.py
@@ -19,8 +19,8 @@ except ModuleNotFoundError:

@triton.jit
 def _layer_norm_fwd_fused(
-    Out,
    A,
+    Out,
    Weight,
    Bias,
    Mean, Rstd,
@@ -36,14 +36,14 @@ def _layer_norm_fwd_fused(
    _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
-        a = tl.load(A + cols, mask=cols < N, other=0., eviction_policy="evict_last").to(tl.float32)
+        a = tl.load(A + cols, mask=cols < N, other=0.).to(tl.float32)
        _mean += a
    mean = tl.sum(_mean, axis=0) / N
    # compute variance
    _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
    for off in range(0, N, BLOCK_SIZE):
        cols = off + tl.arange(0, BLOCK_SIZE)
-        a = tl.load(A + cols, mask=cols < N, other=0., eviction_policy="evict_last").to(tl.float32)
+        a = tl.load(A + cols, mask=cols < N, other=0.).to(tl.float32)
        a = tl.where(cols < N, a - mean, 0.)
        _var += a * a
    var = tl.sum(_var, axis=0) / N
@@ -57,192 +57,155 @@ def _layer_norm_fwd_fused(
        mask = cols < N
        weight = tl.load(Weight + cols, mask=mask)
        bias = tl.load(Bias + cols, mask=mask)
-        a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
+        a = tl.load(A + cols, mask=mask, other=0.).to(tl.float32)
        a_hat = (a - mean) * rstd
        out = a_hat * weight + bias
        # # write-back
        tl.store(Out + cols, out, mask=mask)

-# Backward pass (DA + partial DW + partial DB)
-

+# Backward pass (DX + partial DW + partial DB)
@triton.jit
-def _layer_norm_bwd_dx_fused(
-    _DA,
-    _DOut,
-    _A,
-    Weight,
-    Mean, Rstd,
-    stride, NumRows, NumCols, eps,
-    BLOCK_SIZE_N: tl.constexpr,
-):
+def _layer_norm_bwd_dx_fused(DX, DY, DW, DB, X, W, B, M, V, Lock, stride, N, eps,
+                             GROUP_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
    # position of elements processed by this program
-    pid = tl.program_id(0)
-    row = pid
-    A = _A + row * stride
-    DOut = _DOut + row * stride
-    DA = _DA + row * stride
-    mean = tl.load(Mean + row)
-    rstd = tl.load(Rstd + row)
+    row = tl.program_id(0)
+    cols = tl.arange(0, BLOCK_SIZE_N)
+    mask = cols < N
+    # offset data pointers to start at the row of interest
+    X += row * stride
+    DY += row * stride
+    DX += row * stride
+    # offset locks and weight/bias gradient pointer
+    # each kernel instance accumulates partial sums for
+    # DW and DB into one of GROUP_SIZE_M independent buffers
+    # these buffers stay in the L2, which allows this kernel
+    # to be fast
+    lock_id = row % GROUP_SIZE_M
+    Lock += lock_id
+    Count = Lock + GROUP_SIZE_M
+    DW = DW + lock_id * N + cols
+    DB = DB + lock_id * N + cols
    # load data to SRAM
-    _mean1 = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32)
-    _mean2 = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32)
-    for off in range(0, NumCols, BLOCK_SIZE_N):
-        cols = off + tl.arange(0, BLOCK_SIZE_N)
-        mask = cols < NumCols
-        a = tl.load(A + cols, mask=mask, other=0).to(tl.float32)
-        dout = tl.load(DOut + cols, mask=mask, other=0).to(tl.float32)
-        weight = tl.load(Weight + cols, mask=mask, other=0).to(tl.float32)
-        a_hat = (a - mean) * rstd
-        wdout = weight * dout
-        _mean1 += a_hat * wdout
-        _mean2 += wdout
-    mean1 = tl.sum(_mean1, axis=0) / NumCols
-    mean2 = 0.
-    mean2 = tl.sum(_mean2, axis=0) / NumCols
-    for off in range(0, NumCols, BLOCK_SIZE_N):
-        cols = off + tl.arange(0, BLOCK_SIZE_N)
-        mask = cols < NumCols
-        a = tl.load(A + cols, mask=mask, other=0).to(tl.float32)
-        dout = tl.load(DOut + cols, mask=mask, other=0).to(tl.float32)
-        weight = tl.load(Weight + cols, mask=mask, other=0).to(tl.float32)
-        a_hat = (a - mean) * rstd
-        wdout = weight * dout
-        da = (wdout - (a_hat * mean1 + mean2)) * rstd
-        # write-back dx
-        tl.store(DA + cols, da, mask=mask)
-
+    x = tl.load(X + cols, mask=mask, other=0).to(tl.float32)
+    dy = tl.load(DY + cols, mask=mask, other=0).to(tl.float32)
+    w = tl.load(W + cols, mask=mask).to(tl.float32)
+    mean = tl.load(M + row)
+    rstd = tl.load(V + row)
+    # compute dx
+    xhat = (x - mean) * rstd
+    wdy = w * dy
+    xhat = tl.where(mask, xhat, 0.)
+    wdy = tl.where(mask, wdy, 0.)
+    mean1 = tl.sum(xhat * wdy, axis=0) / N
+    mean2 = tl.sum(wdy, axis=0) / N
+    dx = (wdy - (xhat * mean1 + mean2)) * rstd
+    # write-back dx
+    tl.store(DX + cols, dx, mask=mask)
+    # accumulate partial sums for dw/db
+    partial_dw = (dy * xhat).to(w.dtype)
+    partial_db = (dy).to(w.dtype)
+    while tl.atomic_cas(Lock, 0, 1) == 1:
+        pass
+    count = tl.load(Count)
+    # first store doesn't accumulate
+    if count == 0:
+        tl.atomic_xchg(Count, 1)
+    else:
+        partial_dw += tl.load(DW, mask=mask)
+        partial_db += tl.load(DB, mask=mask)
+    tl.store(DW, partial_dw, mask=mask)
+    tl.store(DB, partial_db, mask=mask)
+    # release lock
+    tl.atomic_xchg(Lock, 0)

 # Backward pass (total DW + total DB)
+
+
@triton.jit
-def _layer_norm_bwd_dwdb(
-    A, DOut,
-    Mean, Var,
-    DW,
-    DB,
-    M, N,
-    BLOCK_SIZE_M: tl.constexpr,
-    BLOCK_SIZE_N: tl.constexpr,
-):
+def _layer_norm_bwd_dwdb(DW, DB, FINAL_DW, FINAL_DB, M, N,
+                         BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr):
    pid = tl.program_id(0)
    cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
    dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
    db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    UNROLL: tl.constexpr = 4
-    for i in range(0, M, BLOCK_SIZE_M * UNROLL):
-        for j in range(UNROLL):
-            rows = i + j * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
-            mask = (rows[:, None] < M) & (cols[None, :] < N)
-            offs = rows[:, None] * N + cols[None, :]
-            a = tl.load(A + offs, mask=mask, other=0.).to(tl.float32)
-            dout = tl.load(DOut + offs, mask=mask, other=0.).to(tl.float32)
-            mean = tl.load(Mean + rows, mask=rows < M, other=0.)
-            rstd = tl.load(Var + rows, mask=rows < M, other=0.)
-            a_hat = (a - mean[:, None]) * rstd[:, None]
-            dw += dout * a_hat
-            db += dout
+    for i in range(0, M, BLOCK_SIZE_M):
+        rows = i + tl.arange(0, BLOCK_SIZE_M)
+        mask = (rows[:, None] < M) & (cols[None, :] < N)
+        offs = rows[:, None] * N + cols[None, :]
+        dw += tl.load(DW + offs, mask=mask, other=0.)
+        db += tl.load(DB + offs, mask=mask, other=0.)
    sum_dw = tl.sum(dw, axis=0)
    sum_db = tl.sum(db, axis=0)
-    tl.store(DW + cols, sum_dw, mask=cols < N)
-    tl.store(DB + cols, sum_db, mask=cols < N)
+    tl.store(FINAL_DW + cols, sum_dw, mask=cols < N)
+    tl.store(FINAL_DB + cols, sum_db, mask=cols < N)


 class LayerNorm(torch.autograd.Function):
+
    @staticmethod
-    def forward(ctx, a, normalized_shape, weight, bias, eps):
+    def forward(ctx, x, normalized_shape, weight, bias, eps):
        # allocate output
-        out = torch.empty_like(a)
+        y = torch.empty_like(x)
        # reshape input data into 2D tensor
-        a_arg = a.reshape(-1, a.shape[-1])
-        M, N = a_arg.shape
-        mean = torch.empty((M,), dtype=torch.float32, device="cuda")
-        rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
+        x_arg = x.reshape(-1, x.shape[-1])
+        M, N = x_arg.shape
+        mean = torch.empty((M, ), dtype=torch.float32, device='cuda')
+        rstd = torch.empty((M, ), dtype=torch.float32, device='cuda')
        # Less than 64KB per feature: enqueue fused kernel
-        MAX_FUSED_SIZE = 65536 // a.element_size()
+        MAX_FUSED_SIZE = 65536 // x.element_size()
        BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
-        BLOCK_SIZE = max(BLOCK_SIZE, 128)
-        BLOCK_SIZE = min(BLOCK_SIZE, 4096)
+        if N > BLOCK_SIZE:
+            raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
        # heuristics for number of warps
        num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
-        _layer_norm_fwd_fused[(M,)](
-            out,
-            a_arg,
-            weight,
-            bias,
-            mean, rstd,
-            a_arg.stride(0), N, eps,
-            BLOCK_SIZE=BLOCK_SIZE,
-            num_warps=num_warps,
-        )
-        ctx.save_for_backward(
-            a, weight, bias, mean, rstd,
-        )
+        # enqueue kernel
+        _layer_norm_fwd_fused[(M,)](x_arg, y, weight, bias, mean, rstd,
+                                    x_arg.stride(0), N, eps,
+                                    BLOCK_SIZE=BLOCK_SIZE, num_warps=num_warps)
+        ctx.save_for_backward(x, weight, bias, mean, rstd)
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.eps = eps
-        if hasattr(bias, "config"):
-            assert bias.config.grad_scale_name == weight.config.grad_scale_name
-            grad_scale_name = bias.config.grad_scale_name
-        else:
-            grad_scale_name = None
-        ctx.grad_scale_gain_bias_name = grad_scale_name
-        return out
+        return y

    @staticmethod
-    def backward(ctx, dout):
-        assert dout.is_contiguous()
-        a, weight, bias, mean, var = ctx.saved_tensors
+    def backward(ctx, dy):
+        x, w, b, m, v = ctx.saved_tensors
        # heuristics for amount of parallel reduction stream for DG/DB
-        N = weight.shape[0]
+        N = w.shape[0]
+        GROUP_SIZE_M = 64
+        if N <= 8192: GROUP_SIZE_M = 96
+        if N <= 4096: GROUP_SIZE_M = 128
+        if N <= 1024: GROUP_SIZE_M = 256
        # allocate output
-        da = torch.empty_like(dout)
+        locks = torch.zeros(2 * GROUP_SIZE_M, dtype=torch.int32, device='cuda')
+        _dw = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+        _db = torch.empty((GROUP_SIZE_M, w.shape[0]), dtype=x.dtype, device=w.device)
+        dw = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+        db = torch.empty((w.shape[0],), dtype=w.dtype, device=w.device)
+        dx = torch.empty_like(dy)
        # enqueue kernel using forward pass heuristics
        # also compute partial sums for DW and DB
-        x_arg = a.reshape(-1, a.shape[-1])
+        x_arg = x.reshape(-1, x.shape[-1])
        M, N = x_arg.shape
-        dweight = torch.empty((weight.shape[0],), dtype=weight.dtype, device=weight.device)
-        dbias = torch.empty((weight.shape[0],), dtype=weight.dtype, device=weight.device)
-        _layer_norm_bwd_dx_fused[(M,)](
-            da,
-            dout,
-            a,
-            weight,
-            mean, var,
-            x_arg.stride(0), M, N,
-            ctx.eps,
-            BLOCK_SIZE_N=ctx.BLOCK_SIZE,
-            num_warps=ctx.num_warps,
-        )
-        if N > 10240:
-            BLOCK_SIZE_N = 128
-            BLOCK_SIZE_M = 32
-            num_warps = 4
-        else:
-            # maximize occupancy for small N
-            BLOCK_SIZE_N = 16
-            BLOCK_SIZE_M = 16
-            num_warps = 8
-        grid = lambda meta: [triton.cdiv(N, meta["BLOCK_SIZE_N"])]
-        _layer_norm_bwd_dwdb[grid](
-            a, dout,
-            mean, var,
-            dweight,
-            dbias,
-            M,
-            N,
-            BLOCK_SIZE_M=BLOCK_SIZE_M,
-            BLOCK_SIZE_N=BLOCK_SIZE_N,
-            num_warps=num_warps
-        )
-        return (da, None, dweight, dbias, None)
+        _layer_norm_bwd_dx_fused[(M,)](dx, dy, _dw, _db, x, w, b, m, v, locks,
+                                       x_arg.stride(0), N, ctx.eps,
+                                       BLOCK_SIZE_N=ctx.BLOCK_SIZE,
+                                       GROUP_SIZE_M=GROUP_SIZE_M,
+                                       num_warps=ctx.num_warps)
+        grid = lambda meta: [triton.cdiv(N, meta['BLOCK_SIZE_N'])]
+        # accumulate partial sums in separate kernel
+        _layer_norm_bwd_dwdb[grid](_dw, _db, dw, db, GROUP_SIZE_M, N,
+                                   BLOCK_SIZE_M=32,
+                                   BLOCK_SIZE_N=128)
+        return dx, None, dw, db, None


-def layer_norm(a, normalized_shape, weight, bias, eps):
-    return LayerNorm.apply(a, normalized_shape, weight, bias, eps)
+layer_norm = LayerNorm.apply


 def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
-    torch.manual_seed(0)
    # create data
    x_shape = (M, N)
    w_shape = (x_shape[-1], )
@@ -277,11 +240,11 @@ def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
        line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []),
        styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
        ylabel='GB/s',
-        plot_name='layer-norm',
-        args={'M': 4096, 'dtype': torch.float16, 'mode': 'forward'}
+        plot_name='layer-norm-backward',
+        args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'}
    )
 )
-def bench_layer_norm(M, N, dtype, provider, mode, eps=1e-5, device='cuda'):
+def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='cuda'):
    # create data
    x_shape = (M, N)
    w_shape = (x_shape[-1], )
@@ -311,5 +274,5 @@ def bench_layer_norm(M, N, dtype, provider, mode, eps=1e-5, device='cuda'):
    return gbps(ms), gbps(max_ms), gbps(min_ms)


-# test_layer_norm(1151, 8192, torch.float16)
-bench_layer_norm.run(save_path='.', print_data=True)
+test_layer_norm(1151, 8192, torch.float16)
+# bench_layer_norm.run(save_path='.', print_data=True)
--- a/python/tutorials/06-fused-attention.py
+++ b/python/tutorials/06-fused-attention.py
@@ -15,7 +15,7 @@ import triton.language as tl
@triton.jit
 def _fwd_kernel(
    Q, K, V, sm_scale,
-    TMP, L, M,  # NOTE: TMP is a scratchpad buffer to workaround a compiler bug
+    TMP, L, M,  # NOTE: TMP is a scratchpad buffer to work around a compiler bug
    Out,
    stride_qz, stride_qh, stride_qm, stride_qk,
    stride_kz, stride_kh, stride_kn, stride_kk,
@@ -39,7 +39,6 @@ def _fwd_kernel(
    k_ptrs = K + off_k
    v_ptrs = V + off_v
    # initialize pointer to m and l
-    t_ptrs = TMP + off_hz * N_CTX + offs_m
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
@@ -47,11 +46,11 @@ def _fwd_kernel(
    q = tl.load(q_ptrs)
    # loop over k, v and update accumulator
    for start_n in range(0, (start_m + 1) * BLOCK_M, BLOCK_N):
-        start_n = tl.multiple_of(start_n, BLOCK_N)
+        # start_n = tl.multiple_of(start_n, BLOCK_N)
        # -- compute qk ----
        k = tl.load(k_ptrs + start_n * stride_kn)
        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k, trans_b=True)
+        qk += tl.dot(q, tl.trans(k))
        qk *= sm_scale
        qk += tl.where(offs_m[:, None] >= (start_n + offs_n[None, :]), 0, float("-inf"))
        # -- compute m_ij, p, l_ij
@@ -69,8 +68,6 @@ def _fwd_kernel(
        p = p * p_scale[:, None]
        # scale acc
        acc_scale = l_i / l_i_new * alpha
-        tl.store(t_ptrs, acc_scale)
-        acc_scale = tl.load(t_ptrs)  # BUG: have to store and immediately load
        acc = acc * acc_scale[:, None]
        # update acc
        v = tl.load(v_ptrs + start_n * stride_vk)
@@ -168,26 +165,26 @@ def _bwd_kernel(
            q = tl.load(q_ptrs)
            # recompute p = softmax(qk, dim=-1).T
            # NOTE: `do` is pre-divided by `l`; no normalization here
-            qk = tl.dot(q, k, trans_b=True)
+            qk = tl.dot(q, tl.trans(k))
            qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
            m = tl.load(m_ptrs + offs_m_curr)
            p = tl.exp(qk * sm_scale - m[:, None])
            # compute dv
            do = tl.load(do_ptrs)
-            dv += tl.dot(p.to(tl.float16), do, trans_a=True)
+            dv += tl.dot(tl.trans(p.to(tl.float16)), do)
            # compute dp = dot(v, do)
            Di = tl.load(D_ptrs + offs_m_curr)
            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
-            dp += tl.dot(do, v, trans_b=True)
+            dp += tl.dot(do, tl.trans(v))
            # compute ds = p * (dp - delta[:, None])
            ds = p * dp * sm_scale
            # compute dk = dot(ds.T, q)
-            dk += tl.dot(ds.to(tl.float16), q, trans_a=True)
-            # # compute dq
-            dq = tl.load(dq_ptrs, eviction_policy="evict_last")
+            dk += tl.dot(tl.trans(ds.to(tl.float16)), q)
+            # compute dq
+            dq = tl.load(dq_ptrs)
            dq += tl.dot(ds.to(tl.float16), k)
-            tl.store(dq_ptrs, dq, eviction_policy="evict_last")
-            # # increment pointers
+            tl.store(dq_ptrs, dq)
+            # increment pointers
            dq_ptrs += BLOCK_M * stride_qm
            q_ptrs += BLOCK_M * stride_qm
            do_ptrs += BLOCK_M * stride_qm
@@ -198,6 +195,9 @@ def _bwd_kernel(
        tl.store(dk_ptrs, dk)


+empty = torch.empty(128, device="cuda")
+
+
 class _attention(torch.autograd.Function):

    @staticmethod
@@ -208,7 +208,7 @@ class _attention(torch.autograd.Function):
        assert Lq == Lk and Lk == Lv
        assert Lk in {16, 32, 64, 128}
        o = torch.empty_like(q)
-        grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1])
+        grid = (triton.cdiv(q.shape[2], BLOCK), q.shape[0] * q.shape[1], 1)
        tmp = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
        L = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
        m = torch.empty((q.shape[0] * q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)
@@ -227,6 +227,7 @@ class _attention(torch.autograd.Function):
            BLOCK_DMODEL=Lk, num_warps=num_warps,
            num_stages=1,
        )
+
        ctx.save_for_backward(q, k, v, o, L, m)
        ctx.BLOCK = BLOCK
        ctx.grid = grid
@@ -272,13 +273,13 @@ class _attention(torch.autograd.Function):
 attention = _attention.apply


-@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(3, 2, 2048, 64)])
+@pytest.mark.parametrize('Z, H, N_CTX, D_HEAD', [(4, 48, 1024, 64)])
 def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
    torch.manual_seed(20)
-    q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0, std=.5).requires_grad_()
-    sm_scale = 0.3
+    q = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.1, std=0.2).requires_grad_()
+    k = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.4, std=0.2).requires_grad_()
+    v = torch.empty((Z, H, N_CTX, D_HEAD), dtype=dtype, device="cuda").normal_(mean=0.3, std=0.2).requires_grad_()
+    sm_scale = 0.2
    dout = torch.randn_like(q)
    # reference implementation
    M = torch.tril(torch.ones((N_CTX, N_CTX), device="cuda"))
@@ -287,13 +288,16 @@ def test_op(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
        for h in range(H):
            p[:, :, M == 0] = float("-inf")
    p = torch.softmax(p.float(), dim=-1).half()
+    # p = torch.exp(p)
    ref_out = torch.matmul(p, v)
    ref_out.backward(dout)
    ref_dv, v.grad = v.grad.clone(), None
    ref_dk, k.grad = k.grad.clone(), None
    ref_dq, q.grad = q.grad.clone(), None
-    # triton implementation
+    # # triton implementation
    tri_out = attention(q, k, v, sm_scale)
+    # print(ref_out)
+    # print(tri_out)
    tri_out.backward(dout)
    tri_dv, v.grad = v.grad.clone(), None
    tri_dk, k.grad = k.grad.clone(), None
@@ -323,7 +327,7 @@ configs = [triton.testing.Benchmark(
    ylabel='ms',
    plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-{mode}',
    args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': torch.float16, 'mode': mode}
-) for mode in ['bwd']]
+) for mode in ['fwd']]


@triton.testing.perf_report(configs)
@@ -356,5 +360,4 @@ def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, mode, provider, dtype=torch.f
        ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep)
        return ms

-# only works on A100 at the moment
 # bench_flash_attention.run(save_path='.', print_data=True)
--- a/python/tutorials/07-libdevice-function.py
+++ b/python/tutorials/07-libdevice-function.py
@@ -1,74 +0,0 @@
-"""
-Libdevice function
-===============
-Triton can invoke a custom function from an external library.
-In this example, we will use the `libdevice` library to apply `asin` on a tensor.
-Please refer to https://docs.nvidia.com/cuda/libdevice-users-guide/index.html regarding the semantics of all available libdevice functions.
-
-In `trition/language/libdevice.py`, we try to aggregate functions with the same computation but different data types together.
-For example, both `__nv_asin` and `__nvasinf` calculate the principal value of the arc sine of the input, but `__nv_asin` operates on `double` and `__nv_asinf` operates on `float`.
-Using triton, you can simply call `tl.libdevice.asin`.
-triton automatically selects the correct underlying device function to invoke based on input and output types.
-"""
-
-# %%
-#  asin Kernel
-# --------------------------
-
-import torch
-
-import triton
-import triton.language as tl
-
-
-@triton.jit
-def asin_kernel(
-    x_ptr,
-    y_ptr,
-    n_elements,
-    BLOCK_SIZE: tl.constexpr,
-):
-    pid = tl.program_id(axis=0)
-    block_start = pid * BLOCK_SIZE
-    offsets = block_start + tl.arange(0, BLOCK_SIZE)
-    mask = offsets < n_elements
-    x = tl.load(x_ptr + offsets, mask=mask)
-    x = tl.libdevice.asin(x)
-    tl.store(y_ptr + offsets, x, mask=mask)
-
-# %%
-#  Using the default libdevice library path
-# --------------------------
-# We can use the default libdevice library path encoded in `triton/language/libdevice.py`
-
-
-torch.manual_seed(0)
-size = 98432
-x = torch.rand(size, device='cuda')
-output_triton = torch.zeros(size, device='cuda')
-output_torch = torch.asin(x)
-assert x.is_cuda and output_triton.is_cuda
-n_elements = output_torch.numel()
-grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']),)
-asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024)
-print(output_torch)
-print(output_triton)
-print(
-    f'The maximum difference between torch and triton is '
-    f'{torch.max(torch.abs(output_torch - output_triton))}'
-)
-
-# %%
-#  Customize the libdevice library path
-# --------------------------
-# We can also customize the libdevice library path by passing the path to the `libdevice` library to the `asin` kernel.
-
-output_triton = torch.empty_like(x)
-asin_kernel[grid](x, output_triton, n_elements, BLOCK_SIZE=1024,
-                  extern_libs={'libdevice': '/usr/local/cuda/nvvm/libdevice/libdevice.10.bc'})
-print(output_torch)
-print(output_triton)
-print(
-    f'The maximum difference between torch and triton is '
-    f'{torch.max(torch.abs(output_torch - output_triton))}'
-)