.. DO NOT EDIT.
.. THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY.
.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE:
.. "getting-started/tutorials/05-layer-norm.py"
.. LINE NUMBERS ARE GIVEN BELOW.

.. only:: html

    .. note::
        :class: sphx-glr-download-link-note

        Click :ref:`here <sphx_glr_download_getting-started_tutorials_05-layer-norm.py>`
        to download the full example code

.. rst-class:: sphx-glr-example-title

.. _sphx_glr_getting-started_tutorials_05-layer-norm.py:

Layer Normalization
====================
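
For reference, layer normalization rescales each row of its input to zero mean and unit
variance and then applies a learnable per-feature weight and bias. For a row
:math:`a \in \mathbb{R}^{N}`:

.. math::

    \mu = \frac{1}{N}\sum_{i=1}^{N} a_i, \qquad
    \sigma^2 = \frac{1}{N}\sum_{i=1}^{N} (a_i - \mu)^2, \qquad
    y_i = \frac{a_i - \mu}{\sqrt{\sigma^2 + \epsilon}}\, w_i + b_i

The Triton kernels below implement this forward pass, and its backward pass, as a small
number of fused kernels.
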
.. GENERATED FROM PYTHON SOURCE LINES 5-316
.. image:: /getting-started/tutorials/images/sphx_glr_05-layer-norm_001.png
    :alt: 05 layer norm
    :class: sphx-glr-single-img

.. rst-class:: sphx-glr-script-out

 Out:

 .. code-block:: none

    layer-norm:
              N      Triton       Torch        Apex
    0    1024.0  585.142849  277.694907  468.114273
    1    1536.0  630.153868  327.679984  511.999982
    2    2048.0  668.734716  334.367358  520.126988
    3    2560.0  694.237267  362.477870  518.481028
    4    3072.0  702.171410  375.206126  501.551037
    5    3584.0  725.873439  384.859062  455.111115
    6    4096.0  728.177767  381.023256  451.972420
    7    4608.0  670.254540  394.267384  418.909088
    8    5120.0  688.403381  397.669909  424.455959
    9    5632.0  704.000002  395.228063  411.470331
    10   6144.0  697.191505  402.885254  409.600010
    11   6656.0  700.631610  398.861429  400.360920
    12   7168.0  686.754468  394.116833  384.859062
    13   7680.0  678.895043  393.846167  385.203746
    14   8192.0  633.198054  388.937680  369.216905
    15   8704.0  624.502255  390.095225  379.465939
    16   9216.0  604.327881  407.337026  381.023249
    17   9728.0  587.350922  408.524944  382.427505
    18  10240.0  564.965524  409.600010  382.803739
    19  10752.0  546.133312  411.559798  381.445676
    20  11264.0  533.207081  405.909906  375.466676
    21  11776.0  520.486200  409.599991  377.587162
    22  12288.0  513.336807  413.911572  383.251457
    23  12800.0  504.433489  409.599981  375.779805
    24  13312.0  494.180982  405.699062  376.310952
    25  13824.0  481.882350  411.888257  378.739711
    26  14336.0  470.997935  405.257949  371.158581
    27  14848.0  460.403127  407.492270  374.712936
    28  15360.0  454.269882  406.887417  377.511515
    29  15872.0  446.312810  406.974373  375.668625

|
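
The throughput figures above are effective memory bandwidth in GB/s: the benchmark charges
one read and one write of the input tensor per forward pass (see the ``gbps`` lambda in
``bench_layer_norm`` below). As a rough sanity check, this minimal sketch, with a
hypothetical timing chosen to match the N = 4096 row, reproduces the conversion:

.. code-block:: python

    # minimal sketch of the GB/s conversion used by the benchmark below
    M, N, elem_size = 4096, 4096, 2       # float16 is 2 bytes per element
    bytes_moved = 2 * M * N * elem_size   # one read + one write of the input
    ms = 0.0922                           # hypothetical measured time in milliseconds
    print(bytes_moved / ms * 1e-6)        # ~728 GB/s, close to the N = 4096 row above
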

.. code-block:: default


    import torch

    import triton
    import triton.language as tl

    try:
        # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it
        # should not be added to extras_require in setup.py.
        import apex
        HAS_APEX = True
    except ModuleNotFoundError:
        HAS_APEX = False

    @triton.jit
    def _layer_norm_fwd_fused(
        Out,
        A,
        Weight,
        Bias,
        Mean, Rstd,
        stride, N, eps,
        BLOCK_SIZE: tl.constexpr,
    ):
        # position of elements processed by this program
        row = tl.program_id(0)
        Out += row * stride
        A += row * stride
        # compute mean
        mean = 0
        _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
        for off in range(0, N, BLOCK_SIZE):
            cols = off + tl.arange(0, BLOCK_SIZE)
            a = tl.load(A + cols, mask=cols < N, other=0., eviction_policy="evict_last").to(tl.float32)
            _mean += a
        mean = tl.sum(_mean, axis=0) / N
        # compute variance
        _var = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
        for off in range(0, N, BLOCK_SIZE):
            cols = off + tl.arange(0, BLOCK_SIZE)
            a = tl.load(A + cols, mask=cols < N, other=0., eviction_policy="evict_last").to(tl.float32)
            a = tl.where(cols < N, a - mean, 0.)
            _var += a * a
        var = tl.sum(_var, axis=0) / N
        rstd = 1 / tl.sqrt(var + eps)
        # write-back mean/rstd
        tl.store(Mean + row, mean)
        tl.store(Rstd + row, rstd)
        # multiply by weight and add bias
        for off in range(0, N, BLOCK_SIZE):
            cols = off + tl.arange(0, BLOCK_SIZE)
            mask = cols < N
            weight = tl.load(Weight + cols, mask=mask)
            bias = tl.load(Bias + cols, mask=mask)
            a = tl.load(A + cols, mask=mask, other=0., eviction_policy="evict_first").to(tl.float32)
            a_hat = (a - mean) * rstd
            out = a_hat * weight + bias
            # write-back
            tl.store(Out + cols, out, mask=mask)


    # Backward pass (DA + partial DW + partial DB)
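    # For reference, the input gradient implemented below follows the usual
    # layer-norm backward formula: with a_hat = (a - mean) * rstd and
    # out = a_hat * weight + bias,
    #   dA = rstd * (weight * dOut
    #                - a_hat * mean(a_hat * weight * dOut)
    #                - mean(weight * dOut)),
    # where both means run over the feature dimension. The kernel computes
    # mean1 = mean(a_hat * weight * dOut) and mean2 = mean(weight * dOut) in a
    # first pass over the row, then applies the formula in a second pass.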
    @triton.jit
    def _layer_norm_bwd_dx_fused(
        _DA,
        _DOut,
        _A,
        Weight,
        Mean, Rstd,
        stride, NumRows, NumCols, eps,
        BLOCK_SIZE_N: tl.constexpr,
    ):
        # position of elements processed by this program
        pid = tl.program_id(0)
        row = pid
        A = _A + row * stride
        DOut = _DOut + row * stride
        DA = _DA + row * stride
        mean = tl.load(Mean + row)
        rstd = tl.load(Rstd + row)
        # load data to SRAM
        _mean1 = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32)
        _mean2 = tl.zeros([BLOCK_SIZE_N], dtype=tl.float32)
        for off in range(0, NumCols, BLOCK_SIZE_N):
            cols = off + tl.arange(0, BLOCK_SIZE_N)
            mask = cols < NumCols
            a = tl.load(A + cols, mask=mask, other=0).to(tl.float32)
            dout = tl.load(DOut + cols, mask=mask, other=0).to(tl.float32)
            weight = tl.load(Weight + cols, mask=mask, other=0).to(tl.float32)
            a_hat = (a - mean) * rstd
            wdout = weight * dout
            _mean1 += a_hat * wdout
            _mean2 += wdout
        mean1 = tl.sum(_mean1, axis=0) / NumCols
        mean2 = 0.
        mean2 = tl.sum(_mean2, axis=0) / NumCols
        for off in range(0, NumCols, BLOCK_SIZE_N):
            cols = off + tl.arange(0, BLOCK_SIZE_N)
            mask = cols < NumCols
            a = tl.load(A + cols, mask=mask, other=0).to(tl.float32)
            dout = tl.load(DOut + cols, mask=mask, other=0).to(tl.float32)
            weight = tl.load(Weight + cols, mask=mask, other=0).to(tl.float32)
            a_hat = (a - mean) * rstd
            wdout = weight * dout
            da = (wdout - (a_hat * mean1 + mean2)) * rstd
            # write-back dx
            tl.store(DA + cols, da, mask=mask)

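    # Note: dA can be computed independently per row, but dW and dB require a
    # reduction over all rows. The kernel below parallelizes over blocks of
    # columns instead: each program accumulates partial sums for its columns
    # across every row, then writes out one final segment of dW and dB.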
    # Backward pass (total DW + total DB)
    @triton.jit
    def _layer_norm_bwd_dwdb(
        A, DOut,
        Mean, Var,
        DW,
        DB,
        M, N,
        BLOCK_SIZE_M: tl.constexpr,
        BLOCK_SIZE_N: tl.constexpr,
    ):
        pid = tl.program_id(0)
        cols = pid * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
        dw = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        db = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
        UNROLL: tl.constexpr = 4
        for i in range(0, M, BLOCK_SIZE_M * UNROLL):
            for j in range(UNROLL):
                rows = i + j * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
                mask = (rows[:, None] < M) & (cols[None, :] < N)
                offs = rows[:, None] * N + cols[None, :]
                a = tl.load(A + offs, mask=mask, other=0.).to(tl.float32)
                dout = tl.load(DOut + offs, mask=mask, other=0.).to(tl.float32)
                mean = tl.load(Mean + rows, mask=rows < M, other=0.)
                rstd = tl.load(Var + rows, mask=rows < M, other=0.)
                a_hat = (a - mean[:, None]) * rstd[:, None]
                dw += dout * a_hat
                db += dout
        sum_dw = tl.sum(dw, axis=0)
        sum_db = tl.sum(db, axis=0)
        tl.store(DW + cols, sum_dw, mask=cols < N)
        tl.store(DB + cols, sum_db, mask=cols < N)

    class LayerNorm(torch.autograd.Function):

        @staticmethod
        def forward(ctx, a, normalized_shape, weight, bias, eps):
            # allocate output
            out = torch.empty_like(a)
            # reshape input data into 2D tensor
            a_arg = a.reshape(-1, a.shape[-1])
            M, N = a_arg.shape
            mean = torch.empty((M,), dtype=torch.float32, device="cuda")
            rstd = torch.empty((M,), dtype=torch.float32, device="cuda")
            # Less than 64KB per feature: enqueue fused kernel
            MAX_FUSED_SIZE = 65536 // a.element_size()
            BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
            BLOCK_SIZE = max(BLOCK_SIZE, 128)
            BLOCK_SIZE = min(BLOCK_SIZE, 4096)
            # heuristics for number of warps
            num_warps = min(max(BLOCK_SIZE // 256, 1), 8)
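            # For example, with float16 and N = 8192: MAX_FUSED_SIZE = 65536 // 2 = 32768,
            # next_power_of_2(8192) = 8192, so BLOCK_SIZE is clamped to 4096 and
            # num_warps = min(max(4096 // 256, 1), 8) = 8.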
            _layer_norm_fwd_fused[(M,)](
                out,
                a_arg,
                weight,
                bias,
                mean, rstd,
                a_arg.stride(0), N, eps,
                BLOCK_SIZE=BLOCK_SIZE,
                num_warps=num_warps,
            )
            ctx.save_for_backward(
                a, weight, bias, mean, rstd,
            )
            ctx.BLOCK_SIZE = BLOCK_SIZE
            ctx.num_warps = num_warps
            ctx.eps = eps
            if hasattr(bias, "config"):
                assert bias.config.grad_scale_name == weight.config.grad_scale_name
                grad_scale_name = bias.config.grad_scale_name
            else:
                grad_scale_name = None
            ctx.grad_scale_gain_bias_name = grad_scale_name
            return out

        @staticmethod
        def backward(ctx, dout):
            assert dout.is_contiguous()
            a, weight, bias, mean, var = ctx.saved_tensors
            # heuristics for amount of parallel reduction stream for DG/DB
            N = weight.shape[0]
            # allocate output
            da = torch.empty_like(dout)
            # enqueue kernel using forward pass heuristics
            # also compute partial sums for DW and DB
            x_arg = a.reshape(-1, a.shape[-1])
            M, N = x_arg.shape
            dweight = torch.empty((weight.shape[0],), dtype=weight.dtype, device=weight.device)
            dbias = torch.empty((weight.shape[0],), dtype=weight.dtype, device=weight.device)
            _layer_norm_bwd_dx_fused[(M,)](
                da,
                dout,
                a,
                weight,
                mean, var,
                x_arg.stride(0), M, N,
                ctx.eps,
                BLOCK_SIZE_N=ctx.BLOCK_SIZE,
                num_warps=ctx.num_warps,
            )
            if N > 10240:
                BLOCK_SIZE_N = 128
                BLOCK_SIZE_M = 32
                num_warps = 4
            else:
                # maximize occupancy for small N
                BLOCK_SIZE_N = 16
                BLOCK_SIZE_M = 16
                num_warps = 8
            grid = lambda meta: [triton.cdiv(N, meta["BLOCK_SIZE_N"])]
            _layer_norm_bwd_dwdb[grid](
                a, dout,
                mean, var,
                dweight,
                dbias,
                M,
                N,
                BLOCK_SIZE_M=BLOCK_SIZE_M,
                BLOCK_SIZE_N=BLOCK_SIZE_N,
                num_warps=num_warps
            )
            return (da, None, dweight, dbias, None)

    def layer_norm(a, normalized_shape, weight, bias, eps):
        return LayerNorm.apply(a, normalized_shape, weight, bias, eps)


    def test_layer_norm(M, N, dtype, eps=1e-5, device='cuda'):
        torch.manual_seed(0)
        # create data
        x_shape = (M, N)
        w_shape = (x_shape[-1], )
        weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
        bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
        x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
        dy = .1 * torch.randn_like(x)
        x.requires_grad_(True)
        # forward pass
        y_tri = layer_norm(x, w_shape, weight, bias, eps)
        y_ref = torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps).to(dtype)
        # backward pass (triton)
        y_tri.backward(dy, retain_graph=True)
        dx_tri, dw_tri, db_tri = [_.grad.clone() for _ in [x, weight, bias]]
        x.grad, weight.grad, bias.grad = None, None, None
        # backward pass (torch)
        y_ref.backward(dy, retain_graph=True)
        dx_ref, dw_ref, db_ref = [_.grad.clone() for _ in [x, weight, bias]]
        # compare
        triton.testing.assert_almost_equal(y_tri, y_ref)
        triton.testing.assert_almost_equal(dx_tri, dx_ref)
        triton.testing.assert_almost_equal(db_tri, db_ref, decimal=1)
        triton.testing.assert_almost_equal(dw_tri, dw_ref, decimal=1)

    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=['N'],
            x_vals=[512 * i for i in range(2, 32)],
            line_arg='provider',
            line_vals=['triton', 'torch'] + (['apex'] if HAS_APEX else []),
            line_names=['Triton', 'Torch'] + (['Apex'] if HAS_APEX else []),
            styles=[('blue', '-'), ('green', '-'), ('orange', '-')],
            ylabel='GB/s',
            plot_name='layer-norm',
            args={'M': 4096, 'dtype': torch.float16, 'mode': 'forward'}
        )
    )
    def bench_layer_norm(M, N, dtype, provider, mode, eps=1e-5, device='cuda'):
        # create data
        x_shape = (M, N)
        w_shape = (x_shape[-1], )
        weight = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
        bias = torch.rand(w_shape, dtype=dtype, device='cuda', requires_grad=True)
        x = -2.3 + 0.5 * torch.randn(x_shape, dtype=dtype, device='cuda')
        dy = .1 * torch.randn_like(x)
        x.requires_grad_(True)
        # utility functions
        if provider == 'triton':
            y_fwd = lambda: layer_norm(x, w_shape, weight, bias, eps)
        if provider == 'torch':
            y_fwd = lambda: torch.nn.functional.layer_norm(x, w_shape, weight, bias, eps)
        if provider == 'apex':
            apex_layer_norm = apex.normalization.FusedLayerNorm(w_shape).to(x.device).to(x.dtype)
            y_fwd = lambda: apex_layer_norm(x)
        # forward pass
        if mode == 'forward':
            gbps = lambda ms: 2 * x.numel() * x.element_size() / ms * 1e-6
            ms, min_ms, max_ms = triton.testing.do_bench(y_fwd, rep=500)
        # backward pass
        if mode == 'backward':
            gbps = lambda ms: 3 * x.numel() * x.element_size() / ms * 1e-6
            y = y_fwd()
            ms, min_ms, max_ms = triton.testing.do_bench(lambda: y.backward(dy, retain_graph=True),
                                                         grad_to_none=[x], rep=500)
        return gbps(ms), gbps(max_ms), gbps(min_ms)


    # test_layer_norm(1151, 8192, torch.float16)
    bench_layer_norm.run(save_path='.', print_data=True)
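
As a minimal usage sketch, assuming the script above has been run in the same session, the
``layer_norm`` function can be wrapped in a small module so that it drops in for
``torch.nn.LayerNorm``; the ``TritonLayerNorm`` name and defaults below are illustrative:

.. code-block:: python

    class TritonLayerNorm(torch.nn.Module):
        # Hypothetical convenience wrapper around the `layer_norm` function above.
        def __init__(self, normalized_shape, eps=1e-5, device='cuda', dtype=torch.float16):
            super().__init__()
            if isinstance(normalized_shape, int):
                normalized_shape = (normalized_shape,)
            self.normalized_shape = tuple(normalized_shape)
            self.eps = eps
            self.weight = torch.nn.Parameter(torch.ones(self.normalized_shape, device=device, dtype=dtype))
            self.bias = torch.nn.Parameter(torch.zeros(self.normalized_shape, device=device, dtype=dtype))

        def forward(self, x):
            return layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)

    # example: normalize the last dimension of a half-precision batch on the GPU
    ln = TritonLayerNorm(1024)
    y = ln(torch.randn(8, 1024, device='cuda', dtype=torch.float16))
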

.. rst-class:: sphx-glr-timing

   **Total running time of the script:** ( 5 minutes 37.208 seconds)


.. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py:


.. only:: html

  .. container:: sphx-glr-footer
     :class: sphx-glr-footer-example


    .. container:: sphx-glr-download sphx-glr-download-python

       :download:`Download Python source code: 05-layer-norm.py <05-layer-norm.py>`


    .. container:: sphx-glr-download sphx-glr-download-jupyter

       :download:`Download Jupyter notebook: 05-layer-norm.ipynb <05-layer-norm.ipynb>`


.. only:: html

 .. rst-class:: sphx-glr-signature

    `Gallery generated by Sphinx-Gallery <https://sphinx-gallery.github.io>`_