""" Layer Normalization ==================== """ import torch import triton import triton.language as tl try: # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it # should not be added to extras_require in setup.py. import apex HAS_APEX = True except ModuleNotFoundError: HAS_APEX = False # fmt: off @triton.jit def _layer_norm_fwd_fused( Out, A, Weight, Bias, Mean, Rstd, stride, N, eps, BLOCK_SIZE: tl.constexpr, ): # position of elements processed by this program row = tl.program_id(0) Out += row * stride A += row * stride # compute mean mean = 0 _mean = tl.zeros([BLOCK_SIZE], dtype=tl.float32) for off in range(0, N, BLOCK_SIZE): cols = off + tl.arange(0, BLOCK_SIZE) a = tl.load(A + cols, mask=cols