[PYTHON] Made codebase PEP 8 compliant

2021-02-07 15:06:57 -05:00
parent 2a02fabdac
commit dffd66bc83
5 changed files with 207 additions and 177 deletions
--- a/python/triton/kernel.py
+++ b/python/triton/kernel.py
@@ -6,6 +6,7 @@ import triton._C.libtriton.triton as _triton
 import triton._C.libtriton.torch_utils as _torch_utils
 # Make sure internal C resources are cleaned up upon exit
 import atexit
@atexit.register
 def cleanup():
    _triton.cleanup()
@@ -50,7 +51,6 @@ def read(path, kernel_names=[]):
    return source
 class kernel:
    def __init__(self, src, device, defines=dict(), num_warps=4, autotune_vals=[], autotune_key=[]):
        # check if src is empty
        if src == '':
--- a/python/triton/ops/blocksparse/softmax.py
+++ b/python/triton/ops/blocksparse/softmax.py
@@ -2,19 +2,13 @@ import triton
 import torch
 import os
-fwd_src = triton.read(os.path.join(os.path.dirname(__file__), 'softmax.c'), 
+fwd_src = triton.read(os.path.join(os.path.dirname(__file__), 'softmax.c'), kernel_names=['forward'])
                      kernel_names=['forward'])
 fwd_kernels = dict()
-
+bwd_src = triton.read(os.path.join(os.path.dirname(__file__), 'softmax.c'), kernel_names=['backward'])
 bwd_src = triton.read(os.path.join(os.path.dirname(__file__), 'softmax.c'), 
                      kernel_names=['backward'])
 bwd_kernels = dict()
 class _softmax(torch.autograd.Function):
    @staticmethod
    def next_power_of_2(n):
        n -= 1
@@ -49,18 +43,21 @@ class _softmax(torch.autograd.Function):
        return lut, int(sizes.max())
    @staticmethod
-    def make_kernel(cache, src, max_k, device, dtype, block, apply_scale, apply_rpe, apply_kp_mask, apply_attn_mask, kp_mask_mode, attn_mask_mode):
+    def make_kernel(cache, src, max_k, device, dtype, block, apply_scale, apply_rpe, apply_kp_mask, apply_attn_mask,
                    kp_mask_mode, attn_mask_mode):
        if max_k >= 32768:
            raise NotImplementedError('Reductions larger than 32768 elements '\
                                      'are not yet implemented')
        num_warps = 4 if max_k < 512 else (8 if max_k < 2048 else 16)
        TN = _softmax.next_power_of_2(max_k)
        # just-in-time compile kernel
-        key = (block, device, dtype, num_warps, TN, apply_scale, apply_rpe, apply_kp_mask, apply_attn_mask, kp_mask_mode, attn_mask_mode)
+        key = (block, device, dtype, num_warps, TN, apply_scale, apply_rpe, apply_kp_mask, apply_attn_mask,
               kp_mask_mode, attn_mask_mode)
        if key not in cache:
-            defines = {'TM': 1, 'TN': TN, 'TYPE': dtype, 'BLOCK': block,
+            defines = {
-                       'INFINITY': {torch.float32: 'F32_INFINITY',
+                'TM': 1, 'TN': TN, 'TYPE': dtype, 'BLOCK': block, 'INFINITY':
-                                    torch.float16: 'F16_INFINITY'}[dtype]}
+                {torch.float32: 'F32_INFINITY', torch.float16: 'F16_INFINITY'}[dtype]
            }
            if apply_scale:
                defines['APPLY_SCALE'] = True
            if apply_rpe:
@@ -78,8 +75,8 @@ class _softmax(torch.autograd.Function):
        return cache[key]
    @staticmethod
-    def forward(ctx, x, scale, rpe, key_padding_mask, attn_mask, kp_mask_mode, attn_mask_mode,
+    def forward(ctx, x, scale, rpe, key_padding_mask, attn_mask, kp_mask_mode, attn_mask_mode, spdims, block, lut,
-                spdims, block, lut, maxlut, bench, time):
+                maxlut, bench, time):
        apply_scale = False if scale == 1.0 else True
        # handle None rpe
@@ -109,17 +106,26 @@ class _softmax(torch.autograd.Function):
            apply_attn_mask = True
            stride_zattnm = attn_mask.stride(0)
        # run kernel
-        kernel = _softmax.make_kernel(fwd_kernels, fwd_src, maxlut*block, x.device, x.dtype, block,
+        kernel = _softmax.make_kernel(fwd_kernels, fwd_src, maxlut * block, x.device, x.dtype, block, apply_scale,
-                                            apply_scale, apply_rpe, apply_kp_mask, apply_attn_mask,
+                                      apply_rpe, apply_kp_mask, apply_attn_mask, kp_mask_mode, attn_mask_mode)
                                            kp_mask_mode, attn_mask_mode)
        M = x.shape[0]
        grid = lambda opt: [triton.cdiv(spdims[0] * spdims[1] * block, opt.TM), M]
        # run kernel
-        kernel(x.data_ptr(), scale, lut.data_ptr(), rpe.data_ptr(), key_padding_mask.data_ptr(), attn_mask.data_ptr(),
+        kernel(x.data_ptr(),
-               maxlut, x.stride(0), stride_zrpe, stride_hrpe, stride_srpe, stride_zkpm, stride_zattnm,
+               scale,
               lut.data_ptr(),
               rpe.data_ptr(),
               key_padding_mask.data_ptr(),
               attn_mask.data_ptr(),
               maxlut,
               x.stride(0),
               stride_zrpe,
               stride_hrpe,
               stride_srpe,
               stride_zkpm,
               stride_zattnm,
               grid=grid)
        # save to context
        ctx.mark_dirty(x)
@@ -166,8 +172,14 @@ class softmax:
        self.bench = bench
        self.lut_cache = dict()
-    def __call__(self, x, scale = 1., rpe = None, key_padding_mask = None, attn_mask = None,
+    def __call__(self,
-            key_padding_mask_mode='add', attn_mask_mode='add'):
+                 x,
                 scale=1.,
                 rpe=None,
                 key_padding_mask=None,
                 attn_mask=None,
                 key_padding_mask_mode='add',
                 attn_mask_mode='add'):
        time_y = [None]
        if rpe is not None and rpe.dtype != x.dtype:
            raise ValueError('relative position embedding must be %s' % x.dtype)
@@ -176,9 +188,6 @@ class softmax:
        if key_padding_mask is not None and key_padding_mask.dtype != x.dtype:
            raise ValueError('Key padding mask must be %s' % x.dtype)
        lut, maxlut = self.make_lut(x.device)
-        x = softmax.apply_softmax(x, scale, rpe, key_padding_mask, attn_mask,
+        x = softmax.apply_softmax(x, scale, rpe, key_padding_mask, attn_mask, key_padding_mask_mode, attn_mask_mode,
-                                  key_padding_mask_mode, attn_mask_mode,
+                                  self.spdims, self.block, lut, maxlut, self.bench, time_y)
                                  self.spdims, self.block,
                                  lut, 
                                  maxlut, self.bench, time_y)
        return x
--- a/python/triton/ops/conv.py
+++ b/python/triton/ops/conv.py
@@ -33,7 +33,12 @@ class _conv(torch.autograd.Function):
                'TN': 64,
                'TK': TK,
                'TZ': 1,
-              'HH': H, 'WW': W, 'PP': P, 'QQ': Q, 'SS': S, 'RR': R,
+                'HH': H,
                'WW': W,
                'PP': P,
                'QQ': Q,
                'SS': S,
                'RR': R,
            }
            idx = torch.arange(CI * R * S)
            ci, r, s = _conv.unpack(idx, CI, R, S)
@@ -45,12 +50,31 @@ class _conv(torch.autograd.Function):
        # allocate output
        c = torch.empty([Z, CO, P, Q], dtype=dtype, device=device)
        # enqueue
-      kernel(a.data_ptr(), b.data_ptr(), c.data_ptr(), 1., Z*P*Q, CO, CI*R*S, 
+        kernel(
-            pad[0], pad[1], stride[0], stride[1],
+            a.data_ptr(),
            b.data_ptr(),
            c.data_ptr(),
            1.,
            Z * P * Q,
            CO,
            CI * R * S,
            pad[0],
            pad[1],
            stride[0],
            stride[1],
            delta.data_ptr(),
-            a.stride(0), a.stride(1), a.stride(2), a.stride(3),
+            a.stride(0),
-            b.stride(0), b.stride(1), b.stride(2), b.stride(3),
+            a.stride(1),
-            c.stride(0), c.stride(1), c.stride(2), c.stride(3),
+            a.stride(2),
            a.stride(3),
            b.stride(0),
            b.stride(1),
            b.stride(2),
            b.stride(3),
            c.stride(0),
            c.stride(1),
            c.stride(2),
            c.stride(3),
            grid=lambda opt: [triton.cdiv(Z * P * Q, opt.TM), triton.cdiv(CO, opt.TN)])
        return c
--- a/python/triton/ops/matmul.py
+++ b/python/triton/ops/matmul.py
@@ -28,9 +28,9 @@ class _matmul(torch.autograd.Function):
        if N % 2 == 0: return 2
        return 1
    _locks = dict()
    _kernels = dict()
    @staticmethod
    def _call(a, b):
        dtype = a.dtype
@@ -55,18 +55,16 @@ class _matmul(torch.autograd.Function):
        key = (device, dtype, is_a_row, is_b_row, lda_pow2_div, ldb_pow2_div, ldc_pow2_div, is_tk_div_k)
        if key not in _matmul._kernels:
            defines = {
-                'TYPE' : dtype,
+                'TYPE': dtype, 'STRIDE_AM': 'lda' if is_a_row else '1', 'STRIDE_AK': '1' if is_a_row else 'lda',
-                'STRIDE_AM'   : 'lda' if is_a_row else '1', 
+                'STRIDE_BK': 'ldb' if is_b_row else '1', 'STRIDE_BN': '1' if is_b_row else 'ldb', 'LDA_POW2_DIV':
-                'STRIDE_AK'   : '1'   if is_a_row else 'lda',
+                lda_pow2_div, 'LDB_POW2_DIV': ldb_pow2_div, 'LDC_POW2_DIV': ldc_pow2_div, 'IS_TK_DIV_K':
-                'STRIDE_BK'   : 'ldb' if is_b_row else '1',
+                int(is_tk_div_k)
                'STRIDE_BN'   : '1'   if is_b_row else 'ldb',
                'LDA_POW2_DIV': lda_pow2_div,
                'LDB_POW2_DIV': ldb_pow2_div,
                'LDC_POW2_DIV': ldc_pow2_div,
                'IS_TK_DIV_K' : int(is_tk_div_k)
            }
-            _matmul._kernels[key] = triton.kernel(_matmul.src, device, defines=defines, 
+            _matmul._kernels[key] = triton.kernel(_matmul.src,
-                                                  autotune_vals = _matmul._CONFIGS, autotune_key=['M', 'N', 'K'])
+                                                  device,
                                                  defines=defines,
                                                  autotune_vals=_matmul._CONFIGS,
                                                  autotune_key=['M', 'N', 'K'])
        kernel = _matmul._kernels[key]
        # # locks for split-k
        if device not in _matmul._locks:
--- a/python/triton/ops/softmax.py
+++ b/python/triton/ops/softmax.py
@@ -4,6 +4,7 @@ import os
 fwd_src = triton.read(os.path.join(os.path.dirname(__file__), 'softmax.c'), kernel_names=['forward'])
 fwd_kernels = dict()
 def get_fwd_kernel(block, dtype, device):
    key = (block, dtype, device)
    if key not in fwd_kernels:
@@ -11,16 +12,14 @@ def get_fwd_kernel(block, dtype, device):
        fwd_kernels[key] = triton.kernel(fwd_src, device=device, defines=defines)
    return fwd_kernels[key]
 class _softmax(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        y = torch.empty_like(x)
        M, N = x.shape
        kernel = get_fwd_kernel(N, x.dtype, x.device)
-        kernel(x.data_ptr(), y.data_ptr(), grid = lambda opt: [M, ])
+        grid = lambda opt: (M, )
        kernel(x.data_ptr(), y.data_ptr(), grid=grid)
        return y
 softmax = _softmax.apply