[STYLE] add isort and autopep8 config files and check on CI (#423)

Also fix a few more style issues from the "aggressive" mode of autopep8.
2022-01-07 13:11:34 -08:00
parent 9801aa7b56
commit a70acfec77
11 changed files with 102 additions and 77 deletions
--- a/.github/workflows/integration-tests.yml
+++ b/.github/workflows/integration-tests.yml
@@ -30,6 +30,12 @@ jobs:
          cd python
          pip3 install -e '.[tests]'
      - name: Check imports
        run: "isort -c ./python || ( echo '::error title=Imports not sorted::Please run \"isort ./python\"' ; exit 1 )"
      - name: Check style
        run: "autopep8 -a -r -d --exit-code ./python || ( echo '::error title=Style issues::Please run \"autopep8 -a -r -i ./python\"' ; exit 1 )"
      - name: Unit tests
        run: |
          cd python/test/unit
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -0,0 +1,4 @@
 [settings]
 known_local_folder=triton
 line_length=88
 py_version=36
--- a/python/setup.cfg
+++ b/python/setup.cfg
@@ -1,2 +1,5 @@
 [metadata]
 description_file = README.md
 [pycodestyle]
 ignore = E501,E701,E731
--- a/python/setup.py
+++ b/python/setup.py
@@ -148,6 +148,8 @@ setup(
    ],
    extras_require={
        "tests": [
            "autopep8",
            "isort",
            "numpy",
            "pytest",
            "scipy>=1.7.1",
--- a/python/test/regression/test_performance.py
+++ b/python/test/regression/test_performance.py
@@ -1,4 +1,3 @@
 import triton.language as tl
 import subprocess
 import sys
@@ -7,6 +6,7 @@ import torch
 from numpy import record
 import triton
 import triton.language as tl
 #######################
 # Utilities
--- a/python/triton/__init__.py
+++ b/python/triton/__init__.py
@@ -1,4 +1,4 @@
-# version
+"""isort:skip_file"""
 __version__ = '2.0.0'
 # TODO: torch needs to be imported first
--- a/python/triton/code_gen.py
+++ b/python/triton/code_gen.py
@@ -852,7 +852,7 @@ class Autotuner:
        else:
            config = self.configs[0]
        self.best_config = config
-        if config.pre_hook != None:
+        if config.pre_hook is not None:
            config.pre_hook(self.nargs)
        return self.kernel(*args, num_warps=config.num_warps, num_stages=config.num_stages, **kwargs, **config.kwargs)
--- a/python/triton/language/core.py
+++ b/python/triton/language/core.py
@@ -293,7 +293,7 @@ class block:
        dst_shape = []
        curr = 0
        for sl in slices:
-            if sl == None:
+            if sl is None:
                dst_shape.append(1)
            elif sl == slice(None, None, None):
                dst_shape.append(src_shape[curr])
--- a/python/triton/ops/blocksparse/matmul.py
+++ b/python/triton/ops/blocksparse/matmul.py
@@ -39,21 +39,23 @@ def _sdd_kernel(
    start_am = tl.load(lut + 1)
    offs_am = start_am * BLOCK + (tl.arange(0, TILE_M) % BLOCK)
    offs_ak = tl.arange(0, TILE_K)
-    a_ptrs = A + (off_z * stride_za
+    a_ptrs = A \
-                  + off_h * stride_ha
+        + off_z * stride_za \
-                  + offs_am[:, None] * stride_ma
+        + off_h * stride_ha \
-                  + offs_ak[None, :] * stride_ak)
+        + offs_am[:, None] * stride_ma \
        + offs_ak[None, :] * stride_ak
    # initialize pointers to B
    start_bn = tl.load(lut + 2)
    offs_bn = start_bn * BLOCK + (tl.arange(0, TILE_N) % BLOCK)
    offs_bk = tl.arange(0, TILE_K)
-    b_ptrs = B + (off_z * stride_zb
+    b_ptrs = B \
-                  + off_h * stride_hb
+        + off_z * stride_zb \
-                  + offs_bn[None, :] * stride_nb
+        + off_h * stride_hb \
-                  + offs_bk[:, None] * stride_bk)
+        + offs_bn[None, :] * stride_nb \
-    ## ---------------- ##
+        + offs_bk[:, None] * stride_bk
-    ##    Inner Loop    ##
+    # ---------------- #
-    ## ---------------- ##
+    #    Inner Loop    #
    # ---------------- #
    acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
    for k in range(K, 0, -TILE_K):
        if EVEN_K:
@@ -66,15 +68,16 @@ def _sdd_kernel(
        a_ptrs += TILE_K * stride_ak
        b_ptrs += TILE_K * stride_bk
    c = acc.to(C.dtype.element_ty)
-    ## ---------------- ##
+    # ---------------- #
-    ##    Epilogue      ##
+    #    Epilogue      #
-    ## ---------------- ##
+    # ---------------- #
    offs_cm = tl.arange(0, TILE_M) % BLOCK
    offs_cn = tl.arange(0, TILE_N) % BLOCK
-    pc = C + (off_z * stride_zc
+    pc = C \
-              + block_id * stride_hc
+        + off_z * stride_zc \
-              + offs_cm[:, None] * stride_mc
+        + block_id * stride_hc \
-              + offs_cn[None, :] * stride_nc)
+        + offs_cm[:, None] * stride_mc \
        + offs_cn[None, :] * stride_nc
    tl.store(pc, c, mask=True)
@@ -168,9 +171,9 @@ def _dsd_kernel(
        + off_h * stride_hb \
        + offs_bn[None, :] * stride_bn \
        + offs_bk[:, None] * stride_bk
-    ## ---------------- ##
+    # ---------------- #
-    ##    Inner Loop    ##
+    #    Inner Loop    #
-    ## ---------------- ##
+    # ---------------- #
    acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
    pinc += 2
    inc_a = tl.load(pinc + 1)
@@ -192,7 +195,8 @@ def _dsd_kernel(
    # initialize pointers to C
    offs_cm = column * TILE_M + tl.arange(0, TILE_M)
    offs_cn = pid_m * TILE_N + tl.arange(0, TILE_N)
-    pc = C + off_h * stride_hc \
+    pc = C \
        + off_h * stride_hc \
        + pidz * stride_zc \
        + offs_cm[:, None] * stride_cm \
        + offs_cn[None, :] * stride_cn
@@ -224,7 +228,6 @@ def dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=N
    TILE_N = 128
    # compute output
    grid = lambda meta: [triton.cdiv(BS3, meta['TILE_N']), width, BS0]
    # fmt: off
    _dsd_kernel[grid](
        a, b, c,
        a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3),
@@ -237,6 +240,7 @@ def dsd_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=N
    # exit()
    return c
 def dsd_lut(layout, block, step, trans, device):
    sizes = torch.sum(layout, 2 if trans else 1)
    head_id, col_id = sizes.nonzero(as_tuple=True)
@@ -317,6 +321,8 @@ def dsd_lut(layout, block, step, trans, device):
 # -----------------------------
 # Dense = Dense x Sparse (DDS)
 # -----------------------------
@triton.jit
 def _dds_kernel(
    A, B, C,
@@ -361,9 +367,9 @@ def _dds_kernel(
        + block_id * stride_hb \
        + offs_bn[None, :] * stride_bn \
        + offs_bk[:, None] * stride_bk
-    ## ---------------- ##
+    # ---------------- #
-    ##    Inner Loop    ##
+    #    Inner Loop    #
-    ## ---------------- ##
+    # ---------------- #
    acc = tl.zeros((TILE_M, TILE_N), dtype=tl.float32)
    for k in range(AS1, 0, -TILE_K):
        a = tl.load(ptrs_a, mask=offs_am[:, None] < DS0)
@@ -377,9 +383,9 @@ def _dds_kernel(
        inc_a = inc_a * stride_ka
        ptrs_a += inc_a
        ptrs_b += inc_b
-    ## ---------------- ##
+    # ---------------- #
-    ##    Epilogue      ##
+    #    Epilogue      #
-    ## ---------------- ##
+    # ---------------- #
    c = acc.to(C.dtype.element_ty)
    # initialize pointers to C (dense)
    offs_cm = pid_m * TILE_M + tl.arange(0, TILE_M)
@@ -391,6 +397,7 @@ def _dds_kernel(
    # write back
    tl.store(ptrs_c, c, mask=offs_cm[:, None] < DS0)
 def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out=None):
    if a.stride(2) != 1 and a.stride(3) != 1:
        a = a.contiguous()
@@ -414,7 +421,6 @@ def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out =
        c = out
    TILE_M = {16: 256, 32: 256, 64: 128, 128: 128}[block]
    grid = lambda meta: [triton.cdiv(AS2, meta['TILE_M']), width, AS0]
    # fmt: off
    _dds_kernel[grid](
        a, b, c,
        a.stride(0), a.stride(1), a.stride(3 if trans_a else 2), a.stride(2 if trans_a else 3),
@@ -429,6 +435,8 @@ def dds_matmul(a, b, trans_a, trans_b, trans_c, spdims, block, lut, width, out =
 ##############
 #  MAIN API  #
 ##############
 class _matmul(torch.autograd.Function):
    fn = {'sdd': sdd_matmul, 'dsd': dsd_matmul, 'dds': dds_matmul}
@@ -477,6 +485,7 @@ class _matmul(torch.autograd.Function):
            None, None, None, None,\
            None, None, None, None, None, dout
 class matmul:
    def __init__(self, layout, block, mode, device, trans_a=False, trans_b=False, trans_c=False):
--- a/python/triton/tools/disasm.py
+++ b/python/triton/tools/disasm.py
@@ -52,7 +52,7 @@ def processSassLines(fline, sline, labels):
        asm = asm[:-2] + ";"
    ctrl = parseCtrl(sline)
    # BRA target address
-    if BRA_RE.match(asm) != None:
+    if BRA_RE.match(asm) is not None:
        target = int(BRA_RE.match(asm).group(2), 16)
        if target in labels:
            pass
@@ -62,7 +62,7 @@ def processSassLines(fline, sline, labels):
 def extract(file_path, fun):
-    if fun == None:
+    if fun is None:
        sass_str = subprocess.check_output(["cuobjdump", "-sass", file_path])
    else:
        sass_str = subprocess.check_output(["cuobjdump", "-fun", fun, "-sass", file_path])
@@ -77,7 +77,7 @@ def extract(file_path, fun):
        #                 /*0x...*/
        fname_match = FNAME_RE.match(line)
        # Looking for new function header (function: <name>)
-        while FNAME_RE.match(line) == None:
+        while FNAME_RE.match(line) is None:
            line_idx += 1
            if line_idx < len(sass_lines):
                line = sass_lines[line_idx].decode()
@@ -94,7 +94,7 @@ def extract(file_path, fun):
        # store sass asm in buffer and them print them (for labels)
        # (ctrl, asm)
        asm_buffer = []
-        while FLINE_RE.match(line) != None:
+        while FLINE_RE.match(line) is not None:
            # First line (Offset ASM Encoding)
            fline = sass_lines[line_idx].decode()
            line_idx += 1
--- a/python/tutorials/02-fused-softmax.py
+++ b/python/tutorials/02-fused-softmax.py
@@ -16,10 +16,11 @@ You will learn about:
 # Custom GPU kernels for elementwise additions are educationally valuable but won't get you very far in practice.
 # Let us consider instead the case of a simple (numerically stabilized) softmax operation:
 import triton.language as tl
 import triton
 import torch
 import triton
 import triton.language as tl
@torch.jit.script
 def naive_softmax(x):