[CODEGEN] Performance improvement on A100 (#125)

Improved codegen for Ampere GPUs:

* Make the layout pass recognize the multistage pipelined pattern.
* The pipeline pass can now automate the multistage pipelining transformation.
* Remove extra barriers (from the prefetch pass and WAR) on Ampere.
* Update the code generator (generator.cc) so that Triton generates n-buffered shared memory loads/stores.
2021-06-21 14:25:13 +08:00
parent 5a51f3e529
commit d8d6b715c8
21 changed files with 855 additions and 174 deletions
--- a/python/test/test_blocksparse.py
+++ b/python/test/test_blocksparse.py
@@ -27,7 +27,7 @@ def test_matmul(MODE, TRANS_A, TRANS_B, BLOCK, DTYPE, Z=3, H=2, M=512, N=384, K=
    op = triton.ops.blocksparse.matmul(layout, BLOCK, MODE, trans_a=TRANS_A, trans_b=TRANS_B)
    ra = triton.testing.sparsify_tensor(a, layout, BLOCK) if MODE == "dsd" else a
    rb = triton.testing.sparsify_tensor(b, layout, BLOCK) if MODE == "dds" else b
-    rc = op(ra, rb)
+    rc = triton.testing.catch_oor(lambda : op(ra, rb), pytest)
    # torch result
    ta = triton.testing.mask_tensor(a, layout, BLOCK) if MODE == "dsd" else a
    tb = triton.testing.mask_tensor(b, layout, BLOCK) if MODE == "dds" else b
--- a/python/test/test_matmul.py
+++ b/python/test/test_matmul.py
@@ -5,56 +5,69 @@ import torch


@pytest.mark.parametrize(
-    "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, M, N, K, AT, BT, DTYPE",
+    "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE",
    itertools.chain(
        *[
            [
                # 1 warp
-                (16, 16, 16, 1, 1, None, None, None, AT, BT, DTYPE),
-                (32, 16, 16, 1, 1, None, None, None, AT, BT, DTYPE),
-                (16, 32, 16, 1, 1, None, None, None, AT, BT, DTYPE),
-                (16, 16, 32, 1, 1, None, None, None, AT, BT, DTYPE),
-                (32, 16, 32, 1, 1, None, None, None, AT, BT, DTYPE),
-                (16, 32, 32, 1, 1, None, None, None, AT, BT, DTYPE),
-                (16, 16, 64, 1, 1, None, None, None, AT, BT, DTYPE),
-                (64, 16, 64, 1, 1, None, None, None, AT, BT, DTYPE),
-                (16, 64, 64, 1, 1, None, None, None, AT, BT, DTYPE),
+                (16, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (32, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (16, 32, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (16, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (32, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (16, 32, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (16, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (64, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),
+                (16, 64, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),
                # 2 warp
-                (64, 32, 64, 1, 2, None, None, None, AT, BT, DTYPE),
-                (32, 64, 64, 1, 2, None, None, None, AT, BT, DTYPE),
-                (64, 32, 16, 1, 2, None, None, None, AT, BT, DTYPE),
-                (32, 64, 16, 1, 2, None, None, None, AT, BT, DTYPE),
-                (128, 32, 32, 1, 2, None, None, None, AT, BT, DTYPE),
-                (32, 128, 32, 1, 2, None, None, None, AT, BT, DTYPE),
+                (64, 32, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE),
+                (32, 64, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE),
+                (64, 32, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE),
+                (32, 64, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE),
+                (128, 32, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE),
+                (32, 128, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE),
                # 4 warp
-                (128, 64, 16, 1, 4, None, None, None, AT, BT, DTYPE),
-                (64, 128, 16, 1, 4, None, None, None, AT, BT, DTYPE),
-                (128, 32, 32, 1, 4, None, None, None, AT, BT, DTYPE),
-                (32, 128, 32, 1, 4, None, None, None, AT, BT, DTYPE),
-                (128, 32, 64, 1, 4, None, None, None, AT, BT, DTYPE),
-                (32, 128, 64, 1, 4, None, None, None, AT, BT, DTYPE),
+                (128, 64, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE),
+                (64, 128, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE),
+                (128, 32, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE),
+                (32, 128, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE),
+                (128, 32, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE),
+                (32, 128, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE),
                # 8 warp
-                (128, 256, 16, 1, 8, None, None, None, AT, BT, DTYPE),
-                (256, 128, 16, 1, 8, None, None, None, AT, BT, DTYPE),
-                (256, 128, 32, 1, 8, None, None, None, AT, BT, DTYPE),
-                # # split-k
-                (64, 64, 16, 2, 4, None, None, None, AT, BT, DTYPE),
-                (64, 64, 16, 4, 4, None, None, None, AT, BT, DTYPE),
-                (64, 64, 16, 8, 4, None, None, None, AT, BT, DTYPE),
-                # # variable input
-                (128, 128, 32, 1, 4, 1024, 1024, 1024, AT, BT, DTYPE),
-                (128, 128, 32, 1, 4, 384, 128, 640, AT, BT, DTYPE),
-                (128, 128, 32, 1, 4, 107, 233, 256, AT, BT, DTYPE),
-                (128, 128, 32, 1, 4, 107, 233, 311, AT, BT, DTYPE),
+                (128, 256, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE),
+                (256, 128, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE),
+                (256, 128, 32, 1, 8, 2, None, None, None, AT, BT, DTYPE),
+                # split-k
+                (64, 64, 16, 2, 4, 2, None, None, None, AT, BT, DTYPE),
+                (64, 64, 16, 4, 4, 2, None, None, None, AT, BT, DTYPE),
+                (64, 64, 16, 8, 4, 2, None, None, None, AT, BT, DTYPE),
+                # variable input
+                (128, 128, 32, 1, 4, 2, 1024, 1024, 1024, AT, BT, DTYPE),
+                (128, 128, 32, 1, 4, 2, 384, 128, 640, AT, BT, DTYPE),
+                (128, 128, 32, 1, 4, 2, 107, 233, 256, AT, BT, DTYPE),
+                (128, 128, 32, 1, 4, 2, 107, 233, 311, AT, BT, DTYPE),
            ] for DTYPE in ["float16", "float32"] for AT in [False, True] for BT in [False, True]
+        ],
+        # n-stage
+        *[
+            [
+                (16, 16, 16, 1, 1, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),
+                (64, 32, 64, 1, 2, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),
+                (128, 64, 16, 1, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),
+                (256, 128, 32, 1, 8, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),
+                (128, 128, 32, 1, 4, STAGES, 384, 128, 640, AT, BT, DTYPE),
+                # split-k
+                (64, 64, 16, 8, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),
+                (64, 64, 16, 8, 4, STAGES, 1024, 1024, 32, AT, BT, DTYPE),
+            ] for DTYPE in ["float16", "float32"] for AT in [False, True] for BT in [False, True] for STAGES in [2, 3, 4]
        ]
    ),
 )
-def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, M, N, K, AT, BT, DTYPE):
+def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE):
    torch.manual_seed(0)
    # nuke kernel decorators -- will set meta-parameters manually
    META = {'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K, 'GROUP_M': 8}
-    configs = [triton.Config(meta=META, num_warps=NWARP)]
+    configs = [triton.Config(meta=META, num_warps=NWARP, num_stages=NSTAGE)]
    kernel = triton.ops._matmul.kernel
    decorators = kernel.kernel_decorators
    kernel.kernel_decorators = []
@@ -72,5 +85,5 @@ def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, M, N, K, AT, BT, DTYPE):
    b = b.t() if BT else b
    # run test
    th_c = torch.matmul(a, b)
-    tt_c = triton.ops.matmul(a, b)
+    tt_c = triton.testing.catch_oor(lambda : triton.ops.matmul(a, b), pytest)
    assert triton.testing.allclose(th_c, tt_c)