triton/python/test/unit/operators/test_matmul.py

import itertools

import pytest
import torch

import triton
import triton._C.libtriton.triton as _triton


@pytest.mark.parametrize(
    "BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE",
    [
        # Fixed two-stage cases. Each tile row below is
        # (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K);
        # the (AT, BT, DTYPE) suffix is appended for every combination.
        tile + (trans_a, trans_b, dtype_name)
        for dtype_name, trans_a, trans_b in itertools.product(
            ("float16", "bfloat16", "float32"), (False, True), (False, True)
        )
        for tile in (
            # 1 warp
            (16, 16, 16, 1, 1, 2, None, None, None),
            (32, 16, 16, 1, 1, 2, None, None, None),
            (16, 32, 16, 1, 1, 2, None, None, None),
            (16, 16, 32, 1, 1, 2, None, None, None),
            (32, 16, 32, 1, 1, 2, None, None, None),
            (16, 32, 32, 1, 1, 2, None, None, None),
            (16, 16, 64, 1, 1, 2, None, None, None),
            (64, 16, 64, 1, 1, 2, None, None, None),
            (16, 64, 64, 1, 1, 2, None, None, None),
            # 2 warp
            (64, 32, 64, 1, 2, 2, None, None, None),
            (32, 64, 64, 1, 2, 2, None, None, None),
            (64, 32, 16, 1, 2, 2, None, None, None),
            (32, 64, 16, 1, 2, 2, None, None, None),
            (128, 32, 32, 1, 2, 2, None, None, None),
            (32, 128, 32, 1, 2, 2, None, None, None),
            # 4 warp
            (128, 64, 16, 1, 4, 2, None, None, None),
            (64, 128, 16, 1, 4, 2, None, None, None),
            (128, 32, 32, 1, 4, 2, None, None, None),
            (32, 128, 32, 1, 4, 2, None, None, None),
            (128, 32, 64, 1, 4, 2, None, None, None),
            (32, 128, 64, 1, 4, 2, None, None, None),
            # 8 warp
            (128, 256, 16, 1, 8, 2, None, None, None),
            (256, 128, 16, 1, 8, 2, None, None, None),
            (256, 128, 32, 1, 8, 2, None, None, None),
            # split-k
            (64, 64, 16, 2, 4, 2, None, None, None),
            (64, 64, 16, 4, 4, 2, None, None, None),
            (64, 64, 16, 8, 4, 2, None, None, None),
            # variable input
            (128, 128, 32, 1, 4, 2, 1024, 1024, 1024),
            (128, 128, 32, 1, 4, 2, 384, 128, 640),
            (128, 128, 32, 1, 4, 2, 107, 233, 256),
            (128, 128, 32, 1, 4, 2, 107, 233, 311),
        )
    ] + [
        # n-stage cases. Each row below is
        # (BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, M, N, K); the stage
        # count is swept as the innermost combination variable.
        (block_m, block_n, block_k, split_k, nwarp, stages, m, n, k, trans_a, trans_b, dtype_name)
        for dtype_name, trans_a, trans_b, stages in itertools.product(
            ("float16", "bfloat16", "float32"), (False, True), (False, True), (2, 3, 4)
        )
        for block_m, block_n, block_k, split_k, nwarp, m, n, k in (
            (16, 16, 16, 1, 1, 1024, 1024, 1024),
            (64, 32, 64, 1, 2, 1024, 1024, 1024),
            (128, 64, 16, 1, 4, 1024, 1024, 1024),
            (256, 128, 32, 1, 8, 1024, 1024, 1024),
            (128, 128, 32, 1, 4, 384, 128, 640),
            # split-k
            (64, 64, 16, 8, 4, 1024, 1024, 1024),
            (64, 64, 16, 8, 4, 1024, 1024, 32),
        )
    ],
)
def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE):
    """Compare ``triton.ops.matmul`` against ``torch.matmul`` for one forced config.

    The matmul kernel's config list is replaced with a single ``triton.Config``
    built from the block sizes, split-K factor, warp count and stage count given
    by the parameters. ``M``, ``N`` and ``K`` fall back to ``BLOCK_M``,
    ``BLOCK_N`` and ``BLOCK_K * SPLIT_K`` when passed as ``None``. ``AT`` / ``BT``
    request that the corresponding operand be allocated with swapped dimensions
    and handed to the matmul as a transposed view. ``DTYPE`` is the dtype name
    ("float16", "bfloat16" or "float32").

    bfloat16 cases are skipped when the device's compute capability is below 80
    or when ``SPLIT_K`` is not 1.
    """
    capability = _triton.runtime.cc(_triton.runtime.backend.CUDA, torch.cuda.current_device())
    wants_bf16 = DTYPE == "bfloat16"
    if wants_bf16 and capability < 80:
        pytest.skip("Only test bfloat16 on devices with sm >= 80")
    if wants_bf16 and SPLIT_K != 1:
        pytest.skip("bfloat16 matmuls don't allow split_k for now")
    torch.manual_seed(0)

    # Install exactly one config on the matmul kernel so the meta-parameters
    # under test are the only candidates available to it.
    if SPLIT_K == 1:
        zero_output = None
    else:
        # With split-K the output buffer 'C' is zeroed before each launch
        # (presumably because partial results accumulate into it -- confirm
        # against the kernel implementation).
        zero_output = lambda nargs: nargs['C'].zero_()
    triton.ops._matmul.kernel.configs = [
        triton.Config(
            kwargs={'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K},
            num_warps=NWARP,
            num_stages=NSTAGE,
            pre_hook=zero_output,
        )
    ]

    # Unspecified problem sizes default to the tile extents.
    if M is None:
        M = BLOCK_M
    if N is None:
        N = BLOCK_N
    if K is None:
        K = BLOCK_K * SPLIT_K

    torch_dtype = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[DTYPE]

    def make_operand(rows, cols, transposed):
        # Logical shape is always (rows, cols); a transposed operand is
        # allocated as (cols, rows) and returned as its .t() view.
        shape = (cols, rows) if transposed else (rows, cols)
        values = .1 * torch.randn(shape, device="cuda", dtype=torch_dtype)
        return values.t() if transposed else values

    # `a` is drawn before `b`, so the random stream is consumed in a fixed order.
    a = make_operand(M, K, AT)
    b = make_operand(K, N, BT)

    # Reference result first, then the Triton result, then compare.
    expected = torch.matmul(a, b)
    actual = triton.testing.catch_oor(lambda: triton.ops.matmul(a, b), pytest)
    triton.testing.assert_almost_equal(expected, actual)
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`import itertools`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
			`import pytest`
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`import torch`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00			`import triton`
[TESTS] Added bfloat16 tests (#430) 2022-01-14 15:38:32 +08:00			`import triton._C.libtriton.triton as _triton`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`@pytest.mark.parametrize(`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`"BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE",`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`itertools.chain(`
			`*[`
			`[`
			`# 1 warp`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`(16, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 16, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(16, 32, 16, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(16, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 16, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(16, 32, 32, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(16, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(64, 16, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
			`(16, 64, 64, 1, 1, 2, None, None, None, AT, BT, DTYPE),`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# 2 warp`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`(64, 32, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 64, 64, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
			`(64, 32, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 64, 16, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
			`(128, 32, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 128, 32, 1, 2, 2, None, None, None, AT, BT, DTYPE),`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# 4 warp`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`(128, 64, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(64, 128, 16, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(128, 32, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 128, 32, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(128, 32, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(32, 128, 64, 1, 4, 2, None, None, None, AT, BT, DTYPE),`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# 8 warp`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`(128, 256, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE),`
			`(256, 128, 16, 1, 8, 2, None, None, None, AT, BT, DTYPE),`
			`(256, 128, 32, 1, 8, 2, None, None, None, AT, BT, DTYPE),`
			`# split-k`
			`(64, 64, 16, 2, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(64, 64, 16, 4, 4, 2, None, None, None, AT, BT, DTYPE),`
			`(64, 64, 16, 8, 4, 2, None, None, None, AT, BT, DTYPE),`
			`# variable input`
			`(128, 128, 32, 1, 4, 2, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(128, 128, 32, 1, 4, 2, 384, 128, 640, AT, BT, DTYPE),`
			`(128, 128, 32, 1, 4, 2, 107, 233, 256, AT, BT, DTYPE),`
			`(128, 128, 32, 1, 4, 2, 107, 233, 311, AT, BT, DTYPE),`
[TESTS] Added bfloat16 tests (#430) 2022-01-14 15:38:32 +08:00			`] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True]`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`],`
			`# n-stage`
			`*[`
			`[`
			`(16, 16, 16, 1, 1, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(64, 32, 64, 1, 2, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(128, 64, 16, 1, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(256, 128, 32, 1, 8, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(128, 128, 32, 1, 4, STAGES, 384, 128, 640, AT, BT, DTYPE),`
			`# split-k`
			`(64, 64, 16, 8, 4, STAGES, 1024, 1024, 1024, AT, BT, DTYPE),`
			`(64, 64, 16, 8, 4, STAGES, 1024, 1024, 32, AT, BT, DTYPE),`
[TESTS] Added bfloat16 tests (#430) 2022-01-14 15:38:32 +08:00			`] for DTYPE in ["float16", "bfloat16", "float32"] for AT in [False, True] for BT in [False, True] for STAGES in [2, 3, 4]`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`]`
			`),`
[CODEGEN] Major performance improvements on A100 (#70) Improved handling of asynchronous copy, scheduling and synchronization for A100. Now achieving CUTLASS-like performance on large square dense matrix multiplication tasks 2021-02-21 15:19:39 -08:00			`)`
[CODEGEN] Performance improvement on A100 (#125) Improved codegen for the Ampere GPUs. * Make the layout pass recognize the multistage pipelined pattern. * Now the pipeline pass can automate the multistage pipelining transformation. * Remove extra barriers (from the prefetch pass & WAR) on Ampere. * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores. 2021-06-21 14:25:13 +08:00			`def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT, DTYPE):`
[TESTS] Added bfloat16 tests (#430) 2022-01-14 15:38:32 +08:00			`cc = _triton.runtime.cc(_triton.runtime.backend.CUDA, torch.cuda.current_device())`
			`if cc < 80 and DTYPE == "bfloat16":`
			`pytest.skip("Only test bfloat16 on devices with sm >= 80")`
			`if DTYPE == "bfloat16" and SPLIT_K != 1:`
			`pytest.skip("bfloat16 matmuls don't allow split_k for now")`
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`torch.manual_seed(0)`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# nuke kernel decorators -- will set meta-parameters manually`
[LANG] Added support for constexpr (#361) 2021-10-30 00:32:58 -07:00			`kwargs = {'BLOCK_M': BLOCK_M, 'BLOCK_N': BLOCK_N, 'BLOCK_K': BLOCK_K, 'SPLIT_K': SPLIT_K}`
[RUNTIME] Config hook v2.0 (#373) * Add pre_hook to triton.Config * Use argument names in triton.heuristics * Update base perf * Remove meta from heuristics 2021-11-22 03:20:59 +08:00			`pre_hook = None if SPLIT_K == 1 else lambda nargs: nargs['C'].zero_()`
			`configs = [triton.Config(kwargs=kwargs, num_warps=NWARP, num_stages=NSTAGE, pre_hook=pre_hook)]`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`kernel = triton.ops._matmul.kernel`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`kernel.configs = configs`
			`# kernel.run = kernel.run.run.run`

Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# get matrix shape`
			`M = BLOCK_M if M is None else M`
			`N = BLOCK_N if N is None else N`
			`K = BLOCK_K * SPLIT_K if K is None else K`
			`# allocate/transpose inputs`
[TESTS] Added bfloat16 tests (#430) 2022-01-14 15:38:32 +08:00			`DTYPE = {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[DTYPE]`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00			`a = .1 * torch.randn((K, M) if AT else (M, K), device="cuda", dtype=DTYPE)`
			`b = .1 * torch.randn((N, K) if BT else (K, N), device="cuda", dtype=DTYPE)`
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`a = a.t() if AT else a`
			`b = b.t() if BT else b`
Deprecation of Triton-C and Replacement by decorated Python functions (#86) This PR implements a major overhaul of the frontend for Triton, and replaces Triton-C by a pure Python API in which kernels are defined as @triton.jit decorated functions. The documentation and tutorials have also been updated to accommodate these changes. See documentations for more information on the new API 2021-04-20 22:29:40 -04:00			`# run test`
[PYTHON] Added automated benchmark script (#63) This adds a bench functionality to the setup.py that can be used to run the benchmark suite and generates a bunch of csv files (and optionally plots) python setup.py bench python setup.py bench --with-plots python setup.py bench --filter=cross_entropy 2021-02-08 12:16:41 -08:00			`th_c = torch.matmul(a, b)`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00			`tt_c = triton.testing.catch_oor(lambda: triton.ops.matmul(a, b), pytest)`
[CI] Moved from `assert_allclose` to `assert_almost_equal` (#200) 2021-08-12 12:00:30 -07:00			`triton.testing.assert_almost_equal(th_c, tt_c)`