triton/python/test/unit/runtime/test_cache.py

import multiprocessing
import os
import re
import shutil
from collections import namedtuple

import pytest
import torch

import triton
import triton.language as tl
from triton.runtime.jit import JITFunction

tmpdir = ".tmp"


# Helper called from `kernel` and `kernel_nospec`: adds 1 to `i`, then passes
# the result through `function_2` and returns it.
# NOTE: the cache-key tests in this file rewrite the literal text 'i + 1' in
# this function's `.src` (see `apply_src_change`), so that expression must
# stay spelled exactly as written.
@triton.jit
def function_1(i):
    i = i + 1
    i = function_2(i)
    return i


# Innermost helper of the call chain kernel -> function_1 -> function_2:
# returns `i + 1`. Its cached `hash` is cleared by `apply_src_change` before
# every cache-key computation.
@triton.jit
def function_2(i):
    i = i + 1
    return i


# Main test kernel: increments `i`, runs it through `function_1` and stores
# the result to `X`. `BLOCK` is a constexpr that the body never reads.
# NOTE: `test_toplevel_change` rewrites the literal text 'i + 1' in this
# kernel's `.src`, so that expression must stay spelled exactly as written.
@triton.jit
def kernel(X, i, BLOCK: tl.constexpr):
    i = i + 1
    i = function_1(i)
    tl.store(X, i)


# Same body as `kernel`, but declared with `do_not_specialize=["i"]`.
# `test_specialize` launches it with several values of `i` and expects a
# single cache-hook invocation in that mode.
@triton.jit(do_not_specialize=["i"])
def kernel_nospec(X, i, BLOCK: tl.constexpr):
    i = i + 1
    i = function_1(i)
    tl.store(X, i)


def apply_src_change(target, old, new):
    """Return ``target.cache_key`` computed while ``old`` is rewritten to ``new``.

    The textual replacement is applied to the ``src`` of both ``function_1``
    and ``target``; the cache key of ``target`` is read while the rewrite is
    in effect, and then every touched source is put back.

    Args:
        target: JIT function whose cache key is returned.
        old: Source snippet to replace.
        new: Replacement snippet.

    Returns:
        The value of ``target.cache_key`` observed with the rewritten sources.
    """
    def _clear_hashes():
        # Force the cache keys to be recomputed from the current sources.
        kernel.hash = None
        function_1.hash = None
        function_2.hash = None

    _clear_hashes()
    # Snapshot the pristine sources up front.  Restoring from the snapshot
    # (instead of replacing `new` back with `old`) cannot corrupt text that
    # already contained `new`, and it also restores `function_1` when it is
    # not the target.
    pristine = [(fn, fn.src) for fn in (function_1, target)]
    try:
        for fn, src in pristine:
            fn.src = src.replace(old, new)
        return target.cache_key
    finally:
        for fn, src in pristine:
            fn.src = src
        # Drop hashes derived from the temporary sources so later callers
        # do not observe a stale key.
        _clear_hashes()


def test_nochange():
    """Replacing a snippet with itself must leave the kernel's cache key unchanged."""
    key_before = kernel.cache_key
    key_after = apply_src_change(kernel, 'i + 1', 'i + 1')
    assert key_after == key_before


def test_toplevel_change():
    """Editing the kernel's own source must produce a different cache key."""
    key_before = kernel.cache_key
    key_after = apply_src_change(kernel, 'i + 1', 'i + 2')
    assert key_after != key_before


def test_nested1_change():
    """Editing `function_1` must yield a key different from the kernel baseline."""
    key_before = kernel.cache_key
    # NOTE(review): apply_src_change returns the cache key of its *target*
    # (function_1 here), so this compares kernel's key against function_1's
    # key - confirm that this is the intended check.
    key_after = apply_src_change(function_1, 'i + 1', 'i + 2')
    assert key_after != key_before


def reset_tmp_dir():
    """Point TRITON_CACHE_DIR at `tmpdir` and delete that directory if present."""
    os.environ["TRITON_CACHE_DIR"] = tmpdir
    cache_dir_present = os.path.exists(tmpdir)
    if not cache_dir_present:
        return
    shutil.rmtree(tmpdir)


def test_reuse():
    """Launching the same kernel ten times must invoke the cache hook once."""
    counter = 0

    def inc_counter(*args, **kwargs):
        nonlocal counter
        counter += 1

    JITFunction.cache_hook = inc_counter
    try:
        reset_tmp_dir()
        x = torch.empty(1, dtype=torch.int32, device='cuda')
        for _ in range(10):
            kernel[(1,)](x, 1, BLOCK=1024)
    finally:
        # The hook is global state on JITFunction; clear it so it does not
        # leak into tests that run afterwards, even when a launch fails.
        JITFunction.cache_hook = None
    assert counter == 1


@pytest.mark.parametrize('mode', ['enable', 'disable'])
def test_specialize(mode):
    """Count cache-hook invocations with and without specialization of `i`.

    With `mode == 'enable'` the plain `kernel` is launched and three hook
    invocations are expected; with `mode == 'disable'` `kernel_nospec` is
    launched and a single invocation is expected.
    """
    counter = 0

    def inc_counter(*args, **kwargs):
        nonlocal counter
        counter += 1

    function = {'enable': kernel, 'disable': kernel_nospec}[mode]
    # presumably the three variants are i == 1, i divisible by 16, and
    # everything else - TODO confirm against the specialization rules.
    target = {'enable': 3, 'disable': 1}[mode]

    JITFunction.cache_hook = inc_counter
    try:
        reset_tmp_dir()
        x = torch.empty(1, dtype=torch.int32, device='cuda')
        for i in [1, 2, 4, 8, 16, 32]:
            function[(1,)](x, i, BLOCK=512)
    finally:
        # The hook is global state on JITFunction; clear it so it does not
        # leak into tests that run afterwards, even when a launch fails.
        JITFunction.cache_hook = None
    assert counter == target


@pytest.mark.parametrize("value, value_type", [
    (-1, 'i32'), (0, 'i32'), (1, 'i32'), (-2**31, 'i32'), (2**31 - 1, 'i32'),
    (2**32, 'i64'), (2**63 - 1, 'i64'), (-2**63, 'i64'),
    (2**31, 'u32'), (2**32 - 1, 'u32'), (2**63, 'u64'), (2**64 - 1, 'u64')
])
def test_value_specialization(value: int, value_type: str, device='cuda') -> None:
    """Check the integer type reported for `VALUE` in the cache hook's `repr`.

    Args:
        value: Python integer passed as the kernel's `VALUE` argument.
        value_type: Type name expected after ``VALUE: `` in the hook's `repr`.
        device: Torch device on which the dummy tensor argument is allocated.
    """

    @triton.jit
    def kernel(VALUE, X):
        pass

    cache_str = None

    def get_cache_str(*args, **kwargs):
        nonlocal cache_str
        cache_str = kwargs["repr"]

    triton.JITFunction.cache_hook = get_cache_str
    try:
        reset_tmp_dir()
        x = torch.tensor([3.14159], device=device)
        kernel[(1, )](value, x)
    finally:
        # Always clear the global hook, even when the launch raises.
        triton.JITFunction.cache_hook = None

    # Fail with a clear message instead of a TypeError from re.match(None).
    assert cache_str is not None, "cache hook was never invoked"
    cache_str_match = re.match(r".*VALUE: (\w+).*", cache_str)
    spec_type = None if cache_str_match is None else cache_str_match.group(1)
    assert spec_type == value_type


def test_constexpr_not_callable() -> None:
    """A string constexpr must launch; a callable constexpr must be rejected."""
    @triton.jit
    def kernel(X, c: tl.constexpr):
        tl.store(X, 2)

    x = torch.empty(1, dtype=torch.int32, device='cuda')

    # Launch directly: if this raises, pytest reports the real traceback
    # instead of a bare "assert error is False".
    kernel[(1, )](x, c="str")

    # Passing a callable as the constexpr is expected to fail.  Catch
    # Exception rather than BaseException so KeyboardInterrupt / SystemExit
    # are not counted as the expected error.
    with pytest.raises(Exception):
        kernel[(1, )](x, c=tl.abs)


def test_jit_warmup_cache() -> None:
    """`warmup` must add one cache entry and later warmups must not add more."""
    @triton.jit
    def kernel_add(a, b, o, N: tl.constexpr):
        idx = tl.arange(0, N)
        tl.store(o + idx,
                 tl.load(a + idx) + tl.load(b + idx))

    n_elements = 32
    # Three float32 CUDA tensors (a, b, o) followed by the constexpr N.
    args = [torch.randn(n_elements, dtype=torch.float32, device="cuda")
            for _ in range(3)]
    args.append(n_elements)

    assert len(kernel_add.cache) == 0

    # First warmup passes dtypes in place of tensors.
    kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,))
    assert len(kernel_add.cache) == 1

    # Warming up with real tensors, twice, must not grow the cache.
    for _ in range(2):
        kernel_add.warmup(*args, grid=(1,))
        assert len(kernel_add.cache) == 1


def test_compile_in_subproc() -> None:
    """Run `triton.compile` in a child process and require exit code 0."""
    @triton.jit
    def kernel_sub(a, b, o, N: tl.constexpr):
        idx = tl.arange(0, N)
        tl.store(o + idx,
                 tl.load(a + idx) - tl.load(b + idx) * 777)

    # Compute capability of device 0 encoded as major * 10 + minor.
    capability = torch.cuda.get_device_capability(0)
    cc = capability[0] * 10 + capability[1]

    descriptor_type = namedtuple("instance_descriptor",
                                 ["divisible_by_16", "equal_to_1"])
    config = descriptor_type(divisible_by_16=tuple(range(4)), equal_to_1=())

    compile_kwargs = {
        "fn": kernel_sub,
        "signature": {0: "*fp32", 1: "*fp32", 2: "*fp32"},
        "device": 0,
        "constants": {3: 32},
        "configs": [config],
        "warm_cache_only": True,
        "cc": cc,
    }

    proc = multiprocessing.Process(target=triton.compile, kwargs=compile_kwargs)
    proc.start()
    proc.join()
    assert proc.exitcode == 0
[FRONTEND] Make triton.compile work without a cuda context (#708) This allows compiling in a subprocess. I'm not seeing a ton of speedup from this, but figure it is a good change anyway. 2022-09-24 13:41:47 -07:00			`import multiprocessing`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`import os`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00			`import re`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`import shutil`
[FRONTEND] Make triton.compile work without a cuda context (#708) This allows compiling in a subprocess. I'm not seeing a ton of speedup from this, but figure it is a good change anyway. 2022-09-24 13:41:47 -07:00			`from collections import namedtuple`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`import pytest`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00			`import torch`

			`import triton`
			`import triton.language as tl`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`from triton.runtime.jit import JITFunction`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00
			`tmpdir = ".tmp"`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`@triton.jit`
			`def function_1(i):`
			`i = i + 1`
			`i = function_2(i)`
			`return i`


			`@triton.jit`
			`def function_2(i):`
			`i = i + 1`
			`return i`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`@triton.jit`
			`def kernel(X, i, BLOCK: tl.constexpr):`
			`i = i + 1`
			`i = function_1(i)`
			`tl.store(X, i)`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`@triton.jit(do_not_specialize=["i"])`
			`def kernel_nospec(X, i, BLOCK: tl.constexpr):`
			`i = i + 1`
			`i = function_1(i)`
			`tl.store(X, i)`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`def apply_src_change(target, old, new):`
[FRONTEND] Bunch of fixes here and there (#436) 2022-01-20 10:55:59 -08:00			`kernel.hash = None`
			`function_1.hash = None`
			`function_2.hash = None`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`function_1.src = function_1.src.replace(old, new)`
			`target.src = target.src.replace(old, new)`
			`ret = target.cache_key`
			`target.src = target.src.replace(new, old)`
			`return ret`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`def test_nochange():`
			`baseline = kernel.cache_key`
			`updated = apply_src_change(kernel, 'i + 1', 'i + 1')`
			`assert baseline == updated`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`def test_toplevel_change():`
			`baseline = kernel.cache_key`
			`updated = apply_src_change(kernel, 'i + 1', 'i + 2')`
			`assert baseline != updated`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`def test_nested1_change():`
			`baseline = kernel.cache_key`
			`updated = apply_src_change(function_1, 'i + 1', 'i + 2')`
			`assert baseline != updated`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`def reset_tmp_dir():`
			`os.environ["TRITON_CACHE_DIR"] = tmpdir`
			`if os.path.exists(tmpdir):`
			`shutil.rmtree(tmpdir)`

[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`def test_reuse():`
			`counter = 0`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[FRONTEND] improved caching mechanism (#474) Co-authored-by: Greg Brockman <gdb@gregbrockman.com> Co-authored-by: Christopher Hesse <christopherhesse@users.noreply.github.com> 2022-03-15 12:20:51 -07:00			`def inc_counter(args, *kwargs):`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`nonlocal counter`
			`counter += 1`
			`JITFunction.cache_hook = inc_counter`
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`reset_tmp_dir()`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`x = torch.empty(1, dtype=torch.int32, device='cuda')`
			`for i in range(10):`
[FRONTEND] Better cache hook (#400) Added an additional `repr` argument to the cache hook, which represents a human-readable string representation of the signature and argument attributes associated with the compiled binary. 2021-12-21 21:29:47 -08:00			`kernel[(1,)](x, 1, BLOCK=1024)`
[RUNTIME] Bunch of bugfixes (#372) 2021-11-12 00:55:00 -08:00			`assert counter == 1`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00
			`@pytest.mark.parametrize('mode', ['enable', 'disable'])`
			`def test_specialize(mode):`
			`counter = 0`
[STYLE] run autopep8 and isort (#421) Run: ``` isort ./python autopep8 -i --ignore E501,E701,E731 $(find ./python/ -name '*.py') ``` with an `.isort.cfg` and then clean up a few warts. This PR should be a no-op; the idea is that this is all boring whitespace changes, and any config file changes will be in a different change to make it easier to review. 2022-01-06 14:34:17 -08:00
[FRONTEND] improved caching mechanism (#474) Co-authored-by: Greg Brockman <gdb@gregbrockman.com> Co-authored-by: Christopher Hesse <christopherhesse@users.noreply.github.com> 2022-03-15 12:20:51 -07:00			`def inc_counter(args, *kwargs):`
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`nonlocal counter`
			`counter += 1`
			`JITFunction.cache_hook = inc_counter`
			`reset_tmp_dir()`
			`x = torch.empty(1, dtype=torch.int32, device='cuda')`
			`function = {'enable': kernel, 'disable': kernel_nospec}[mode]`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`target = {'enable': 3, 'disable': 1}[mode]`
[RUNTIME] Restored `do_not_specialize` (#374) 2021-11-12 15:06:55 -08:00			`for i in [1, 2, 4, 8, 16, 32]:`
			`function[(1,)](x, i, BLOCK=512)`
			`assert counter == target`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00

			`@pytest.mark.parametrize("value, value_type", [`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`(-1, 'i32'), (0, 'i32'), (1, 'i32'), (-231, 'i32'), (231 - 1, 'i32'),`
			`(232, 'i64'), (263 - 1, 'i64'), (-2**63, 'i64'),`
			`(231, 'u32'), (232 - 1, 'u32'), (263, 'u64'), (264 - 1, 'u64')`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00			`])`
			`def test_value_specialization(value: int, value_type: str, device='cuda') -> None:`

			`@triton.jit`
			`def kernel(VALUE, X):`
			`pass`

			`cache_str = None`

			`def get_cache_str(args, *kwargs):`
			`nonlocal cache_str`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`cache_str = kwargs["repr"]`
			`triton.JITFunction.cache_hook = get_cache_str`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00			`reset_tmp_dir()`
			`x = torch.tensor([3.14159], device='cuda')`
			`kernel[(1, )](value, x)`
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`triton.JITFunction.cache_hook = None`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00
[FRONTEND] Complete rewrite of the runtime (#644) This PR completely rewrites the runtime of Triton to be more lean and clearly separate the compilation step from the just-in-time caching logic. This should substantially reduce launch overhead. 2022-09-18 08:51:48 -07:00			`cache_str_match = re.match(r".VALUE: (\w+).", cache_str)`
[FRONTEND] Semantic analysis refactor (#491) Moved dispatch.cc to semantic.py (@ptillet) Integer signedness analysis was moved from C++ to python (@daadaada) Cleaner frontend types (@daadaada) Moved SSA construction to a separate object (@ptillet) Co-authored-by: Yan Da <dyanab@connect.ust.hk> 2022-04-06 16:13:53 -07:00			`spec_type = None if cache_str_match is None else cache_str_match.group(1)`
			`assert spec_type == value_type`
[FRONTEND] Refresh cache when the source code of outlined functions are changed (#590) 2022-07-20 17:34:07 -07:00

			`def test_constexpr_not_callable() -> None:`
			`@triton.jit`
			`def kernel(X, c: tl.constexpr):`
			`tl.store(X, 2)`

			`x = torch.empty(1, dtype=torch.int32, device='cuda')`
			`error = False`
			`try:`
			`kernel[(1, )](x, c="str")`
			`except BaseException:`
			`error = True`
			`assert error is False`
			`# try and catch`
			`try:`
			`kernel[(1, )](x, c=tl.abs)`
			`except BaseException:`
			`error = True`
			`assert error is True`
[FRONTEND] Add warmup for triton.jit() (#684) This revives #671 , removing the static functions that may unnecessarily hold a reference to the grid and the JITFunction object Co-authored-by: Jason Ansel <jansel@jansel.net> 2022-09-21 12:13:20 -07:00

			`def test_jit_warmup_cache() -> None:`
			`@triton.jit`
			`def kernel_add(a, b, o, N: tl.constexpr):`
			`idx = tl.arange(0, N)`
			`tl.store(o + idx,`
			`tl.load(a + idx) + tl.load(b + idx))`

			`args = [`
			`torch.randn(32, dtype=torch.float32, device="cuda"),`
			`torch.randn(32, dtype=torch.float32, device="cuda"),`
			`torch.randn(32, dtype=torch.float32, device="cuda"),`
			`32,`
			`]`
			`assert len(kernel_add.cache) == 0`
			`kernel_add.warmup(torch.float32, torch.float32, torch.float32, 32, grid=(1,))`
			`assert len(kernel_add.cache) == 1`
			`kernel_add.warmup(*args, grid=(1,))`
			`assert len(kernel_add.cache) == 1`
			`kernel_add.warmup(*args, grid=(1,))`
			`assert len(kernel_add.cache) == 1`
[FRONTEND] Make triton.compile work without a cuda context (#708) This allows compiling in a subprocess. I'm not seeing a ton of speedup from this, but figure it is a good change anyway. 2022-09-24 13:41:47 -07:00

			`def test_compile_in_subproc() -> None:`
			`@triton.jit`
			`def kernel_sub(a, b, o, N: tl.constexpr):`
			`idx = tl.arange(0, N)`
			`tl.store(o + idx,`
			`tl.load(a + idx) - tl.load(b + idx) * 777)`

			`major, minor = torch.cuda.get_device_capability(0)`
			`cc = major * 10 + minor`
			`config = namedtuple("instance_descriptor", [`
			`"divisible_by_16", "equal_to_1"])(`
			`tuple(range(4)),`
			`())`

			`proc = multiprocessing.Process(`
			`target=triton.compile,`
			`kwargs=dict(`
			`fn=kernel_sub,`
			`signature={0: "fp32", 1: "fp32", 2: "*fp32"},`
			`device=0,`
			`constants={3: 32},`
			`configs=[config],`
			`warm_cache_only=True,`
			`cc=cc,`
			`))`
			`proc.start()`
			`proc.join()`
			`assert proc.exitcode == 0`