[FRONTEND] Backport new runtime from master (#706)

This PR merges the new runtime back into the `triton-mlir` branch. It adds
caching and just-in-time compilation functionality to the triton-mlir
project and paves the way for reusing tests from the master branch.
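
A minimal sketch of the launch path this backport enables is shown below. The
kernel signature and keyword-argument launch mirror the updated test; the
`@triton.jit` decorator and the claim that repeated launches reuse the cached
binary are assumptions based on the master-branch runtime, not something this
diff shows directly.

```python
import torch
import triton
import triton.language as tl


# Same signature as the empty_kernel used by the updated tests; the
# @triton.jit decorator is an assumption (it lies outside the hunks shown here).
@triton.jit
def empty_kernel(X, stride_xm, BLOCK: tl.constexpr):
    pass


X = torch.zeros([1024], device="cuda")
grid = lambda META: (triton.cdiv(1024, META['BLOCK']),)

# The first launch JIT-compiles the kernel; the backported runtime caches the
# compiled binary, so a second launch with the same constants should reuse it.
empty_kernel[grid](X=X, stride_xm=256, BLOCK=256)
empty_kernel[grid](X=X, stride_xm=256, BLOCK=256)
```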
Commit 22ec22c257 (parent ecd1bc33df)
Philippe Tillet authored 2022-09-23 16:09:43 -07:00, committed by GitHub
13 changed files with 790 additions and 419 deletions


@@ -2,7 +2,6 @@ import torch
 import triton
 import triton.language as tl
-import triton.runtime as runtime

 # trigger the torch.device implicitly to ensure cuda context initialization
 torch.zeros([10], device=torch.device('cuda'))
@@ -16,30 +15,18 @@ def empty_kernel(X, stride_xm, BLOCK: tl.constexpr):
 def test_empty_kernel_cubin_compile():
     device = torch.cuda.current_device()
-    cubin = triton.compile(empty_kernel,
-                           "*fp32,i32,i32",
-                           device=device,
-                           constants={"BLOCK": 256},
-                           output="cubin")
+    kernel = triton.compile(empty_kernel,
+                            "*fp32,i32,i32",
+                            device=device,
+                            constants={"BLOCK": 256})
-    print('cubin size:', len(cubin))
-    assert len(cubin) > 0
+    assert len(kernel.asm["cubin"]) > 0


 def test_empty_kernel_launch():
-    device = torch.cuda.current_device()
-    binary = runtime.build_kernel(empty_kernel, "*fp32,i32,i32",
-                                  constants={"BLOCK": 256},
-                                  num_warps=4,
-                                  num_stages=3)
     grid = lambda META: (
         triton.cdiv(1024, META['BLOCK']) * triton.cdiv(1024, META['BLOCK']),
     )
     A = torch.zeros([1024], device="cuda")
-    runtime.launch_kernel(kernel=binary,
-                          grid=grid,
-                          device=device,
-                          X=A,
-                          stride_xm=256,
-                          BLOCK=tl.constexpr(256))
+    empty_kernel[grid](X=A, stride_xm=256, BLOCK=256)
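
For reference, the updated compile path from `test_empty_kernel_cubin_compile`
reduces to the following standalone sketch. It assumes a CUDA-capable device
and that `empty_kernel` is declared with `@triton.jit`; only the `"cubin"`
entry of `kernel.asm` is exercised by this diff, so no other entries are
assumed.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def empty_kernel(X, stride_xm, BLOCK: tl.constexpr):
    pass


# Trigger torch.device implicitly to ensure CUDA context initialization,
# mirroring what the test file does before compiling.
torch.zeros([10], device=torch.device('cuda'))

device = torch.cuda.current_device()
# The new compile entry point returns a kernel object; its generated
# artifacts are exposed through the .asm dict (the test checks "cubin").
kernel = triton.compile(empty_kernel,
                        "*fp32,i32,i32",
                        device=device,
                        constants={"BLOCK": 256})
assert len(kernel.asm["cubin"]) > 0
```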