[GENERAL] Removed deprecated driver files and added basic compatibility with ROCm (#268)

- Removed the driver module -- the accelerator runtime is now handled by PyTorch
- Added basic support for ROCm based on @micmelesse's PR -- an empty kernel can now be executed on AMD devices without any compile-time changes (see test_noop below)
- Now only using PREFER_SHARED for kernels whose shared-memory usage exceeds 49KB; otherwise it can hurt L1 performance for broadcast tensors (see the sketch after this list)
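
A minimal sketch of the PREFER_SHARED policy described in the last bullet (not the actual Triton code; cu_func and set_cache_config are hypothetical stand-ins for the compiled function handle and a CUDA-driver wrapper, and the exact byte cutoff is an assumption):

    # "49k" threshold from the commit message; exact byte value is an assumption
    SHARED_MEM_THRESHOLD = 49 * 1024

    def choose_cache_config(cu_func, shared_mem_bytes, set_cache_config):
        if shared_mem_bytes > SHARED_MEM_THRESHOLD:
            # large shared-memory kernels get the PREFER_SHARED carveout
            set_cache_config(cu_func, 'PREFER_SHARED')
        else:
            # small shared-memory kernels keep the default split so that
            # broadcast tensors do not lose L1 capacity
            set_cache_config(cu_func, 'PREFER_NONE')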
Author: Philippe Tillet
Date: 2021-09-09 00:04:28 -07:00
Committed by: GitHub
Parent: 8bedcce9be
Commit: 94c83d30ce
47 changed files with 1376 additions and 30232 deletions

@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
return kernel
# generic test functions
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
SIZE = 128
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
# compare
triton.testing.assert_almost_equal(z_tri, z_ref)
# parse ptx to make sure ld/st are vectorized
- ptx = pgm.asm('ptx')
+ ptx = pgm.asm['ptx']
assert 'ld.global.v4' in ptx
assert 'st.global.v4' in ptx
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
z_ref += z[0,:][None, :]
z_ref = z_ref.to(torch.float16)
# compare
- ptx = pgm.asm('ptx')
+ ptx = pgm.asm['ptx']
# print(ptx)
triton.testing.assert_almost_equal(z_tri, z_ref)
# make sure ld/st are vectorized
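
For context, the two hunks above change pgm.asm from a callable to a mapping indexed by IR name. A hedged, self-contained sketch of how a test might use this to inspect the generated PTX (the kernel, the **meta launch convention, and the pgm handle follow the 2021-era Triton API used in these tests and are assumptions here, not part of the commit):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_kernel(X, Z, **meta):
        # contiguous 1-to-1 copy, so loads/stores can be vectorized
        off = tl.arange(0, meta['SIZE'])
        tl.store(Z + off, tl.load(X + off))

    x = torch.randn(1024, device='cuda')
    z = torch.empty_like(x)
    pgm = copy_kernel[(1,)](x, z, SIZE=1024)
    ptx = pgm.asm['ptx']                 # previously: pgm.asm('ptx')
    assert 'ld.global' in ptx and 'st.global' in ptx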
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
# ---------------
# test while
# ---------------
# ---------------
# test noop
#----------------
def test_noop(device='cuda'):
@triton.jit
def kernel(**meta):
pass
x = triton.testing.random((1,), dtype=torch.int32, device=device)
kernel[(1, )](x)
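
For the ROCm point in the commit message: on ROCm builds of PyTorch, AMD GPUs are exposed through the same 'cuda' device type, so the no-op test added here can exercise the new backend without any source changes. A hedged usage sketch (torch.version.hip is a standard PyTorch attribute that is None on CUDA builds):

    import torch
    if torch.cuda.is_available():
        # torch.version.hip is a version string on ROCm builds, None on CUDA builds
        backend = 'ROCm' if torch.version.hip else 'CUDA'
        print(f'running noop test on the {backend} backend')
        test_noop(device='cuda')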