[GENERAL] Removed deprecated driver files and added basic compatibility with ROCm (#268)

- Removed the driver module -- the accelerator runtime is now handled by PyTorch
- Added basic support for ROCm based on @micmelesse's PR -- an empty kernel can now be executed on AMD devices without any compile-time changes (see test_noop below)
- Now only using PREFER_SHARED for kernels whose shared-memory usage exceeds 49KB; otherwise it can hurt L1 performance for broadcast tensors (see the sketch after this list)
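
A minimal sketch of the PREFER_SHARED policy described in the last bullet (not the actual Triton code; cu_func and set_cache_config are hypothetical stand-ins for the compiled function handle and a CUDA-driver wrapper, and the exact byte cutoff is an assumption):

    # "49k" threshold from the commit message; exact byte value is an assumption
    SHARED_MEM_THRESHOLD = 49 * 1024

    def choose_cache_config(cu_func, shared_mem_bytes, set_cache_config):
        if shared_mem_bytes > SHARED_MEM_THRESHOLD:
            # large shared-memory kernels get the PREFER_SHARED carveout
            set_cache_config(cu_func, 'PREFER_SHARED')
        else:
            # small shared-memory kernels keep the default split so that
            # broadcast tensors do not lose L1 capacity
            set_cache_config(cu_func, 'PREFER_NONE')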
Author: Philippe Tillet
Date: 2021-09-09 00:04:28 -07:00
Committed by: GitHub
Parent: 8bedcce9be
Commit: 94c83d30ce
47 changed files with 1376 additions and 30232 deletions

@@ -34,6 +34,8 @@ def patch_kernel(template, to_replace):
return kernel
# generic test functions
def _test_unary(dtype_x, expr, torch_expr=None, device='cuda'):
SIZE = 128
@@ -425,7 +427,7 @@ def test_permute(dtype, shape, perm, device='cuda'):
# compare
triton.testing.assert_almost_equal(z_tri, z_ref)
# parse ptx to make sure ld/st are vectorized
- ptx = pgm.asm('ptx')
+ ptx = pgm.asm['ptx']
assert 'ld.global.v4' in ptx
assert 'st.global.v4' in ptx
@@ -484,7 +486,7 @@ def test_dot(epilogue, device='cuda'):
z_ref += z[0,:][None, :]
z_ref = z_ref.to(torch.float16)
# compare
- ptx = pgm.asm('ptx')
+ ptx = pgm.asm['ptx']
# print(ptx)
triton.testing.assert_almost_equal(z_tri, z_ref)
# make sure ld/st are vectorized
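
For context, the two hunks above change pgm.asm from a callable to a mapping indexed by IR name. A hedged, self-contained sketch of how a test might use this to inspect the generated PTX (the kernel, the **meta launch convention, and the pgm handle follow the 2021-era Triton API used in these tests and are assumptions here, not part of the commit):

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_kernel(X, Z, **meta):
        # contiguous 1-to-1 copy, so loads/stores can be vectorized
        off = tl.arange(0, meta['SIZE'])
        tl.store(Z + off, tl.load(X + off))

    x = torch.randn(1024, device='cuda')
    z = torch.empty_like(x)
    pgm = copy_kernel[(1,)](x, z, SIZE=1024)
    ptx = pgm.asm['ptx']                 # previously: pgm.asm('ptx')
    assert 'ld.global' in ptx and 'st.global' in ptx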
@@ -511,3 +513,13 @@ def test_dot(epilogue, device='cuda'):
# ---------------
# test while
# ---------------
# ---------------
# test noop
#----------------
def test_noop(device='cuda'):
@triton.jit
def kernel(**meta):
pass
x = triton.testing.random((1,), dtype=torch.int32, device=device)
kernel[(1, )](x)
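
For the ROCm point in the commit message: on ROCm builds of PyTorch, AMD GPUs are exposed through the same 'cuda' device type, so the no-op test added here can exercise the new backend without any source changes. A hedged usage sketch (torch.version.hip is a standard PyTorch attribute that is None on CUDA builds):

    import torch
    if torch.cuda.is_available():
        # torch.version.hip is a version string on ROCm builds, None on CUDA builds
        backend = 'ROCm' if torch.version.hip else 'CUDA'
        print(f'running noop test on the {backend} backend')
        test_noop(device='cuda')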