Improve ROCm support. (#780)

- updates to support ROCm 5.2
- workarounds in tests where NV tools were used unconditionally
- implemented `get_num_blocks()` and `add_memfence()` for AMD GPU
- backported some atomics from history
- added bf16 support
- minor warnings cleanup
- added dockerfile to run on a ROCm enabled machine

Co-authored-by: B1tway <andrew.shukshov@gmail.com>
Co-authored-by: Andrey Shukshov <36711069+B1tway@users.noreply.github.com>
2022-10-14 21:33:42 +03:00
parent 94d5c2e8b5
commit 406d03bfaf
17 changed files with 435 additions and 155 deletions
--- a/python/test/unit/language/test_core.py
+++ b/python/test/unit/language/test_core.py
@@ -369,9 +369,6 @@ def test_atomic_rmw(op, dtype_x, mode, device='cuda'):
    ('float32', 'int32', True)
 ])
 def test_cast(dtype_x, dtype_z, bitcast, device='cuda'):
-    if torch.version.hip is not None:
-        assert 'bfloat' not in dtype_x 
-        assert 'bfloat' not in dtype_z

    SIZE = 1024
    x = triton.testing.random((SIZE, ), dtype=cvt[dtype_x], device=device)
--- a/python/test/unit/operators/test_matmul.py
+++ b/python/test/unit/operators/test_matmul.py
@@ -86,4 +86,4 @@ def test_op(BLOCK_M, BLOCK_N, BLOCK_K, SPLIT_K, NWARP, NSTAGE, M, N, K, AT, BT,
    # run test
    th_c = torch.matmul(a, b)
    tt_c = triton.testing.catch_oor(lambda : triton.ops.matmul(a, b), pytest)
-    triton.testing.assert_almost_equal(th_c, tt_c)
+    triton.testing.assert_almost_equal(th_c, tt_c)