[BACKEND] MMA->DotOperand conversion for chain dot of float32 tensors (#962)

Co-authored-by: Philippe Tillet <phil@openai.com>
Author: Keren Zhou
Date: 2022-12-08 12:11:51 -08:00
Committed by: GitHub
Parent: 83f3b9165b
Commit: 3ed36dcb4d

2 changed files with 113 additions and 58 deletions
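
For context on what this commit targets: the test below gains a float32 variant of the 'chain-dot' epilogue, where the output of one tl.dot is fed directly into a second tl.dot. The first dot's result is produced in the MMA layout, so feeding it to another dot requires the MMA->DotOperand layout conversion that this change adds for float32 tensors. A minimal illustrative kernel (not the test's kernel; the names, pointer arithmetic, and lack of masking are simplifications):

    import triton
    import triton.language as tl

    @triton.jit
    def chain_dot_kernel(X, Y, W, Z, BLOCK: tl.constexpr):
        # single BLOCK x BLOCK tile, row-major, no masking (illustration only)
        offs = tl.arange(0, BLOCK)
        idx = offs[:, None] * BLOCK + offs[None, :]
        x = tl.load(X + idx)
        y = tl.load(Y + idx)
        w = tl.load(W + idx)
        z = tl.dot(x, y)              # first dot; result is held in an MMA layout
        z = tl.dot(z.to(w.dtype), w)  # chain dot; operand needs a DotOperand layout
        tl.store(Z + idx, z)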

@@ -1071,21 +1071,23 @@ def test_permute(dtype_str, shape, perm, device='cuda'):
 # # ---------------
-@pytest.mark.parametrize("epilogue, allow_tf32, dtype",
-                         [(epilogue, allow_tf32, dtype)
+@pytest.mark.parametrize("M, N, K, epilogue, allow_tf32, dtype",
+                         [(*shape, epilogue, allow_tf32, dtype)
+                          for shape in [(64, 64, 64)]
                           for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
                           for allow_tf32 in [True, False]
-                          for dtype in ['float16']
+                          for dtype in ['float16', 'float32']
                           if not (allow_tf32 and (dtype in ['float16']))])
-def test_dot(epilogue, allow_tf32, dtype, device='cuda'):
+def test_dot(M, N, K, epilogue, allow_tf32, dtype, device='cuda'):
     capability = torch.cuda.get_device_capability()
-    if capability[0] < 80:
+    if capability[0] < 8:
         if dtype == 'int8':
             pytest.skip("Only test int8 on devices with sm >= 80")
         elif dtype == 'float32' and allow_tf32:
             pytest.skip("Only test tf32 on devices with sm >= 80")
-    M, N, K = 64, 64, 64
     torch.backends.cuda.matmul.allow_tf32 = allow_tf32
     num_warps = 4
     trans_a, trans_b = False, False
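
For reference (not part of the diff): torch.cuda.get_device_capability() returns a (major, minor) tuple, e.g. (8, 0) on A100, so the corrected guard compares the major version against 8 rather than 80:

    import torch

    major, minor = torch.cuda.get_device_capability()  # e.g. (8, 0) on sm_80
    if major < 8:
        # pre-Ampere GPU: the int8 and tf32 dot cases above get skipped
        print(f"sm_{major}{minor}: skipping int8/tf32 dot tests")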
@@ -1130,7 +1132,8 @@ def test_dot(epilogue, allow_tf32, dtype, device='cuda'):
         if CHAIN_DOT:
             # tl.store(Zs, z)
             # tl.debug_barrier()
-            z = tl.dot(z.to(tl.float16), tl.load(Ws))
+            w = tl.load(Ws)
+            z = tl.dot(z.to(w.dtype), w)
         tl.store(Zs, z)
     # input
     rs = RandomState(17)
@@ -1180,14 +1183,18 @@ def test_dot(epilogue, allow_tf32, dtype, device='cuda'):
         z_ref = np.matmul(z_ref, w)
     # compare
     # print(z_ref[:,0], z_tri[:,0])
-    np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01)
+    if dtype == 'float32':
+        # XXX: Somehow there's a larger difference when we use float32
+        np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01, atol=1e-3)
+    else:
+        np.testing.assert_allclose(z_ref, to_numpy(z_tri), rtol=0.01)
     # make sure ld/st are vectorized
     ptx = pgm.asm['ptx']
     assert 'ld.global.v4' in ptx
     assert 'st.global.v4' in ptx
     if allow_tf32:
         assert 'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' in ptx
-    elif dtype == 'float32':
+    elif dtype == 'float32' and allow_tf32:
         assert 'mma.sync.aligned.m16n8k8.row.col.f32.tf32.tf32.f32' not in ptx
     elif dtype == 'int8':
         assert 'mma.sync.aligned.m16n8k32.row.col.satfinite.s32.s8.s8.s32' in ptx
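
For reference (not part of the diff): the assertions above grep the generated PTX, which the test reads from the launch handle as pgm.asm['ptx']. A minimal sketch of the same mechanism, assuming a trivial standalone kernel and that the launch handle exposes the asm dict as it does in the test:

    import torch
    import triton
    import triton.language as tl

    @triton.jit
    def copy_kernel(X, Y, BLOCK: tl.constexpr):
        offs = tl.arange(0, BLOCK)
        tl.store(Y + offs, tl.load(X + offs))

    x = torch.randn(1024, device='cuda')
    y = torch.empty_like(x)
    pgm = copy_kernel[(1,)](x, y, BLOCK=1024)
    ptx = pgm.asm['ptx']          # generated PTX as a string
    print('ld.global.v4' in ptx)  # True if the global loads were vectorized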