Revert "[BACKEND] Various bug fixes; making reductions faster (#533)".

This is a more stable commit that produces bitwise-identical code to earlier
versions. Using commits after this one may lead to slightly different numerics.
Philippe Tillet
2022-06-03 11:36:06 -07:00
parent efa04cac1f
commit a60374a597
11 changed files with 65 additions and 173 deletions
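
The "slightly different numerics" called out above boil down to floating-point
addition not being associative: reordering a reduction changes the last bits of
the result. A minimal standalone illustration (not part of the commit):

print((0.1 + 0.2) + 0.3 == 0.1 + (0.2 + 0.3))  # False: grouping changes the result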


@@ -79,7 +79,7 @@ class CMakeBuild(build_ext):
 
     def build_extension(self, ext):
         llvm_include_dir, llvm_library_dir = get_llvm()
-        self.debug = True
+        # self.debug = True
         extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.path)))
         # create build directories
         build_suffix = 'debug' if self.debug else 'release'
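
For context: with `self.debug = True` commented out again, the flag falls back
to the `debug` attribute that setuptools' build_ext manages, which is falsy
unless the user passes --debug, so the suffix logic above selects release
builds by default. A minimal sketch of that selection, assuming the falsy
default:

debug = None  # stand-in for build_ext's default when --debug is not passed
build_suffix = 'debug' if debug else 'release'
assert build_suffix == 'release'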


@@ -698,7 +698,6 @@ def test_reduce1d(dtype_str, shape, device='cuda'):
     rs = RandomState(17)
     x = numpy_random((shape,), dtype_str=dtype_str, rs=rs)
-    x[:] = 1
     # numpy result
     z_ref = np.sum(x).astype(getattr(np, dtype_str))
     # triton result
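
An aside on the removed `x[:] = 1` (a leftover debugging aid): summing ones is
exact in float32, since every partial sum is an integer far below 2**24, so
constant input hides exactly the reduction-order effects that random data
exposes. A standalone sketch, not part of the test:

import numpy as np
ones = np.ones(4096, dtype=np.float32)
print(np.sum(ones) == np.float32(4096))      # True in any summation order
rnd = np.random.RandomState(17).rand(4096).astype(np.float32)
print(np.sum(rnd) == np.sum(rnd[::-1]))      # may be False: order matters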
@@ -1133,25 +1132,3 @@ def test_constexpr_shape():
     x_tri = to_triton(np.empty((256, ), dtype=np.int32))
     kernel[(1,)](x_tri)
     np.testing.assert_equal(to_numpy(x_tri), np.arange(0, 256))
-
-
-# -------------
-# test if
-# -------------
-
-def test_if():
-
-    @triton.jit
-    def kernel(Cond, XTrue, XFalse, Ret):
-        pid = tl.program_id(0)
-        cond = tl.load(Cond)
-        if pid % 2:
-            tl.store(Ret, tl.load(XTrue))
-        else:
-            tl.store(Ret, tl.load(XFalse))
-
-    cond = torch.ones(1, dtype=torch.int32, device='cuda')
-    x_true = torch.tensor([3.14], dtype=torch.float32, device='cuda')
-    x_false = torch.tensor([1.51], dtype=torch.float32, device='cuda')
-    ret = torch.empty(1, dtype=torch.float32, device='cuda')
-    kernel[(1,)](cond, x_true, x_false, ret)
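
Note that the removed test launched the kernel but never verified `ret`. On a
(1,) grid, tl.program_id(0) is 0, so `pid % 2` is falsy and the else branch
stores XFalse; a hypothetical final assertion (never present in the original
test) would have been:

assert ret.item() == x_false.item()  # pid 0 takes the else branch, storing 1.51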


@@ -32,8 +32,6 @@ def _to_tensor(x, builder):
         return _to_tensor(x.value, builder)
     elif isinstance(x, tensor):
         return x
-    elif x is None:
-        return None
     assert False, f'cannot convert {x} to tensor'
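
With the None branch reverted out, `_to_tensor(None, builder)` falls through to
the assert again, so callers must filter out None arguments before conversion.
A toy stand-in for the restored control flow (the real branches and `builder`
argument are elided):

def _to_tensor_sketch(x):
    if isinstance(x, (int, float)):   # stand-in for the real conversion branches
        return ('tensor', x)
    assert False, f'cannot convert {x} to tensor'

_to_tensor_sketch(3)        # converts fine
# _to_tensor_sketch(None)   # would trip the assert, as it did before #533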


@@ -559,7 +559,7 @@ def cast(input: tl.tensor,
          dst_ty: tl.dtype,
          builder: ir.builder) -> tl.tensor:
     src_ty = input.type
-    if src_ty.is_block() and not dst_ty.is_block():
+    if src_ty.is_block():
         dst_ty = tl.block_type(dst_ty, input.type.get_block_shapes())
     if src_ty == dst_ty:
         return input
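
The restored condition wraps the destination dtype into a block type whenever
the source is a block type, rather than only when the destination is still
scalar. A toy sketch of the restored rule, with a tuple standing in for
tl.block_type:

def promote_dst(src_is_block, dst_dtype, src_shape):
    if src_is_block:
        # mirrors: dst_ty = tl.block_type(dst_ty, input.type.get_block_shapes())
        return ('block', dst_dtype, src_shape)
    return dst_dtype

print(promote_dst(True, 'float16', (128, 128)))  # ('block', 'float16', (128, 128))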


@@ -252,7 +252,6 @@ def matmul_kernel(
 
 
 # we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`
 @triton.jit
 def leaky_relu(x):
-    x = x + 1
     return tl.where(x >= 0, x, 0.01 * x)
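
Dropping `x = x + 1` (a stray offset left over from #533's debugging) restores
the standard leaky ReLU. A NumPy reference for what the Triton line computes:

import numpy as np

def leaky_relu_ref(x, negative_slope=0.01):
    # same selection as tl.where(x >= 0, x, 0.01 * x)
    return np.where(x >= 0, x, negative_slope * x)

print(leaky_relu_ref(np.array([-2.0, 0.0, 3.0])))  # [-0.02  0.    3.  ]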
@@ -297,7 +296,7 @@ def matmul(a, b, activation=None):
 torch.manual_seed(0)
 a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
 b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
-triton_output = matmul(a, b, activation=leaky_relu)
+triton_output = matmul(a, b, activation=None)
 torch_output = torch.matmul(a, b)
 print(f"triton_output={triton_output}")
 print(f"torch_output={torch_output}")
@@ -306,8 +305,6 @@ if triton.testing.allclose(triton_output, torch_output):
 else:
     print("❌ Triton and Torch differ")
-    print(matmul_kernel.cache_key)
-    exit()
 
 # %%
 # Benchmark
 # --------------