Revert "[BACKEND] Various bug fixes; making reductions faster (#533)".

This is a more stable commit that produces bitwise-identical code to earlier
versions. Using commits after this one may lead to slightly different numerics.
Philippe Tillet
2022-06-03 11:36:06 -07:00
parent efa04cac1f
commit a60374a597
11 changed files with 65 additions and 173 deletions
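
The "bitwise identical" claim above can be sanity-checked along the following
lines. This is a minimal sketch, not part of the commit: it assumes the
tutorial's `matmul` is importable, and `reference.pt` is a hypothetical tensor
saved from a run under an earlier Triton build.

import torch
from matmul_tutorial import matmul  # hypothetical import of the tutorial's matmul

torch.manual_seed(0)
a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
out = matmul(a, b, activation=None)
# `reference.pt` is a hypothetical file holding the output saved from an
# earlier build of Triton.
ref = torch.load('reference.pt')
# torch.equal requires exact element-wise equality, unlike triton.testing.allclose.
print("bitwise identical:", torch.equal(out, ref))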


@@ -252,7 +252,6 @@ def matmul_kernel(
 # we can fuse `leaky_relu` by providing it as an `ACTIVATION` meta-parameter in `_matmul`
 @triton.jit
 def leaky_relu(x):
-    x = x + 1
     return tl.where(x >= 0, x, 0.01 * x)
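
For context, the `ACTIVATION` meta-parameter mentioned in the comment above
works by passing a `@triton.jit` function as a `tl.constexpr` argument, so the
activation is inlined into the kernel at compile time. The following is a
minimal illustrative sketch of that mechanism; the element-wise `apply_kernel`
is hypothetical, not the tutorial's matmul kernel.

import torch
import triton
import triton.language as tl

@triton.jit
def leaky_relu(x):
    return tl.where(x >= 0, x, 0.01 * x)

@triton.jit
def apply_kernel(x_ptr, y_ptr, n, BLOCK: tl.constexpr, ACTIVATION: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    if ACTIVATION:          # resolved at compile time, since ACTIVATION is a constexpr
        x = ACTIVATION(x)   # the jitted activation is inlined here, not called at runtime
    tl.store(y_ptr + offs, x, mask=mask)

x = torch.randn(1024, device='cuda')
y = torch.empty_like(x)
grid = (triton.cdiv(x.numel(), 256),)
apply_kernel[grid](x, y, x.numel(), BLOCK=256, ACTIVATION=leaky_relu)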
@@ -297,7 +296,7 @@ def matmul(a, b, activation=None):
 torch.manual_seed(0)
 a = torch.randn((512, 512), device='cuda', dtype=torch.float16)
 b = torch.randn((512, 512), device='cuda', dtype=torch.float16)
-triton_output = matmul(a, b, activation=leaky_relu)
+triton_output = matmul(a, b, activation=None)
 torch_output = torch.matmul(a, b)
 print(f"triton_output={triton_output}")
 print(f"torch_output={torch_output}")
@@ -306,8 +305,6 @@ if triton.testing.allclose(triton_output, torch_output):
 else:
     print("❌ Triton and Torch differ")
-print(matmul_kernel.cache_key)
-exit()
 # %%
 # Benchmark
 # --------------