[GENERAL] Merged v1.0alpha into master. Added features are:

- A100 support via mma.16816
- Thread swizzling for conflict-free shared memory accesses without padding
- Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering
- Added debugging capabilities in the Python binding
- Compilation error for kernels that spill
2021-01-11 19:20:34 -05:00
parent c0bc7ed8b0
commit 083bbd1e8d
75 changed files with 2688 additions and 4512 deletions
--- a/python/examples/tutorials/conv2d.py
+++ b/python/examples/tutorials/conv2d.py
@@ -171,7 +171,7 @@ class _conv(torch.autograd.Function):
          _conv.kernel[dtype] = (delta, triton.kernel(_conv.src, num_warps=[2, 4], defines=defines))
      delta, kernel = _conv.kernel[dtype]
      # allocate output
-      c = triton.empty([Z, CO, P, Q], dtype=dtype)
+      c = torch.empty([Z, CO, P, Q], dtype=dtype)
      # enqueue
      grid = lambda opt: [triton.cdiv(Z*P*Q, opt.d('TM')), 
                          triton.cdiv(CO, opt.d('TN'))]