[GENERAL] Merged v1.0alpha into master. Added features:

- A100 support via mma.16816
- Thread swizzling for conflict-free shared memory accesses without padding (see the sketch below)
- Complete overhaul of the LLVM code generation in codegen/selection/generator.cc to remove overengineering
- Debugging capabilities in the Python bindings
- Compilation errors for kernels that spill registers
Author: Philippe Tillet
Date:   2021-01-11 19:20:34 -05:00
Parent: c0bc7ed8b0
Commit: 083bbd1e8d

75 changed files with 2688 additions and 4512 deletions
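The thread-swizzling item above refers to the standard trick of XOR-ing the column index of a shared-memory tile with its row index, so that a column access spreads across all banks without padding the leading dimension. A minimal Python sketch of the idea (illustrative only; not necessarily the exact mapping Triton emits):

# Sketch (assumed scheme): a 32x32 float tile, 32 shared-memory banks.
WIDTH, BANKS = 32, 32

def banks_touched(swizzled):
    # 32 threads each read one row of the same column
    col = 7
    addrs = [r * WIDTH + ((col ^ r) if swizzled else col) for r in range(WIDTH)]
    return len({a % BANKS for a in addrs})

print(banks_touched(False))  # 1  -> 32-way bank conflict
print(banks_touched(True))   # 32 -> conflict-free, no padding needed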


@@ -1,56 +0,0 @@
import triton
import numpy as np
from enum import Enum
class MODE(Enum):
    TF = 1
    TORCH = 2

try:
    import tensorflow as tf
    mode = MODE.TF
except ModuleNotFoundError:
    pass
try:
    import torch
    mode = MODE.TORCH
except ModuleNotFoundError:
    pass
C, H, W, B = 32, 1, 1, 128
x = np.random.uniform(-1, 1, (C, H, W, B)).astype(np.float32)
gamma = np.random.uniform(-1, 1, C).astype(np.float32)
beta = np.random.uniform(-1, 1, C).astype(np.float32)
dy = np.random.uniform(-1, 1, (C, H, W, B)).astype(np.float32)
if mode == MODE.TORCH:
    fw_x = torch.from_numpy(x).cuda()
    fw_gamma = torch.from_numpy(gamma).cuda()
    fw_beta = torch.from_numpy(beta).cuda()
    fw_dy = torch.from_numpy(dy).cuda()
    # register gradients
    fw_x.requires_grad_(True)
    fw_gamma.requires_grad_(True)
    fw_beta.requires_grad_(True)
    # execute
    fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4)
    fw_y.backward(fw_dy)
if mode == MODE.TF:
    fw_x = tf.placeholder(shape=x.shape, dtype=x.dtype)
    fw_gamma = tf.placeholder(shape=gamma.shape, dtype=gamma.dtype)
    fw_beta = tf.placeholder(shape=beta.shape, dtype=beta.dtype)
    fw_dy = tf.placeholder(shape=dy.shape, dtype=dy.dtype)
    # execute
    fw_y = triton.ops.batchnorm(fw_x, fw_gamma, fw_beta, 1e-4)
    fw_mean, fw_var = tf.nn.moments(fw_x, [1, 2, 3])
    fw_dx, fw_dgamma, fw_dbeta = tf.gradients(fw_y, [fw_x, fw_gamma, fw_beta], fw_dy)
    # run
    sess = tf.InteractiveSession()
    feed_dict = {fw_x: x, fw_gamma: gamma, fw_beta: beta, fw_dy: dy}
    sess.run(tf.global_variables_initializer())
    result = sess.run([fw_dx, fw_dgamma, fw_dbeta], feed_dict=feed_dict)
    print(result)


@@ -1,213 +0,0 @@
import triton
import torch
from torch.utils.cpp_extension import load
import numpy as np
#import utils
from time import time
torch.manual_seed(0)
#torch.backends.cudnn.benchmark = True
configs = []
# Matrix multiplication
MNK = [
(512, 512, 512),
(2048, 2048, 2048),
#(8192, 8192, 8192),
(64, 64, 64000),
(64, 64, 128000),
(256, 256, 64000),
(256, 256, 128000),
(1536, 16, 1536),
(1536, 32, 1536),
(1536, 64, 1536),
# (1536, 128, 1536),
# (4096, 16, 4096),
# (4096, 32, 4096),
# (4096, 64, 4096),
# (4096, 128, 4096),
# (127008, 768, 576)
]
for M, N, K in MNK:
    matmul = lambda a, b: torch.matmul(a, b)
    configs += [([M, K], [K, N], [M, N], matmul, 'mk,kn->mn', dict(), None, None, None)]
#for M, N, K in MNK:
# matmul = lambda a, b: torch.matmul(a.t(), b)
# configs += [([M, K], [M, N], [K, N], None, 'mk,mn->kn', dict(), None, None, None)]
#for M, N, K in MNK:
# matmul = lambda a, b: torch.matmul(a, b.t())
# configs += [([M, N], [K, N], [M, K], None, 'mn,kn->mk', dict(), None, None, None)]
# Relative attention
NTHSE = [
(16, 512, 1, 64, 64),
# (16, 512, 1, 128, 128),
# (16, 512, 1, 256, 256),
# (16, 512, 1, 256, 512),
(16, 512, 8, 64, 64),
# (16, 512, 8, 128, 128),
# (16, 512, 8, 256, 256),
# (16, 512, 8, 256, 512),
# (64, 1024, 1, 64, 64),
(64, 1024, 1, 128, 128),
# (64, 1024, 1, 256, 256),
# (64, 1024, 1, 256, 512),
# (64, 1024, 8, 64, 64),
(64, 1024, 8, 128, 128),
# (64, 1024, 8, 256, 256),
# (64, 1024, 8, 256, 512),
# (128, 1024, 1, 64, 64),
# (128, 1024, 1, 128, 128),
# (128, 1024, 1, 256, 256),
(128, 1024, 1, 256, 512),
# (128, 1024, 8, 64, 64),
# (128, 1024, 8, 128, 128),
# (128, 1024, 8, 256, 256),
#(128, 1024, 8, 256, 512)
]
#for N, T, H, S, E in NTHSE:
# configs += [([N, T, H, S], [H, E, S], [N, H, T, E], None, 'nths,hes->nhte', dict(), None, None, None)]
#for N, T, H, S, E in NTHSE:
# configs += [([N, H, T, E], [N, T, H, S], [H, E, S], None, 'nhte,nths->hes', dict(), None, None, None)]
#for N, T, H, S, E in NTHSE:
# configs += [([N, H, T, E], [H, E, S], [N, T, H, S], None, 'nhte,hes->nths', dict(), None, None, None)]
# 1D Dense convolution
NCHKR = [
#(1, 1152, 12602, 512, 3)
]
for N, C, H, K, R in NCHKR:
    torch_fn = lambda a, b: torch.nn.functional.conv1d(a, b.permute(2, 0, 1))
    configs += [([N, C, H],
                 [C, R, K],
                 [N, K, H - R + 1],
                 torch_fn,
                 'nc(h+r),crk->nkh',
                 dict(), None, None, None)]
# 2D Dense convolution
NCHWKRS = [
#(8, 64, 128, 128, 768, 3, 3),
#(128, 3, 32, 32, 64, 3, 3),
#(1, 1024, 32, 112, 112, 1024, 3, 3),
#(8, 512, 32, 32, 1024, 3, 3)
]
for N, C, G, H, W, K, R, S in NCHWKRS:
    stride = 2
    torch_fn = lambda a, b: torch.nn.functional.conv2d(a, b.permute(3, 0, 1, 2), stride=stride, groups=G)
    P = (H - R + 1) // stride
    Q = (W - S + 1) // stride
    transform_a = lambda a: a.view(N, G, C // G, H, W)
    transform_b = lambda b: b.view(C // G, R, S, G, K // G)
    transform_c = lambda c: c.view(N, K, P, Q)
    configs += [([N, C, H, W],
                 [C // G, R, S, K],
                 [N, G, K // G, P, Q],
                 torch_fn,
                 'ngc(h*2+r)(w*2+s),crsgk->ngkhw',
                 dict(), transform_a, transform_b, transform_c)]
# 3D Dense Convolution
NCDHWKTRS = [
#(8, 32, 27, 100, 100, 64, 3, 3, 3),
#(8, 64, 23, 48, 48, 256, 3, 3, 3),
#(8, 256, 19, 22, 22, 640, 3, 3, 3),
#(8, 640, 15, 36, 36, 384, 3, 3, 3)
]
for N, C, D, H, W, K, T, R, S in NCDHWKTRS:
    torch_fn = lambda a, b: torch.nn.functional.conv3d(a, b.permute(4, 0, 1, 2, 3))
    configs += [([N, C, D, H, W],
                 [C, T, R, S, K],
                 [N, K, D - T + 1, H - R + 1, W - S + 1],
                 torch_fn,
                 'nc(d+t)(h+r)(w+s),ctrsk->nkdhw',
                 dict(), None, None, None)]
# Shift convolution
shift_cuda = torch.utils.cpp_extension.load(
    'shift_cuda', ['kernels/shift_cuda.cpp',
                   'kernels/shift_cuda_kernel.cu'],
    extra_cflags=['-O3'])

class shift(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, shift):
        ctx.save_for_backward(shift)
        return shift_cuda.forward(x, shift)

    @staticmethod
    def backward(ctx, grad_output):
        shift, = ctx.saved_tensors
        grad_output = shift_cuda.backward(grad_output, shift)
        return grad_output, None
NCHWKRS = [
#(8, 64, 128, 128, 128, 3, 3),
#(8, 128, 64, 64, 256, 3, 3),
#(8, 256, 32, 32, 512, 3, 3),
#(8, 512, 32, 32, 1024, 3, 3)
]
for N, C, H, W, K, R, S in NCHWKRS:
    shift_h = np.random.randint(R, size=C, dtype=np.int32) - R//2
    shift_w = np.random.randint(S, size=C, dtype=np.int32) - S//2

    def shift_conv(a, b, **kwargs):
        shift_h, shift_w = kwargs['sh'], kwargs['sw']
        shift_torch = np.column_stack((shift_w*-1, shift_h*-1))
        shift_torch = torch.from_numpy(shift_torch).cuda()
        a = shift.apply(a, shift_torch)
        b = b.permute(1, 0)
        b = b.reshape(b.shape[0], b.shape[1], 1, 1)
        return torch.nn.functional.conv2d(a, b)

    configs += [([N, C, H, W],
                 [C, K],
                 [N, K, H, W],
                 shift_conv,
                 'nc(h + sh[c])(w + sw[c]),ck->nkhw',
                 {'sh': shift_h, 'sw': shift_w},
                 None, None, None)]
# Benchmark
torch.set_num_threads(1)
for a_shape, b_shape, c_shape, torch_fn, expr, arrays, \
        transform_a, transform_b, transform_c in configs:
    dtype = torch.cuda.FloatTensor
    # initialize input tensors
    a = torch.rand(*a_shape).type(dtype).cuda()
    b = torch.rand(*b_shape).type(dtype).cuda()
    # reference output
    if torch_fn:
        rc = torch_fn(a, b, **arrays)
    else:
        rc = torch.einsum(expr, a, b)
    # triton output
    ta = a if transform_a is None else transform_a(a)
    tb = b if transform_b is None else transform_b(b)
    tc = torch.empty(c_shape, device=a.device)
    triton.ops.einsum(expr, ta, tb, tc, arrays=arrays, bench=True)
    ctx = triton.ops._einsum.registry[tc]
    tc = tc if transform_c is None else transform_c(tc)
    # performance relative to equivalent matrix multiplication
    B, M, N, K = ctx.matmul_B, ctx.matmul_M, ctx.matmul_N, ctx.matmul_K
    cmp_eqbmm = True
    if cmp_eqbmm:
        a = torch.rand(B, M, K).type(dtype).cuda()
        b = torch.rand(B, K, N).type(dtype).cuda()
        c = torch.empty((B, M, N), device=a.device).cuda()
        tmmc = triton.ops.einsum('bmk,bkn->bmn', a, b, c, bench=True)
        ratio = triton.ops._einsum.registry[tmmc].forward_ms / ctx.forward_ms
        cmp_str = f'({ratio:4.2f})'
    else:
        cmp_str = ''
    # test and benchmark
    bench = 2. * B * M * N * K / ctx.forward_ms * 1e-3
    diff = (tc - rc).abs().max() / rc.abs().max()
    print(f'{expr:>15}; {str(a_shape):>20}; {str(b_shape):>20}; {bench:4.2f} {cmp_str}; {diff:4.2f}')


@@ -1,42 +0,0 @@
#include <torch/torch.h>
#include <vector>
// CUDA forward declarations
at::Tensor shift_cuda_forward(
    const at::Tensor input,
    const at::Tensor shift);

at::Tensor shift_cuda_backward(
    const at::Tensor grad_input,
    const at::Tensor shift);
// C++ interface
// NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
at::Tensor shift_forward(
    const at::Tensor input,
    const at::Tensor shift) {
  CHECK_INPUT(input);
  CHECK_INPUT(shift);
  return shift_cuda_forward(input, shift);
}

at::Tensor shift_backward(
    const at::Tensor grad_input,
    const at::Tensor shift) {
  CHECK_INPUT(grad_input);
  CHECK_INPUT(shift);
  return shift_cuda_backward(grad_input, shift);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &shift_forward, "Shift forward (CUDA)");
  m.def("backward", &shift_backward, "Shift backward (CUDA)");
}


@@ -1,111 +0,0 @@
#include <ATen/ATen.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>
namespace {
template <typename scalar_t>
__global__ void shift_cuda_forward_kernel(
    const scalar_t* __restrict__ input,
    const int32_t* __restrict__ shift,
    scalar_t* __restrict__ output,
    const int32_t B,
    const int32_t C,
    const int32_t H,
    const int32_t W) {
  const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t size = B*C*H*W;
  const int32_t CHW = C*H*W;
  const int32_t HW = H*W;
  const int32_t b = idx / CHW;
  const int32_t c = (idx - b*CHW) / HW;
  const int32_t h = (idx - b*CHW - c*HW) / W;
  const int32_t w = idx - b*CHW - c*HW - h*W;
  const int32_t target_w = w + shift[2*c];
  const int32_t target_h = h + shift[2*c + 1];
  const int32_t target_idx = b*CHW + c*HW + target_h*W + target_w;
  if (idx < size && target_w >= 0 && target_w < W && target_h >= 0 && target_h < H) {
    output[target_idx] = input[idx];
  }
}

template <typename scalar_t>
__global__ void shift_cuda_backward_kernel(
    const scalar_t* __restrict__ grad_input,
    scalar_t* __restrict__ grad_output,
    const int32_t* __restrict__ shift,
    const int32_t B,
    const int32_t C,
    const int32_t W,
    const int32_t H) {
  const int32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
  const int32_t size = B*C*W*H;
  const int32_t CWH = C*W*H;
  const int32_t WH = W*H;
  const int32_t b = idx / CWH;
  const int32_t c = (idx - b*CWH) / WH;
  const int32_t w = (idx - b*CWH - c*WH) / W;
  const int32_t h = idx - b*CWH - c*WH - w*H;
  const int32_t target_w = w - shift[2*c];
  const int32_t target_h = h - shift[2*c + 1];
  const int32_t target_idx = b*CWH + c*WH + target_w*W + target_h;
  if (idx < size && target_w >= 0 && target_w < W && target_h >= 0 && target_h < H) {
    grad_output[target_idx] = grad_input[idx];
  }
}
} // namespace
at::Tensor shift_cuda_forward(
    const at::Tensor input,
    const at::Tensor shift) {
  const auto B = input.size(0);
  const auto C = input.size(1);
  const auto H = input.size(2);
  const auto W = input.size(3);
  const auto size = B*C*W*H;
  const int threads = 1024;
  const int blocks = (size + threads - 1) / threads;
  auto output = at::zeros_like(input);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input.type(), "shift_forward_cuda", ([&] {
    shift_cuda_forward_kernel<scalar_t><<<blocks, threads>>>(
        input.data<scalar_t>(),
        shift.data<int32_t>(),
        output.data<scalar_t>(),
        B,
        C,
        H,
        W);
  }));
  return output;
}

at::Tensor shift_cuda_backward(
    const at::Tensor grad_input,
    const at::Tensor shift) {
  const auto B = grad_input.size(0);
  const auto C = grad_input.size(1);
  const auto H = grad_input.size(2);
  const auto W = grad_input.size(3);
  const auto size = B*C*W*H;
  const int threads = 1024;
  const int blocks = (size + threads - 1) / threads;
  auto grad_output = at::zeros_like(grad_input);
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(grad_input.type(), "shift_backward_cuda", ([&] {
    shift_cuda_backward_kernel<scalar_t><<<blocks, threads>>>(
        grad_input.data<scalar_t>(),
        grad_output.data<scalar_t>(),
        shift.data<int32_t>(),
        B,
        C,
        H,
        W);
  }));
  return grad_output;
}


@@ -1,109 +0,0 @@
import triton
import numpy
import torch
import itertools
torch.manual_seed(0)
numpy.random.seed(0)
def to_sparse(expr, data, layout, shape, block):
    # shape of result
    sparse = None
    shape_ret = []
    for i, d in enumerate(expr):
        if d.isupper() and sparse is None:
            sparse = i
            shape_ret.append(int(layout.sum()))
        if d.isupper():
            shape_ret.append(block[d])
        else:
            shape_ret.append(shape[i])
    # iterator
    steps = [block[d] if d.isupper() else 1 for d in expr]
    it = [range(0, shape[i], steps[i]) for i in range(len(expr))]
    # create result
    ret = torch.empty(*shape_ret, dtype=data.dtype, device=data.device)
    blockid = 0
    nzblockid = 0
    for curr in itertools.product(*it):
        if all([curr[i] == it[i][0] for i in range(len(curr)) if expr[i].isupper()]):
            blockid = 0
            nzblockid = 0
        data_slice = [slice(curr[i], curr[i] + steps[i], 1) for i in range(len(curr))]
        ret_slice = [slice(0, block[expr[i]], 1) if expr[i].isupper() else slice(curr[i], curr[i] + 1) for i in range(len(curr))]
        ret_slice.insert(sparse, nzblockid)
        if int(layout.view(-1)[blockid]):
            ret[ret_slice] = data[data_slice]
            nzblockid += 1
        blockid += 1
    return ret
def to_dense(expr, data, layout, shape, block):
    sparse = None
    for i, d in enumerate(expr):
        if d.isupper() and sparse is None:
            sparse = i
    ret = torch.zeros(*shape, dtype=data.dtype, device=data.device)
    steps = [block[d] if d.isupper() else 1 for d in expr]
    it = [range(0, shape[i], steps[i]) for i in range(len(expr))]
    blockid = 0
    nzblockid = 0
    for curr in itertools.product(*it):
        if all([curr[i] == it[i][0] for i in range(len(curr)) if expr[i].isupper()]):
            blockid = 0
            nzblockid = 0
        ret_slice = [slice(curr[i], curr[i] + steps[i], 1) for i in range(len(curr))]
        data_slice = [slice(0, block[expr[i]], 1) if expr[i].isupper() else slice(curr[i], curr[i] + 1) for i in range(len(curr))]
        data_slice.insert(sparse, nzblockid)
        if int(layout.view(-1)[blockid]):
            ret[ret_slice] = data[data_slice]
            nzblockid += 1
        blockid += 1
    return ret
def test_expr(expr, shape, blocks):
    # decompose expr
    expr_a, expr_bc = expr.split(",")
    expr_b, expr_c = expr_bc.split("->")
    # check which arguments are sparse
    sparse_a = any(x.isupper() for x in expr_a)
    sparse_b = any(x.isupper() for x in expr_b)
    sparse_c = any(x.isupper() for x in expr_c)
    # allocate data
    shape_a = [shape[d.lower()] for d in expr_a]
    shape_b = [shape[d.lower()] for d in expr_b]
    shape_c = [shape[d.lower()] for d in expr_c]
    ref_a = torch.rand(*shape_a, device='cuda')
    ref_b = torch.rand(*shape_b, device='cuda')
    ref_c = torch.zeros(*shape_c, device='cuda')
    # layouts
    layout_a = [shape[d.lower()]//blocks[d] for d in expr_a if d.isupper()]
    layout_b = [shape[d.lower()]//blocks[d] for d in expr_b if d.isupper()]
    layout_c = [shape[d.lower()]//blocks[d] for d in expr_c if d.isupper()]
    layout_a = torch.randint(0, 2, layout_a, device='cuda')
    layout_b = torch.randint(0, 2, layout_b, device='cuda')
    layout_c = torch.randint(0, 2, layout_c, device='cuda')
    # triton computation
    triton_a = to_sparse(expr_a, ref_a, layout_a, shape_a, blocks) if sparse_a else ref_a
    triton_b = to_sparse(expr_b, ref_b, layout_b, shape_b, blocks) if sparse_b else ref_b
    layouts = {expr_a: layout_a, expr_b: layout_b, expr_c: layout_c}
    triton_c = triton.ops.einsum(expr, triton_a, triton_b, layouts, blocks)
    torch.cuda.synchronize()
    # reference computation
    ref_a = to_dense(expr_a, triton_a, layout_a, shape_a, blocks) if sparse_a else ref_a
    ref_b = to_dense(expr_b, triton_b, layout_b, shape_b, blocks) if sparse_b else ref_b
    ref_c = torch.einsum(expr.lower(), ref_a, ref_b)
    if sparse_c:
        ref_c = to_sparse(expr_c, ref_c, layout_c, shape_c, blocks)
    torch.cuda.synchronize()
    print((ref_c - triton_c).abs().max())
# shape characteristics
test_expr('bHMK,bhkn->bhmn', {'b': 2, 'h': 2, 'm': 256, 'k': 256, 'n': 256}, {'H': 1, 'M': 32, 'K': 32})
test_expr('bhmk,bHKN->bhmn', {'b': 2, 'h': 2, 'm': 256, 'k': 256, 'n': 256}, {'H': 1, 'K': 32, 'N': 32})
test_expr('bhmk,bhkn->bHMN', {'b': 2, 'h': 2, 'm': 256, 'k': 256, 'n': 256}, {'H': 1, 'M': 32, 'N': 32})


@@ -171,7 +171,7 @@ class _conv(torch.autograd.Function):
         _conv.kernel[dtype] = (delta, triton.kernel(_conv.src, num_warps=[2, 4], defines=defines))
         delta, kernel = _conv.kernel[dtype]
         # allocate output
-        c = triton.empty([Z, CO, P, Q], dtype=dtype)
+        c = torch.empty([Z, CO, P, Q], dtype=dtype)
         # enqueue
         grid = lambda opt: [triton.cdiv(Z*P*Q, opt.d('TM')),
                             triton.cdiv(CO, opt.d('TN'))]


@@ -3,6 +3,9 @@ import triton
 class _dot(torch.autograd.Function):
     src = """
+#define STM 4
+#define STN 4
 __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
                     TYPE * B __noalias __readonly __aligned(16),
                     TYPE * C __noalias __aligned(16),
@@ -14,20 +17,26 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
                     int ldb __multipleof(8),
                     int ldc __multipleof(8)) {
   // prologue
-  int ridx = get_program_id(0);
-  int ridy = get_program_id(1);
-  int ridz = get_program_id(2);
-  int gridx = M / TM;
-  int gridy = N / TN;
-  int rid = ridx + ridy * gridx;
-  ridx = rid / gridy;
-  ridy = rid % gridy;
-  int rm[TM] = ridx * TM + 0 ... TM;
-  int rn[TN] = ridy * TN + 0 ... TN;
+  int pid = get_program_id(0);
+  int pidz = get_program_id(2);
+  int gridm = M / TM;
+  int gridn = N / TN;
+  int stgridm = (gridm + STM - 1) / STM;
+  int stgridn = (gridn + STN - 1) / STN;
+  int stid = pid / (STM * STN);
+  int laneid = pid % (STM * STN);
+  int stm = stid / stgridn;
+  int stn = stid % stgridn;
+  int lanem = laneid / STN;
+  int lanen = laneid % STN;
+  int pidm = stm*STM + lanem;
+  int pidn = stn*STN + lanen;
+  int rm[TM] = pidm * TM + 0 ... TM;
+  int rn[TN] = pidn * TN + 0 ... TN;
   // reduction splitting
   K = K / TZ;
-  int rk[TK] = ridz * K + 0 ... TK;
+  int rk[TK] = pidz * K + 0 ... TK;
   // pointers to operands
   int offa[TM, TK] = rk[newaxis, :] * STRIDE_AK + rm[:, newaxis] * STRIDE_AM;
@@ -44,11 +53,11 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
   // reduction loop
   float acc[TM, TN] = 0;
   for(int k = K; k > 0; k -= TK){
-    acc += a @ b;
     bool checka[TM, TK] = k > TK;
     bool checkb[TK, TN] = k > TK;
     pa += TK * STRIDE_AK;
     pb += TK * STRIDE_BK;
+    acc += a @ b;
     a = *?(checka)pa;
     b = *?(checkb)pb;
   }
@@ -56,8 +65,8 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
   TYPE c[TM, TN] = acc;
   // epilogue
-  int rxm[TM] = ridx * TM + 0 ... TM;
-  int rxn[TN] = ridy * TN + 0 ... TN;
+  int rxm[TM] = pidm * TM + 0 ... TM;
+  int rxn[TN] = pidn * TN + 0 ... TN;
   int offc[TM, TN] = rxm[:, newaxis] * ldc + rxn[newaxis, :];
   TYPE* pc[TM, TN] = C + offc;
   bool checkc[TM, TN] = (rxm[:, newaxis] < M) && (rxn[newaxis, :] < N);
@@ -66,7 +75,7 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
   *?(checkc) pc = c;
 #else
   // accumulate partial result using spin-locks
-  int *plock = locks + rid;
+  int *plock = locks + pid;
   int *pcount = plock + get_num_programs(0) * get_num_programs(1);
   for(int repeat = 1; repeat == 1; repeat = atomic_cas(plock, 0, 1));
   int count = *pcount;
@@ -100,7 +109,7 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
                    'STRIDE_BN': '1', 'STRIDE_BK': 'ldb',
                    'TM' : [128],
                    'TN' : [128],
-                   'TK' : [16],
+                   'TK' : [32],
                    'TZ' : [1]
         }
         _dot.kernel[dtype] = triton.kernel(_dot.src, num_warps=[4], defines=defines)
@@ -109,9 +118,10 @@ __global__ void dot(TYPE * A __noalias __readonly __aligned(16),
         M, K = a.shape
         K, N = b.shape
         c = torch.empty([M,N], dtype=dtype, device=a.device)
+        print(kernel.asm('sass', c.device))
+        print(kernel.asm('ptx', c.device))
         # enqueue
-        grid = lambda opt: [triton.cdiv(M, opt.d('TM')),
-                            triton.cdiv(N, opt.d('TN'))]
+        grid = lambda opt: [triton.cdiv(M, opt.d('TM'))*triton.cdiv(N, opt.d('TN'))]
         time = kernel(a, b, c, 1., M, N, K,
                       a.stride(0), b.stride(0), c.stride(0), grid=grid)
         return c
@@ -130,6 +140,4 @@ b = torch.rand((K, N)).cuda().half()
 zc  = torch.matmul(a,b)
 zc_ = dot(a,b)
 print(torch.allclose(zc, zc_))
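The STM/STN changes to the dot kernel above remap the flat program id into a (pidm, pidn) tile coordinate so that consecutive program ids work on tiles inside the same STM x STN super-group, which improves L2 reuse of the A and B operands. A minimal Python transcription of that index arithmetic (a sketch mirroring the Triton-C lines above, not a Triton API):

# Sketch: same arithmetic as the new pid -> (pidm, pidn) remapping in the kernel.
STM, STN = 4, 4   # same values as the new #defines

def remap(pid, gridm, gridn):
    # gridm is kept only to mirror the kernel signature
    stgridn = (gridn + STN - 1) // STN          # super-tiles along N
    stid, laneid = divmod(pid, STM * STN)
    stm, stn = divmod(stid, stgridn)
    lanem, lanen = divmod(laneid, STN)
    return stm * STM + lanem, stn * STN + lanen

# with an 8x8 grid of TM x TN tiles, the first 16 program ids cover the
# top-left 4x4 block of output tiles:
print([remap(pid, 8, 8) for pid in range(16)])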