ugh bug in shift-conv striding

2019-07-10 17:00:22 -07:00
parent f665c742f9
commit 4ca83f1935
5 changed files with 143 additions and 26 deletions
--- a/examples/python/pytorch/run.py
+++ b/examples/python/pytorch/run.py
@@ -6,10 +6,97 @@ import torch.nn.functional as F
 import torch.optim as optim
 from torchvision import datasets, transforms
 import triton
 from torch.utils.cpp_extension import load
 from torch.distributions import categorical
-class Net(nn.Module):
+shift_cuda = load(
    'shift_cuda', ['/home/philippe/development/shiftnet/kernels/shift_cuda.cpp',
                   '/home/philippe/development/shiftnet/kernels/shift_cuda_kernel.cu'], extra_cflags=['-O3'])
 class shift(torch.autograd.Function):
    """Autograd wrapper around the ``shift_cuda`` extension.

    ``forward`` and ``backward`` delegate to ``shift_cuda.forward`` and
    ``shift_cuda.backward`` respectively; the shift tensor is stashed on the
    context so the backward pass can reuse it.
    """

    @staticmethod
    def forward(ctx, x, shift):
        """Return ``shift_cuda.forward(x, shift)`` and remember ``shift``."""
        ctx.save_for_backward(shift)
        shifted = shift_cuda.forward(x, shift)
        return shifted

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient w.r.t. ``x``; ``shift`` gets no gradient."""
        (offsets,) = ctx.saved_tensors
        grad_input = shift_cuda.backward(grad_output, offsets)
        # One entry per forward() argument: (x, shift).
        return grad_input, None
 class Shift(nn.Module):
    """Apply a fixed, randomly sampled integer shift to every channel.

    At construction time two offsets are drawn per channel from a categorical
    distribution over ``kernel_size`` positions and re-centred by
    ``kernel_size // 2``, so every offset lies in
    ``[-(kernel_size // 2), kernel_size // 2]``.  The offsets are stored in
    the int32 buffer ``shift_t`` of shape ``(in_channels, 2)``.

    Args:
        in_channels: number of channels to sample offsets for.
        kernel_size: one of 3, 5, 7 or 9.

    Raises:
        RuntimeError: if ``kernel_size`` is not supported.
    """

    # Sampling probabilities for each supported kernel size.  For size 3 all
    # the mass sits on the centre position, so every sampled offset is 0.
    _SHIFT_PROBS = {
        3: [0., 1., 0.],
        5: [0.1, 0.25, 0.3, 0.25, 0.1],
        7: [0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075],
        9: [0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05],
    }

    def __init__(self, in_channels, kernel_size):
        super(Shift, self).__init__()
        self.channels = in_channels
        self.kernel_size = kernel_size
        if kernel_size not in self._SHIFT_PROBS:
            raise RuntimeError('Unsupported kernel size')
        p = torch.Tensor(self._SHIFT_PROBS[kernel_size])
        # Sampled indices are in [0, kernel_size); subtracting the half-width
        # centres them around zero.
        shift_t = categorical.Categorical(p).sample((in_channels, 2)) - (kernel_size // 2)
        self.register_buffer('shift_t', shift_t.int())

    def forward(self, x):
        """Shift ``x`` using the stored offsets.

        Raises:
            RuntimeError: if ``x`` is not a CUDA tensor.  An exception is
                raised (instead of ``assert False``) so the check still
                fires when Python runs with ``-O``.
        """
        if not x.is_cuda:
            raise RuntimeError('Shift only supports GPU for now..')
        return shift.apply(x, self.shift_t)

    def extra_repr(self):
        """Return the ``channels, kernel_size=...`` summary used by ``repr``."""
        return '{}, kernel_size={}'.format(self.channels, self.kernel_size)
 def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilation=1):
    """Build a shift layer followed by a bias-free 1x1 convolution.

    ``kernel_size`` only configures the ``Shift`` layer; the convolution
    itself always uses a 1x1 kernel with no padding.  ``stride`` and
    ``groups`` are forwarded to the convolution.  ``dilation`` is accepted
    but not used by this function.
    """
    shift_layer = Shift(in_planes, kernel_size)
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        groups=groups,
        bias=False,
    )
    return nn.Sequential(shift_layer, pointwise)
 # Network built from two triton.ShiftConv2d layers, each followed by
 # nn.BatchNorm2d and ReLU, then two fully connected layers; forward()
 # returns log-softmax scores over 10 outputs.
 class NetReference(nn.Module):
    def __init__(self):
-        super(Net, self).__init__()
+        super(NetReference, self).__init__()
        # The positional arguments (3, 2) are presumably kernel_size and
        # stride, as in the local ShiftConv2d signature -- TODO confirm
        # against triton.ShiftConv2d.
        #self.conv1 = ShiftConv2d(1, 32, 3, 2)
        self.conv1 = triton.ShiftConv2d(1, 32, 3, 2)
        self.bn1 = nn.BatchNorm2d(32)
        #self.conv2a = ShiftConv2d(32, 32, 3, 1)
        self.conv2b = triton.ShiftConv2d(32, 32, 3, 2)
        #self.conv2b = ShiftConv2d(32, 32, 3, 2)
        self.bn2 = nn.BatchNorm2d(32)
        # 32*7*7 must match the flattened size used in forward(); this
        # assumes a 7x7 feature map after both convolutions -- TODO confirm.
        self.fc1 = nn.Linear(32*7*7, 500)
        self.fc2 = nn.Linear(500, 10)
    def forward(self, x):
        # Move dimension 0 to the end before each triton convolution and
        # back to the front afterwards, so that the nn.BatchNorm2d layers
        # see the original dimension order.  Presumably dimension 0 is the
        # batch and the triton op wants it last -- verify against
        # triton.ShiftConv2d.
        x = x.permute(1, 2, 3, 0).contiguous()
        x = self.conv1(x)
        x = x.permute(3, 0, 1, 2).contiguous()
        x = self.bn1(x)
        x = F.relu(x)
        x = x.permute(1, 2, 3, 0).contiguous()
        x = self.conv2b(x)
        x = x.permute(3, 0, 1, 2).contiguous()
        x = self.bn2(x)
        x = F.relu(x)
        # Flatten everything but the leading dimension for the linear layers.
        x = x.view(-1, 32*7*7)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
 class NetTriton(nn.Module):
    def __init__(self):
        super(NetTriton, self).__init__()
        self.conv1 = triton.ShiftConv2d(1, 32, 3, 2)
        self.bn1 = triton.BatchNorm2d(32)
        self.conv2 = triton.ShiftConv2d(32, 64, 3, 2)
@@ -23,6 +110,7 @@ class Net(nn.Module):
        x = self.bn1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = x.permute(3, 0, 1, 2).contiguous()
        x = x.view(-1, 64*7*7)
@@ -30,6 +118,8 @@ class Net(nn.Module):
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)
 Net = NetReference()
 def train(args, model, device, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
@@ -107,7 +197,7 @@ def main():
        batch_size=args.test_batch_size, shuffle=True, **kwargs)
-    model = Net().to(device)
+    model = Net.to(device)
    optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
    for epoch in range(1, args.epochs + 1):
--- a/examples/python/pytorch/shift.cpp
+++ b/examples/python/pytorch/shift.cpp
@@ -53,6 +53,8 @@ torch::Tensor shift_y(
    int64_t R, int64_t S,
    int64_t stride_h, int64_t stride_w,
    const torch::Tensor shift_h, const torch::Tensor shift_w) {
  CHECK_INPUT(x);
  CHECK_INPUT(w);
  // shapes for a
  int64_t Ca  = x.size(0);
  int64_t H   = x.size(1);
@@ -76,6 +78,8 @@ torch::Tensor shift_dx(
    int64_t R, int64_t S,
    int64_t stride_h, int64_t stride_w,
    const torch::Tensor shift_h, const torch::Tensor shift_w) {
  CHECK_INPUT(dy);
  CHECK_INPUT(w);
  // shapes for a
  int64_t Ca  = dy.size(0);
  int64_t H   = dy.size(1);
@@ -104,6 +108,8 @@ torch::Tensor shift_dw(
    int64_t R, int64_t S,
    int64_t stride_h, int64_t stride_w,
    const torch::Tensor shift_h, const torch::Tensor shift_w) {
  CHECK_INPUT(dy);
  CHECK_INPUT(x);
  // shapes for a
  int64_t F    = dy.size(0);
  int64_t Ha   = dy.size(1);
--- a/examples/python/pytorch/triton.py
+++ b/examples/python/pytorch/triton.py
@@ -1,5 +1,6 @@
 import torch
 import math
 import numpy as np
 from torch.nn.modules.utils import _single, _pair, _triple
 from torch.distributions import categorical
@@ -117,11 +118,13 @@ class ShiftConvFunction(torch.autograd.Function):
        shift_w = ctx.shift_w
        dx = dw = dbias = None
        if ctx.needs_input_grad[0]:
-            dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
+            dx = torch.ops.triton.shift_conv_dx(dy.contiguous(), weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
        if ctx.needs_input_grad[1]:
-            dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
+            dw = torch.ops.triton.shift_conv_dw(dy.contiguous(), input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
        if ctx.needs_input_grad[2]:
            dbias = torch.sum(dy, (1, 2, 3))
        #print('dx', ctx.needs_input_grad[0], np.isnan(dx.cpu().numpy()).any())
        #print('dw', ctx.needs_input_grad[1], np.isnan(dw.cpu().numpy()).any())
        return dx, dw, dbias, None, None, None, None
@@ -149,7 +152,7 @@ class _ShiftConvNd(torch.nn.Module):
    def make_shift(self, kernel_size):
        if kernel_size == 3:
-            p = torch.Tensor([0.3, 0.4, 0.3])
+            p = torch.Tensor([0., 1., 0.])
        elif kernel_size == 5:
            p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1])
        elif kernel_size == 7:
--- a/examples/python/tensorflow/run.py
+++ b/examples/python/tensorflow/run.py
@@ -58,8 +58,8 @@ def blocksparse_matmul_grad(op, dy):
    return (dx, dw)
 def run_shift():
-    B, C, H, W = 16, 16, 4, 4
+    B, C, H, W = 16, 1, 4, 4
-    R, S, F = 3, 3, 4
+    R, S, F = 3, 3, 32
    stride_h, stride_w = 2, 2
    np.random.seed(2)
    a = tf.placeholder(tf.float32, shape=[C, H, W, B])
@@ -68,8 +68,8 @@ def run_shift():
    hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32)
    c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w))
    # feed values
-    ha = np.random.rand(C, H, W, B)
+    ha = np.ones((C, H, W, B), dtype=np.float32)
-    hb = np.random.rand(C, F)
+    hb = np.ones((C, F), dtype=np.float32)
    sess = tf.InteractiveSession()
    # test
    grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B),
@@ -128,5 +128,5 @@ def run_batchnorm():
    print(np.max(np.abs(dg_t - dg_n)))
    print(np.max(np.abs(db_t - db_n)))
-#run_shift()
+run_shift()
-run_batchnorm()
+#run_batchnorm()
--- a/lib/dnn/shift.cpp
+++ b/lib/dnn/shift.cpp
@@ -158,7 +158,7 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel,
  unsigned TM = ranges[0], TN = ranges[1];
  std::array<size_t, 3> grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1};
  if(ty_ == BPROP)
-    ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4);
+    ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*stride_h_*stride_w_*4);
  stream->enqueue(kernel, grid, {nthreads, 1, 1});
 }
@@ -217,6 +217,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a,
 if(ty_ == FPROP){
  os << R"(
  int32 rawhc[TM] = rxa / ABS;
  int32 rab[TM] = rxa % ABS;
  int32 raw[TM] = (rawhc % AW)*stride_w;
  int32 rahc[TM] = rawhc / AW;
  int32 rah[TM] = (rahc % AH)*stride_h;
@@ -227,26 +228,32 @@ if(ty_ == FPROP){
  int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis];
  int32 inc_true[TM, TK] = d[newaxis, :];
  int32 inc_false[TM, TK] = rka[newaxis, :] * lda;
-  int32 inc[TM, TK] = interior ? inc_true : inc_false;)";
+  int32 inc[TM, TK] = interior ? inc_true : inc_false;
  rxa = rab + raw*ABS + rah*ABS*AW;
  int32 offa0[TM, TK] = rxa[:, newaxis];)";
 }
 else{
  os << "  int32 offa0[" << AS << "] = rxa" << bca1 << lda1 << ";" << std::endl;
 }
 if(ty_ == WGRAD){
  os << R"(
  __constant__ int32* pd[TN] = delta + ryb;
  int32 d[TN] = *pd;
-  int32 shift[TK, TN] = d[newaxis, :];)";
+  int32 shift[TK, TN] = d[newaxis, :];
  int32 rbwhc[TK] = rkb / ABS;
  int32 rbw[TK] = (rbwhc % AW)*stride_w;
  int32 rbhc[TK] = rbwhc / AW;
  int32 rbh[TK] = (rbhc % AH)*stride_h;
  )";
 }
  os << R"(
-  )" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(;
+  )" << a_ty_ << "* pa[" << AS << "] = a + offa0 + " << rka << bca0 << lda0 << R"(;
  )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(;
  int1 checka[)" << AS << "] = (rka < K)" << bca0  << R"(;
  int1 checkb[)" << BS << "] = (rkb < K)" << bcb0  << R"(;
  )" << a_ty_ << "   a[" << AS << R"(] = checka ? *pa : 0;)";
 if(ty_ == WGRAD){
  os << R"(
    int32 rbwhc[TK] = rkb / ABS;
    int32 rbw[TK] = (rbwhc % AW)*stride_w;
    int32 rbhc[TK] = rbwhc / AW;
    int32 rbh[TK] = (rbhc % AH)*stride_h;
    int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));
    int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));
    int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis];
@@ -301,17 +308,28 @@ else{
  os << R"(
  }
  int32 rxc[TM] = get_global_range[TM](0);
-  int32 ryc[TN] = get_global_range[TN](1);
+  int32 ryc[TN] = get_global_range[TN](1);)";
-  fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis];
+  if(ty_ == BPROP){
  os << R"(
  int32 rcwhc[TM] = rxc / ABS;
  int32 rcb[TM] = rxc % ABS;
  int32 rcw[TM] = (rcwhc % AW)*stride_w;
  int32 rchc[TM] = rcwhc / AW;
  int32 rch[TM] = (rchc % AH)*stride_h;
  rxc = rcb + rcw*ABS + rch*ABS*AW;
  int32 offc0[TM, TN] = rxc[:, newaxis];)";
  }
  else{
  os << R"(
  int32 offc0[TM, TN] = rxc[:, newaxis];)";
  }
  os << R"("
  fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + offc0;
  int1 checkc0[TM] = rxc < M;
  int1 checkc1[TN] = ryc < N;
  int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)";
 if(ty_ == BPROP){
  os << R"(
  int32 rcwhc[TM] = rxc / ABS;
  int32 rcw[TM] = (rcwhc % AW)*stride_w;
  int32 rchc[TM] = rcwhc / AW;
  int32 rch[TM] = (rchc % AH)*stride_h;
  int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));
  int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));
  int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis];