ugh bug in shift-conv striding

This commit is contained in:
Philippe Tillet
2019-07-10 17:00:22 -07:00
parent f665c742f9
commit 4ca83f1935
5 changed files with 143 additions and 26 deletions

View File

@@ -6,10 +6,97 @@ import torch.nn.functional as F
import torch.optim as optim import torch.optim as optim
from torchvision import datasets, transforms from torchvision import datasets, transforms
import triton import triton
from torch.utils.cpp_extension import load
from torch.distributions import categorical
class Net(nn.Module): shift_cuda = load(
'shift_cuda', ['/home/philippe/development/shiftnet/kernels/shift_cuda.cpp',
'/home/philippe/development/shiftnet/kernels/shift_cuda_kernel.cu'], extra_cflags=['-O3'])
class shift(torch.autograd.Function):
    """Autograd bridge to the compiled ``shift_cuda`` extension.

    The forward pass delegates to ``shift_cuda.forward`` and the backward pass
    to ``shift_cuda.backward``. The shift tensor is carried between the two
    passes on the autograd context and never receives a gradient.
    """

    @staticmethod
    def forward(ctx, x, shift):
        """Run ``shift_cuda.forward(x, shift)`` and save ``shift`` for backward."""
        ctx.save_for_backward(shift)
        shifted = shift_cuda.forward(x, shift)
        return shifted

    @staticmethod
    def backward(ctx, grad_output):
        """Push ``grad_output`` through ``shift_cuda.backward``.

        Returns a 2-tuple matching the inputs of ``forward``: the gradient for
        ``x`` and ``None`` for ``shift``.
        """
        (offsets,) = ctx.saved_tensors
        grad_input = shift_cuda.backward(grad_output, offsets)
        return grad_input, None
class Shift(nn.Module):
    """Per-channel shift layer built on the ``shift`` autograd function.

    At construction time one pair of integer offsets is drawn per channel from
    a categorical distribution whose weights depend on ``kernel_size``. The
    sampled indices are re-centred so that every offset lies in
    ``[-(kernel_size // 2), kernel_size // 2]``. The result is stored in the
    ``shift_t`` buffer as an int32 tensor of shape ``(in_channels, 2)``.

    Args:
        in_channels: number of offset pairs to sample (stored as ``channels``).
        kernel_size: one of 3, 5, 7 or 9.

    Raises:
        RuntimeError: if ``kernel_size`` is not one of the supported sizes.
    """

    # Sampling weights per supported kernel size. For kernel_size == 3 all of
    # the mass sits on the centre tap, so every sampled offset is zero.
    _SHIFT_WEIGHTS = {
        3: [0., 1., 0.],
        5: [0.1, 0.25, 0.3, 0.25, 0.1],
        7: [0.075, 0.1, 0.175, 0.3, 0.175, 0.1, 0.075],
        9: [0.05, 0.075, 0.1, 0.175, 0.2, 0.175, 0.1, 0.075, 0.05],
    }

    def __init__(self, in_channels, kernel_size):
        super(Shift, self).__init__()
        self.channels = in_channels
        self.kernel_size = kernel_size
        weights = self._SHIFT_WEIGHTS.get(kernel_size)
        if weights is None:
            raise RuntimeError('Unsupported kernel size')
        p = torch.Tensor(weights)
        # Sampled indices are in [0, kernel_size); subtracting the half-width
        # centres them on zero.
        sampled = categorical.Categorical(p).sample((in_channels, 2))
        shift_t = sampled - (kernel_size // 2)
        # A buffer (not a parameter): it follows .to()/.cuda() and is saved in
        # the state dict, but is not returned by parameters().
        self.register_buffer('shift_t', shift_t.int())

    def forward(self, x):
        """Apply the stored per-channel offsets to ``x``.

        Raises:
            RuntimeError: if ``x`` is not a CUDA tensor. This used to be a
                ``print`` followed by ``assert False``, which disappears under
                ``python -O`` and made the call silently return ``None``.
        """
        if not x.is_cuda:
            raise RuntimeError('Shift only supports GPU for now..')
        return shift.apply(x, self.shift_t)

    def extra_repr(self):
        """Return the ``channels, kernel_size=...`` summary shown in ``repr``."""
        return '{}, kernel_size={}'.format(self.channels, self.kernel_size)
def ShiftConv2d(in_planes, out_planes, kernel_size=3, stride=1, groups=1, dilation=1):
    """Build a ``Shift`` layer followed by a bias-free 1x1 convolution.

    Args:
        in_planes: input channel count, used for both the shift layer and the
            convolution.
        out_planes: output channel count of the convolution.
        kernel_size: passed to ``Shift`` only; the convolution is always 1x1.
        stride: stride of the 1x1 convolution.
        groups: group count of the 1x1 convolution.
        dilation: accepted but not used by this function.

    Returns:
        An ``nn.Sequential`` holding the shift layer and then the convolution.
    """
    # Build the shift layer first so that random sampling happens in the same
    # order as module construction inside a single nn.Sequential(...) call.
    shift_layer = Shift(in_planes, kernel_size)
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        padding=0,
        groups=groups,
        bias=False,
    )
    return nn.Sequential(shift_layer, pointwise)
class NetReference(nn.Module):
    """Two shift-conv stages with batch norm, followed by two linear layers.

    Both convolution stages are ``triton.ShiftConv2d`` modules; the local
    ``ShiftConv2d`` alternatives are kept below as commented-out lines.
    """

    def __init__(self):
        super(NetReference, self).__init__()
        # Stage 1.
        # self.conv1 = ShiftConv2d(1, 32, 3, 2)
        self.conv1 = triton.ShiftConv2d(1, 32, 3, 2)
        self.bn1 = nn.BatchNorm2d(32)
        # Stage 2.
        # self.conv2a = ShiftConv2d(32, 32, 3, 1)
        self.conv2b = triton.ShiftConv2d(32, 32, 3, 2)
        # self.conv2b = ShiftConv2d(32, 32, 3, 2)
        self.bn2 = nn.BatchNorm2d(32)
        # Classifier head.
        self.fc1 = nn.Linear(32*7*7, 500)
        self.fc2 = nn.Linear(500, 10)

    def forward(self, x):
        """Return log-softmax scores over the 10 outputs of ``fc2``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2b, self.bn2),
        )
        for conv, bn in stages:
            # Move dim 0 to the end for the conv, then move it back for the
            # batch norm (presumably batch-first <-> batch-last; confirm
            # against the layout triton.ShiftConv2d expects).
            x = x.permute(1, 2, 3, 0).contiguous()
            x = conv(x)
            x = x.permute(3, 0, 1, 2).contiguous()
            x = F.relu(bn(x))
        flat = x.view(-1, 32*7*7)
        hidden = F.relu(self.fc1(flat))
        logits = self.fc2(hidden)
        return F.log_softmax(logits, dim=1)
class NetTriton(nn.Module):
def __init__(self):
super(NetTriton, self).__init__()
self.conv1 = triton.ShiftConv2d(1, 32, 3, 2) self.conv1 = triton.ShiftConv2d(1, 32, 3, 2)
self.bn1 = triton.BatchNorm2d(32) self.bn1 = triton.BatchNorm2d(32)
self.conv2 = triton.ShiftConv2d(32, 64, 3, 2) self.conv2 = triton.ShiftConv2d(32, 64, 3, 2)
@@ -23,6 +110,7 @@ class Net(nn.Module):
x = self.bn1(x) x = self.bn1(x)
x = F.relu(x) x = F.relu(x)
x = self.conv2(x) x = self.conv2(x)
x = self.bn2(x)
x = F.relu(x) x = F.relu(x)
x = x.permute(3, 0, 1, 2).contiguous() x = x.permute(3, 0, 1, 2).contiguous()
x = x.view(-1, 64*7*7) x = x.view(-1, 64*7*7)
@@ -30,6 +118,8 @@ class Net(nn.Module):
x = self.fc2(x) x = self.fc2(x)
return F.log_softmax(x, dim=1) return F.log_softmax(x, dim=1)
Net = NetReference()
def train(args, model, device, train_loader, optimizer, epoch): def train(args, model, device, train_loader, optimizer, epoch):
model.train() model.train()
for batch_idx, (data, target) in enumerate(train_loader): for batch_idx, (data, target) in enumerate(train_loader):
@@ -107,7 +197,7 @@ def main():
batch_size=args.test_batch_size, shuffle=True, **kwargs) batch_size=args.test_batch_size, shuffle=True, **kwargs)
model = Net().to(device) model = Net.to(device)
optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum) optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)
for epoch in range(1, args.epochs + 1): for epoch in range(1, args.epochs + 1):

View File

@@ -53,6 +53,8 @@ torch::Tensor shift_y(
int64_t R, int64_t S, int64_t R, int64_t S,
int64_t stride_h, int64_t stride_w, int64_t stride_h, int64_t stride_w,
const torch::Tensor shift_h, const torch::Tensor shift_w) { const torch::Tensor shift_h, const torch::Tensor shift_w) {
CHECK_INPUT(x);
CHECK_INPUT(w);
// shapes for a // shapes for a
int64_t Ca = x.size(0); int64_t Ca = x.size(0);
int64_t H = x.size(1); int64_t H = x.size(1);
@@ -76,6 +78,8 @@ torch::Tensor shift_dx(
int64_t R, int64_t S, int64_t R, int64_t S,
int64_t stride_h, int64_t stride_w, int64_t stride_h, int64_t stride_w,
const torch::Tensor shift_h, const torch::Tensor shift_w) { const torch::Tensor shift_h, const torch::Tensor shift_w) {
CHECK_INPUT(dy);
CHECK_INPUT(w);
// shapes for a // shapes for a
int64_t Ca = dy.size(0); int64_t Ca = dy.size(0);
int64_t H = dy.size(1); int64_t H = dy.size(1);
@@ -104,6 +108,8 @@ torch::Tensor shift_dw(
int64_t R, int64_t S, int64_t R, int64_t S,
int64_t stride_h, int64_t stride_w, int64_t stride_h, int64_t stride_w,
const torch::Tensor shift_h, const torch::Tensor shift_w) { const torch::Tensor shift_h, const torch::Tensor shift_w) {
CHECK_INPUT(dy);
CHECK_INPUT(x);
// shapes for a // shapes for a
int64_t F = dy.size(0); int64_t F = dy.size(0);
int64_t Ha = dy.size(1); int64_t Ha = dy.size(1);

View File

@@ -1,5 +1,6 @@
import torch import torch
import math import math
import numpy as np
from torch.nn.modules.utils import _single, _pair, _triple from torch.nn.modules.utils import _single, _pair, _triple
from torch.distributions import categorical from torch.distributions import categorical
@@ -117,11 +118,13 @@ class ShiftConvFunction(torch.autograd.Function):
shift_w = ctx.shift_w shift_w = ctx.shift_w
dx = dw = dbias = None dx = dw = dbias = None
if ctx.needs_input_grad[0]: if ctx.needs_input_grad[0]:
dx = torch.ops.triton.shift_conv_dx(dy, weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) dx = torch.ops.triton.shift_conv_dx(dy.contiguous(), weight, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
if ctx.needs_input_grad[1]: if ctx.needs_input_grad[1]:
dw = torch.ops.triton.shift_conv_dw(dy, input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w) dw = torch.ops.triton.shift_conv_dw(dy.contiguous(), input, bias, width[0], width[1], stride[0], stride[1], shift_h, shift_w)
if ctx.needs_input_grad[2]: if ctx.needs_input_grad[2]:
dbias = torch.sum(dy, (1, 2, 3)) dbias = torch.sum(dy, (1, 2, 3))
#print('dx', ctx.needs_input_grad[0], np.isnan(dx.cpu().numpy()).any())
#print('dw', ctx.needs_input_grad[1], np.isnan(dw.cpu().numpy()).any())
return dx, dw, dbias, None, None, None, None return dx, dw, dbias, None, None, None, None
@@ -149,7 +152,7 @@ class _ShiftConvNd(torch.nn.Module):
def make_shift(self, kernel_size): def make_shift(self, kernel_size):
if kernel_size == 3: if kernel_size == 3:
p = torch.Tensor([0.3, 0.4, 0.3]) p = torch.Tensor([0., 1., 0.])
elif kernel_size == 5: elif kernel_size == 5:
p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1]) p = torch.Tensor([0.1, 0.25, 0.3, 0.25, 0.1])
elif kernel_size == 7: elif kernel_size == 7:

View File

@@ -58,8 +58,8 @@ def blocksparse_matmul_grad(op, dy):
return (dx, dw) return (dx, dw)
def run_shift(): def run_shift():
B, C, H, W = 16, 16, 4, 4 B, C, H, W = 16, 1, 4, 4
R, S, F = 3, 3, 4 R, S, F = 3, 3, 32
stride_h, stride_w = 2, 2 stride_h, stride_w = 2, 2
np.random.seed(2) np.random.seed(2)
a = tf.placeholder(tf.float32, shape=[C, H, W, B]) a = tf.placeholder(tf.float32, shape=[C, H, W, B])
@@ -68,8 +68,8 @@ def run_shift():
hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32) hshift_w = np.random.randint(- (S//2), R//2 + 1, size=C, dtype=np.int32)
c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w)) c = module.shift_conv(a, b, stride_h=stride_h, stride_w=stride_w, shift_h=tf.make_tensor_proto(hshift_h), shift_w=tf.make_tensor_proto(hshift_w))
# feed values # feed values
ha = np.random.rand(C, H, W, B) ha = np.ones((C, H, W, B), dtype=np.float32)
hb = np.random.rand(C, F) hb = np.ones((C, F), dtype=np.float32)
sess = tf.InteractiveSession() sess = tf.InteractiveSession()
# test # test
grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B), grads = tf.test.compute_gradient([a, b], [(C, H, W, B), (C, F)], c, (F, H//stride_h, W//stride_w, B),
@@ -128,5 +128,5 @@ def run_batchnorm():
print(np.max(np.abs(dg_t - dg_n))) print(np.max(np.abs(dg_t - dg_n)))
print(np.max(np.abs(db_t - db_n))) print(np.max(np.abs(db_t - db_n)))
#run_shift() run_shift()
run_batchnorm() #run_batchnorm()

View File

@@ -158,7 +158,7 @@ void shift::enqueue_impl(driver::stream *stream, driver::kernel *kernel,
unsigned TM = ranges[0], TN = ranges[1]; unsigned TM = ranges[0], TN = ranges[1];
std::array<size_t, 3> grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1}; std::array<size_t, 3> grid = {(M_ + TM - 1)/TM, (N_ + TN - 1)/TN, 1};
if(ty_ == BPROP) if(ty_ == BPROP)
((driver::cu_buffer*)c)->set_zero(stream, M_*N_*4); ((driver::cu_buffer*)c)->set_zero(stream, M_*N_*stride_h_*stride_w_*4);
stream->enqueue(kernel, grid, {nthreads, 1, 1}); stream->enqueue(kernel, grid, {nthreads, 1, 1});
} }
@@ -217,6 +217,7 @@ void shift(restrict read_only align(16) )" << a_ty_ << R"( *a,
if(ty_ == FPROP){ if(ty_ == FPROP){
os << R"( os << R"(
int32 rawhc[TM] = rxa / ABS; int32 rawhc[TM] = rxa / ABS;
int32 rab[TM] = rxa % ABS;
int32 raw[TM] = (rawhc % AW)*stride_w; int32 raw[TM] = (rawhc % AW)*stride_w;
int32 rahc[TM] = rawhc / AW; int32 rahc[TM] = rawhc / AW;
int32 rah[TM] = (rahc % AH)*stride_h; int32 rah[TM] = (rahc % AH)*stride_h;
@@ -227,26 +228,32 @@ if(ty_ == FPROP){
int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis]; int1 interior[TM, TK] = interiorh[:, newaxis] && interiorw[:, newaxis];
int32 inc_true[TM, TK] = d[newaxis, :]; int32 inc_true[TM, TK] = d[newaxis, :];
int32 inc_false[TM, TK] = rka[newaxis, :] * lda; int32 inc_false[TM, TK] = rka[newaxis, :] * lda;
int32 inc[TM, TK] = interior ? inc_true : inc_false;)"; int32 inc[TM, TK] = interior ? inc_true : inc_false;
rxa = rab + raw*ABS + rah*ABS*AW;
int32 offa0[TM, TK] = rxa[:, newaxis];)";
}
else{
os << " int32 offa0[" << AS << "] = rxa" << bca1 << lda1 << ";" << std::endl;
} }
if(ty_ == WGRAD){ if(ty_ == WGRAD){
os << R"( os << R"(
__constant__ int32* pd[TN] = delta + ryb; __constant__ int32* pd[TN] = delta + ryb;
int32 d[TN] = *pd; int32 d[TN] = *pd;
int32 shift[TK, TN] = d[newaxis, :];)"; int32 shift[TK, TN] = d[newaxis, :];
int32 rbwhc[TK] = rkb / ABS;
int32 rbw[TK] = (rbwhc % AW)*stride_w;
int32 rbhc[TK] = rbwhc / AW;
int32 rbh[TK] = (rbhc % AH)*stride_h;
)";
} }
os << R"( os << R"(
)" << a_ty_ << "* pa[" << AS << "] = a + rxa" << bca1 << lda1 << " + " << rka << bca0 << lda0 << R"(; )" << a_ty_ << "* pa[" << AS << "] = a + offa0 + " << rka << bca0 << lda0 << R"(;
)" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(; )" << b_ty_ << "* pb[" << BS << "] = b + ryb" << bcb1 << ldb1 << " + " << rkb << bcb0 << ldb0 << R"(;
int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(; int1 checka[)" << AS << "] = (rka < K)" << bca0 << R"(;
int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(; int1 checkb[)" << BS << "] = (rkb < K)" << bcb0 << R"(;
)" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0;)"; )" << a_ty_ << " a[" << AS << R"(] = checka ? *pa : 0;)";
if(ty_ == WGRAD){ if(ty_ == WGRAD){
os << R"( os << R"(
int32 rbwhc[TK] = rkb / ABS;
int32 rbw[TK] = (rbwhc % AW)*stride_w;
int32 rbhc[TK] = rbwhc / AW;
int32 rbh[TK] = (rbhc % AH)*stride_h;
int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h)); int1 interiorh[TK] = (rbh >= pad_h) && (rbh < (AH - pad_h));
int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w)); int1 interiorw[TK] = (rbw >= pad_w) && (rbw < (AW - pad_w));
int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; int1 interior[TK, TN] = interiorh[:, newaxis] && interiorw[:, newaxis];
@@ -301,17 +308,28 @@ else{
os << R"( os << R"(
} }
int32 rxc[TM] = get_global_range[TM](0); int32 rxc[TM] = get_global_range[TM](0);
int32 ryc[TN] = get_global_range[TN](1); int32 ryc[TN] = get_global_range[TN](1);)";
fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + rxc[:, newaxis]; if(ty_ == BPROP){
os << R"(
int32 rcwhc[TM] = rxc / ABS;
int32 rcb[TM] = rxc % ABS;
int32 rcw[TM] = (rcwhc % AW)*stride_w;
int32 rchc[TM] = rcwhc / AW;
int32 rch[TM] = (rchc % AH)*stride_h;
rxc = rcb + rcw*ABS + rch*ABS*AW;
int32 offc0[TM, TN] = rxc[:, newaxis];)";
}
else{
os << R"(
int32 offc0[TM, TN] = rxc[:, newaxis];)";
}
os << R"("
fp32* pc[TM, TN] = c + ryc[newaxis, :]*M + offc0;
int1 checkc0[TM] = rxc < M; int1 checkc0[TM] = rxc < M;
int1 checkc1[TN] = ryc < N; int1 checkc1[TN] = ryc < N;
int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)"; int1 checkc[TM, TN] = checkc0[:, newaxis] && checkc1[newaxis, :];)";
if(ty_ == BPROP){ if(ty_ == BPROP){
os << R"( os << R"(
int32 rcwhc[TM] = rxc / ABS;
int32 rcw[TM] = (rcwhc % AW)*stride_w;
int32 rchc[TM] = rcwhc / AW;
int32 rch[TM] = (rchc % AH)*stride_h;
int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h)); int1 interiorh[TM] = (rch >= pad_h) && (rch < (AH - pad_h));
int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w)); int1 interiorw[TM] = (rcw >= pad_w) && (rcw < (AW - pad_w));
int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis]; int1 interior[TM, TN] = interiorh[:, newaxis] && interiorw[:, newaxis];