[dnn] better specification of recompilation key

Philippe Tillet
2019-08-02 17:42:48 -07:00
parent 3b92ddf7e6
commit d9945692a9
31 changed files with 418 additions and 428 deletions

View File

@@ -58,7 +58,7 @@ public:
triton::driver::cu_buffer m(ctx, fw_m->tensor_data().size(), (CUdeviceptr)fw_m->tensor_data().data(), false);
triton::driver::cu_buffer v(ctx, fw_v->tensor_data().size(), (CUdeviceptr)fw_v->tensor_data().data(), false);
// create config
triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "fp32");
triton::dnn::batchnorm_forward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING);
batchnorm.enqueue(stream, {&y, &m, &v, &x, &g, &b});
}
@@ -126,7 +126,7 @@ public:
triton::driver::cu_buffer dg(ctx, fw_dg->tensor_data().size(), (CUdeviceptr)fw_dg->tensor_data().data(), false);
triton::driver::cu_buffer db(ctx, fw_db->tensor_data().size(), (CUdeviceptr)fw_db->tensor_data().data(), false);
// create config
triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "fp32");
triton::dnn::batchnorm_backward batchnorm(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING);
batchnorm.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v});
}
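
Taken together, these two hunks rename the batchnorm data type from "fp32" to the canonical "float" name and make the tuning level an explicit part of the op's recompilation key. A minimal sketch of the resulting call pattern, assuming the CUDA stream and the driver-buffer wrappers are set up as in the surrounding TensorFlow kernels:

// forward pass: the type name and tuning level are now spelled out in the key
triton::dnn::batchnorm_forward fwd(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING);
fwd.enqueue(stream, {&y, &m, &v, &x, &g, &b});

// backward pass uses the same key convention
triton::dnn::batchnorm_backward bwd(C, 1, H, W, B, "float", triton::dnn::FULL_TUNING);
bwd.enqueue(stream, {&dx, &dg, &db, &dy, &x, &g, &m, &v});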

View File

@@ -128,9 +128,9 @@ public:
triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false);
triton::driver::cu_buffer dlut(ctx, lut.tensor_data().size(), (CUdeviceptr)lut.tensor_data().data(), false);
// create profile
triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "fp16", params_.bsize, params_.locks, params_.blocks, OP);
triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C, "half", params_.bsize, params_.locks, params_.blocks, OP);
// blocksparse matmul
triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::FULL_TUNING);
triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING);
triton::driver::buffer* locks_buffer = ((triton::dnn::blocksparse::dot*)op)->get_locks();
Tensor *tmp = nullptr;
TensorShape tmp_shapes;
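
Here the tuning level moves to the enqueue call instead: the profile keeps the canonical "half" type name, but the kernel is launched with NO_TUNING rather than FULL_TUNING. A minimal sketch of that flow, assuming da, db, dc, and dlut are driver buffers prepared as in the lines above:

// build the blocksparse matmul profile with the canonical "half" type name
triton::dnn::blocksparse::dot dot(N, params_.K, params_.segments, params_.C,
                                  "half", params_.bsize, params_.locks, params_.blocks, OP);
// enqueue with the NO_TUNING level; a pointer to the underlying op is returned
triton::dnn::base* op = dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING);
// the concrete op exposes its lock buffer through get_locks()
triton::driver::buffer* locks = ((triton::dnn::blocksparse::dot*)op)->get_locks();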

View File

@@ -61,7 +61,7 @@ public:
stride_d, stride_h, stride_w,
pad_d, pad_h, pad_w,
1, 1, 1,
"fp16", "fp16",
"half", "half",
triton::dnn::conv::FPROP, has_bias);
// allocate output
auto c_shapes = conv.c_shapes();

View File

@@ -49,7 +49,7 @@ class DotOp : public OpKernel {
triton::driver::cu_buffer db(ctx, b.tensor_data().size(), (CUdeviceptr)b.tensor_data().data(), false);
triton::driver::cu_buffer dc(ctx, c->tensor_data().size(), (CUdeviceptr)c->tensor_data().data(), false);
// template
triton::dnn::dot dot(M, N, K, false, false, "fp16", "fp16", 8, 8);
triton::dnn::dot dot(M, N, K, false, false, "half", "half", 8, 8, 8);
dot.enqueue(stream, {&da, &db, &dc});
}
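
The dense dot template follows the same renaming and gains a third trailing argument next to the existing two; the hunk does not say what the trailing values configure, so they are reproduced as-is. A minimal sketch of the updated launch, assuming M, N, K and the three device buffers are prepared as in the op above:

// non-transposed A and B, half-precision operands, trailing parameters as in the diff
triton::dnn::dot dot(M, N, K, false, false, "half", "half", 8, 8, 8);
// no explicit tuning level is passed here, so the default enqueue behaviour applies
dot.enqueue(stream, {&da, &db, &dc});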

View File

@@ -105,7 +105,7 @@ def batch_norm_grad(op, dy, mean, var):
def run_batchnorm():
-C, H, W, B = 32, 14, 14, 64
+C, H, W, B = 8, 4, 4, 32
np.random.seed(0)
# Placeholders
x = tf.placeholder(tf.float32, shape=[C, H, W, B])
@@ -131,6 +131,6 @@ def run_batchnorm():
print(np.max(np.abs(dg_t - dg_n)))
print(np.max(np.abs(db_t - db_n)))
-run_dot()
+#run_dot()
#run_shift()
-#run_batchnorm()
+run_batchnorm()

View File

@@ -106,7 +106,7 @@ public:
triton::dnn::shift shift(B, C, D, H, W, T, R_, S_, F,
stride_h_, stride_w_,
shift_h_data, shift_w_data,
"fp16", "fp16", OP, has_bias, layout_);
"half", "half", OP, has_bias, layout_);
// shapes for c
std::vector<int64> c_shapes;