[dnn/blocksparse/dot] prototype version seems to pass the basic test

This commit is contained in:
Philippe Tillet
2019-07-27 21:21:36 -07:00
parent 2a377bc8b1
commit 17cb2db356
18 changed files with 402 additions and 205 deletions

View File

@@ -5,7 +5,7 @@ if(${TensorFlow_FOUND})
include_directories("${CUDA_HOME}/include")
link_directories(${TF_LIB})
add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI})
add_library(tf_blocksparse SHARED dot.cpp conv.cpp shift.cpp batchnorm.cpp)
add_library(tf_blocksparse SHARED blocksparse.cpp dot.cpp conv.cpp shift.cpp batchnorm.cpp)
target_link_libraries(tf_blocksparse tensorflow_framework triton)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py
${CMAKE_CURRENT_BINARY_DIR}/run.py

View File

@@ -3,7 +3,8 @@
#include "triton/driver/buffer.h"
#include "triton/driver/backend.h"
#include "triton/driver/stream.h"
#include "triton/jit.h"
#include "triton/runtime/jit.h"
#include "triton/dnn/blocksparse/dot.h"
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/op.h"
@@ -20,106 +21,88 @@ using shape_inference::InferenceContext;
using shape_inference::ShapeHandle;
using GPUDevice = Eigen::GpuDevice;
// Kernel source text kept as a raw string literal. As written it declares four
// tunable int32 constants (TM, TN, TK, GZ) and a `bsmm` function with an empty
// body.
// NOTE(review): no line visible in this part of the file reads `src` — confirm
// whether it is still needed.
const char* src =
R"(
const tunable int32 TM = {16, 32, 64, 128};
const tunable int32 TN = {16, 32, 64, 128};
const tunable int32 TK = {8};
const tunable int32 GZ = {1};
void bsmm (restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
int32 M, int32 N, int32 K,
int32 lda, int32 ldb, int32 ldc,
int32 *locks, int32 grid0, int32 grid1) {
}
)";
// Shape-inference function used by the blocksparse matmul op registrations.
//
// Output 0 gets the shape of input 0 with the dimension at index `axis`
// replaced by the value of the `K` attribute. When the rank reported for
// input 0 is not positive, output 0 is reported as an unknown shape instead.
// Output 1 is always reported as an unknown shape.
//
// Returns the error produced while reading the "K" or "axis" attribute, if
// any; otherwise Status::OK().
Status XpropShape(InferenceContext* ctx)
{
    int K;
    TF_RETURN_IF_ERROR(ctx->GetAttr("K", &K));
    int axis;
    TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis));

    // C ==> K
    ShapeHandle x = ctx->input(0);
    int rank = ctx->Rank(x);
    if (rank > 0)
    {
        std::vector<DimensionHandle> shape;
        shape.reserve(rank);
        // Copy every dimension of x except `axis`, which becomes K.
        for (int i = 0; i < rank; i++)
            shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i));
        ctx->set_output(0, ctx->MakeShape(shape));
    }
    else
        ctx->set_output(0, ctx->UnknownShape());
    ctx->set_output(1, ctx->UnknownShape());
    return Status::OK();
}
// Registers the "BlocksparseMatmul" op.
// Inputs: x and w of type T, three int64 inputs (lut, lut_dx, lut_dw) and a
// list of `ngate` float tensors named gate. Outputs: y of type T and an int32
// tensor named temp. Output shapes are computed by XpropShape.
// NOTE(review): the Doc text below calls the operands "a" and "b" while the
// inputs are declared as "x" and "w" — confirm the intended naming.
REGISTER_OP("BlocksparseMatmul")
.Input("x: T")
.Input("w: T")
.Input("lut: int64")
.Input("lut_dx: int64")
.Input("lut_dw: int64")
.Input("gate: ngate * float")
.Output("y: T")
.Output("temp: int32")
.Attr("T: {half, float, bfloat16}")
.Attr("blocks: int >=0")
.Attr("bsize: int")
.Attr("segments: int = 0")
.Attr("segments_dx: int = 0")
.Attr("locks: int = 0")
.Attr("locks_dx: int = 0")
.Attr("axis: int = 1")
.Attr("C: int >=0")
.Attr("K: int >=0")
.Attr("shared: int = 0")
.Attr("shared_dx: int = 0")
.Attr("alpha: float = 1.0")
.Attr("beta: float = 0.0")
.Attr("gated_dw: bool = false")
.Attr("gate_grad: bool = false")
.Attr("bench: int = 0")
.Attr("ngate: int >= 0")
.SetShapeFn(XpropShape)
.Doc(R"doc(
Multiply the matrix "a" by the blocksparse matrix "b".
)doc");
// Registers the "TritonBlocksparseMatmul" op.
// Inputs: x and w of type T, three int64 inputs (lut, lut_dx, lut_dw) and a
// list of `ngate` float tensors named gate. Outputs: y of type T and an int32
// tensor named temp. Output shapes are computed by XpropShape.
// NOTE(review): the Doc text below calls the operands "a" and "b" while the
// inputs are declared as "x" and "w" — confirm the intended naming.
REGISTER_OP("TritonBlocksparseMatmul")
.Input("x: T")
.Input("w: T")
.Input("lut: int64")
.Input("lut_dx: int64")
.Input("lut_dw: int64")
.Input("gate: ngate * float")
.Output("y: T")
.Output("temp: int32")
.Attr("T: {half, float, bfloat16}")
.Attr("blocks: int >=0")
.Attr("bsize: int")
.Attr("segments: int = 0")
.Attr("segments_dx: int = 0")
.Attr("locks: int = 0")
.Attr("locks_dx: int = 0")
.Attr("axis: int = 1")
.Attr("C: int >=0")
.Attr("K: int >=0")
.Attr("shared: int = 0")
.Attr("shared_dx: int = 0")
.Attr("alpha: float = 1.0")
.Attr("beta: float = 0.0")
.Attr("gated_dw: bool = false")
.Attr("gate_grad: bool = false")
.Attr("bench: int = 0")
.Attr("ngate: int >= 0")
.SetShapeFn(XpropShape)
.Doc(R"doc(
Multiply the matrix "a" by the blocksparse matrix "b".
)doc");
// Plain parameter bundle for the blocksparse matmul op. Each member is
// declared exactly once; no member has a default initializer, so values are
// indeterminate until the owner assigns them.
// NOTE(review): the meaning of members without a comment is not established
// by the code visible here — confirm against the kernel that consumes them.
typedef struct bsmm_params
{
    const int* Lut;
    const float* Gate;
    int* Lock;
    int blocks;
    int bsize;
    int segments;   // read from the op attribute "segments" by BlocksparseMatmulOp
    int locks;      // read from the op attribute "locks" by BlocksparseMatmulOp
    int C;          // passed to triton::dnn::blocksparse::dot in Compute
    int K;          // size of the output along `axis`; also passed to dot
    int N;
    int shared;
    int pcount;
    uint blk_a;
    uint blk_A;
    uint blk_b;
    uint blk_B;
    float alpha;
    float beta;
    CUstream stream;
} bsmm_params;
class BlocksparseMatmulOp : public OpKernel {
public:
public:
explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", &params_.segments));
OP_REQUIRES_OK(ctx, ctx->GetAttr("locks", &params_.locks ));
@@ -147,6 +130,51 @@ class BlocksparseMatmulOp : public OpKernel {
}
// Executes the op once: allocates both outputs and enqueues a Triton
// blocksparse dot on the stream of the current GPU device.
//
// Inputs read: 0 (a), 1 (b) and 2 (lut). Output 0 has the shape of `a` with
// the dimension at `axis_` replaced by `params_.K`. Output 1 is the locks
// tensor: one dimension of size gridN * params_.locks * 2 when
// params_.locks > 0, otherwise a shape with no dimensions.
//
// Returns early (through OP_REQUIRES_OK) if either output allocation fails.
void Compute(OpKernelContext* context){
    // get device/stream
    GPUDevice device = context->eigen_device<GPUDevice>();
    // NOTE(review): the `false` flag presumably means "do not take ownership"
    // of the wrapped handle — confirm against the triton::driver API.
    triton::driver::cu_stream sstream(device.stream(), false);
    triton::driver::context* ctx = sstream.context();
    triton::driver::stream* stream = &sstream;
    // get inputs
    const Tensor& a = context->input(0);
    const Tensor& b = context->input(1);
    const Tensor& lut = context->input(2);
    // allocate c; N accumulates the product of every dimension of a except axis_
    TensorShape shape_c;
    int N = 1;
    const int rank_a = a.dims();
    for (int i = 0; i < rank_a; i++) {
        if (i != axis_) {
            shape_c.AddDim(a.dim_size(i));
            N *= a.dim_size(i);
        }
        else
            shape_c.AddDim(params_.K);
    }
    Tensor* c = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c));
    // grid size along N: 128 columns per block by default, 64 when the
    // condition below holds
    int gridN = (N + 127)/128;
    const int modN128 = N & 127;
    if (axis_ == 1 || (modN128 > 0 && modN128 <= 64) || gridN * params_.segments < SMs_*4)
        gridN = (N + 63)/64;
    // allocate locks
    Tensor* locks = nullptr;
    TensorShape shape_l;
    if (params_.locks > 0)
        shape_l.AddDim(gridN * params_.locks * 2);
    OP_REQUIRES_OK(context, context->allocate_output(1, shape_l, &locks));
    // initialize default compute device
    triton::runtime::jit jit(ctx);
    // wrap the tensor storage as driver buffers (all tensors are read as float,
    // the look-up input as int64)
    triton::driver::cu_buffer da(ctx, reinterpret_cast<CUdeviceptr>(a.flat<float>().data()), false);
    triton::driver::cu_buffer db(ctx, reinterpret_cast<CUdeviceptr>(b.flat<float>().data()), false);
    triton::driver::cu_buffer dc(ctx, reinterpret_cast<CUdeviceptr>(c->flat<float>().data()), false);
    // The locks tensor is allocated above but is not handed to the kernel yet:
    // triton::driver::cu_buffer dlocks(ctx, (CUdeviceptr)locks->flat<int32>().data(), false);
    triton::driver::cu_buffer dlut(ctx, reinterpret_cast<CUdeviceptr>(lut.flat<int64>().data()), false);
    // blocksparse matmul
    triton::dnn::blocksparse::dot dot(N, params_.K, params_.C);
    dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING);
}
private:
@@ -157,4 +185,4 @@ private:
char bench_string_[256];
};
// Bind BlocksparseMatmulOp as the GPU kernel for both op names. Only the
// T = float instantiation is registered in the lines visible here, although
// the op definitions also list half and bfloat16 for T.
REGISTER_KERNEL_BUILDER(Name("BlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint<float>("T"), BlocksparseMatmulOp);
REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint<float>("T"), BlocksparseMatmulOp);

View File

@@ -5,7 +5,6 @@
#include "triton/driver/stream.h"
#include "triton/runtime/jit.h"
#include "triton/tools/bench.hpp"
#include "triton/dnn/gemm.h"
#include "triton/dnn/conv.h"
#define EIGEN_USE_GPU

View File

@@ -5,7 +5,7 @@
#include "triton/driver/stream.h"
#include "triton/runtime/jit.h"
#include "triton/tools/bench.hpp"
#include "triton/dnn/gemm.h"
#include "triton/dnn/dot.h"
#define EIGEN_USE_GPU
#include "tensorflow/core/framework/op.h"

View File

@@ -19,10 +19,10 @@
using namespace tensorflow;
using GPUDevice = Eigen::GpuDevice;
template<triton::dnn::shift::op_t OP>
template<triton::dnn::op_t OP>
class ShiftConvOp : public OpKernel {
public:
explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::shift::NCHW) {
explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::NCHW) {
context->GetAttr("shift_h", &h_shift_h_);
context->GetAttr("shift_w", &h_shift_w_);
context->GetAttr("stride_h", &stride_h_);
@@ -32,13 +32,13 @@ public:
}
void ExtractShapes(const Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B) {
if(layout_ == triton::dnn::shift::CHWN){
if(layout_ == triton::dnn::CHWN){
C = x.dim_size(0);
H = x.dim_size(1);
W = x.dim_size(2);
B = x.dim_size(3);
}
else if(layout_ == triton::dnn::shift::NCHW){
else if(layout_ == triton::dnn::NCHW){
B = x.dim_size(0);
C = x.dim_size(1);
H = x.dim_size(2);
@@ -52,7 +52,7 @@ public:
void FillShapes(OpKernelContext* context,
int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F,
const Tensor& tf_a, const Tensor& tf_b) {
if(OP == triton::dnn::shift::WGRAD) {
if(OP == triton::dnn::WGRAD) {
int64_t Ha, Wa, Ba;
int64_t Hb, Wb, Bb;
ExtractShapes(tf_a, F, Ha, Wa, Ba);
@@ -68,19 +68,19 @@ public:
// shapes for a
int64_t Ca;
ExtractShapes(tf_a, Ca, H, W, B);
if(OP == triton::dnn::shift::BPROP){
if(OP == triton::dnn::BPROP){
H *= stride_h_;
W *= stride_w_;
}
// shapes for b
int64_t Cb = tf_b.dim_size(0);
F = tf_b.dim_size(1);
if(OP == triton::dnn::shift::BPROP)
if(OP == triton::dnn::BPROP)
std::swap(Cb, F);
// checks
OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels"));
C = Ca;
if(OP == triton::dnn::shift::BPROP)
if(OP == triton::dnn::BPROP)
std::swap(C, F);
}
}
@@ -122,7 +122,7 @@ public:
triton::driver::cu_buffer da(ctx, (CUdeviceptr)tf_a.flat<Eigen::half>().data(), false);
triton::driver::cu_buffer db(ctx, (CUdeviceptr)tf_b.flat<Eigen::half>().data(), false);
triton::driver::cu_buffer dc(ctx, (CUdeviceptr)tf_c->flat<Eigen::half>().data(), false);
shift.enqueue(stream, {&da, &db, &dc}, false);
shift.enqueue(stream, {&da, &db, &dc}, triton::dnn::PARTIAL_TUNING);
}
private:
@@ -132,10 +132,10 @@ private:
int stride_w_;
int R_;
int S_;
triton::dnn::shift::layout_t layout_;
triton::dnn::layout_t layout_;
};
REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::FPROP>);
REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::FPROP>);
REGISTER_OP("ShiftConv")
.Input("a: float16")
.Input("b: float16")
@@ -145,7 +145,7 @@ REGISTER_OP("ShiftConv")
.Attr("stride_w: int")
.Output("c: float16");
REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::BPROP>);
REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::BPROP>);
REGISTER_OP("ShiftConvDx")
.Input("a: float16")
.Input("b: float16")
@@ -155,7 +155,7 @@ REGISTER_OP("ShiftConvDx")
.Attr("stride_w: int")
.Output("c: float16");
REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::WGRAD>);
REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::WGRAD>);
REGISTER_OP("ShiftConvDw")
.Input("a: float16")
.Input("b: float16")