[dnn/blocksparse/dot] prototype version seems to pass basic test

2019-07-27 21:21:36 -07:00
parent 2a377bc8b1
commit 17cb2db356
18 changed files with 402 additions and 205 deletions
--- a/examples/python/tensorflow/CMakeLists.txt
+++ b/examples/python/tensorflow/CMakeLists.txt
@@ -5,7 +5,7 @@ if(${TensorFlow_FOUND})
  include_directories("${CUDA_HOME}/include")
  link_directories(${TF_LIB})
  add_definitions(-D_GLIBCXX_USE_CXX11_ABI=${TF_ABI})
-  add_library(tf_blocksparse SHARED dot.cpp conv.cpp shift.cpp batchnorm.cpp)
+  add_library(tf_blocksparse SHARED blocksparse.cpp dot.cpp conv.cpp shift.cpp batchnorm.cpp)
  target_link_libraries(tf_blocksparse tensorflow_framework triton)
  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/run.py
                  ${CMAKE_CURRENT_BINARY_DIR}/run.py
--- a/examples/python/tensorflow/blocksparse.cpp
+++ b/examples/python/tensorflow/blocksparse.cpp
@@ -3,7 +3,8 @@
 #include "triton/driver/buffer.h"
 #include "triton/driver/backend.h"
 #include "triton/driver/stream.h"
-#include "triton/jit.h"
+#include "triton/runtime/jit.h"
+#include "triton/dnn/blocksparse/dot.h"

 #define EIGEN_USE_GPU
 #include "tensorflow/core/framework/op.h"
@@ -20,106 +21,88 @@ using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
 using GPUDevice = Eigen::GpuDevice;

-
-const char* src =
-R"(
-const tunable int32 TM = {16, 32, 64, 128};
-const tunable int32 TN = {16, 32, 64, 128};
-const tunable int32 TK = {8};
-const tunable int32 GZ = {1};
-
-void bsmm (restrict read_only fp32 *A, restrict read_only fp32 *B, fp32 *C,
-           int32 M, int32 N, int32 K,
-           int32 lda, int32 ldb, int32 ldc,
-           int32 *locks, int32 grid0, int32 grid1) {
-
-}
-)";
-
 Status XpropShape(InferenceContext* ctx)
 {
-    int    K; TF_RETURN_IF_ERROR(ctx->GetAttr(   "K",    &K));
-    int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis));
+  int    K; TF_RETURN_IF_ERROR(ctx->GetAttr(   "K",    &K));
+  int axis; TF_RETURN_IF_ERROR(ctx->GetAttr("axis", &axis));

-    // C ==> K
-    ShapeHandle x = ctx->input(0);
-    int rank = ctx->Rank(x);
-    //printf("XpropShape: %d\n", rank);
-    if (rank > 0)
-    {
-        std::vector<DimensionHandle> shape;
-        shape.reserve(rank);
-        for (int i = 0; i < rank; i++)
-            shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i));
-
-        ctx->set_output(0, ctx->MakeShape(shape));
-    }
-    else
-        ctx->set_output(0, ctx->UnknownShape());
-    ctx->set_output(1, ctx->UnknownShape());
-    return Status::OK();
+  // output 0 copies the shape of input 0, except that dimension `axis` becomes K
+  ShapeHandle x = ctx->input(0);
+  int rank = ctx->Rank(x);
+  // only a positive rank is rebuilt dimension by dimension; otherwise output 0 stays unknown
+  if (rank > 0)
+  {
+    std::vector<DimensionHandle> shape;
+    shape.reserve(rank);
+    for (int i = 0; i < rank; i++)
+      shape.push_back(i == axis ? ctx->MakeDim(K) : ctx->Dim(x, i));
+    ctx->set_output(0, ctx->MakeShape(shape));
+  }
+  else
+    ctx->set_output(0, ctx->UnknownShape());
+  ctx->set_output(1, ctx->UnknownShape());  // output 1 is never given a static shape
+  return Status::OK();
 }


-REGISTER_OP("BlocksparseMatmul")
-    .Input("x: T")
-    .Input("w: T")
-    .Input("lut: int64")
-    .Input("lut_dx: int64")
-    .Input("lut_dw: int64")
-    .Input("gate: ngate * float")
-    .Output("y: T")
-    .Output("temp: int32")
-    .Attr("T: {half, float, bfloat16}")
-    .Attr("blocks: int >=0")
-    .Attr("bsize: int")
-    .Attr("segments: int = 0")
-    .Attr("segments_dx: int = 0")
-    .Attr("locks: int = 0")
-    .Attr("locks_dx: int = 0")
-    .Attr("axis: int = 1")
-    .Attr("C: int >=0")
-    .Attr("K: int >=0")
-    .Attr("shared: int = 0")
-    .Attr("shared_dx: int = 0")
-    .Attr("alpha: float = 1.0")
-    .Attr("beta: float = 0.0")
-    .Attr("gated_dw: bool = false")
-    .Attr("gate_grad: bool = false")
-    .Attr("bench: int = 0")
-    .Attr("ngate: int >= 0")
-    .SetShapeFn(XpropShape)
-    .Doc(R"doc(
-Multiply the matrix "a" by the blocksparse matrix "b".
-)doc");
+REGISTER_OP("TritonBlocksparseMatmul")
+.Input("x: T")
+.Input("w: T")
+.Input("lut: int64")
+.Input("lut_dx: int64")
+.Input("lut_dw: int64")
+.Input("gate: ngate * float")
+.Output("y: T")
+.Output("temp: int32")
+.Attr("T: {half, float, bfloat16}")
+.Attr("blocks: int >=0")
+.Attr("bsize: int")
+.Attr("segments: int = 0")
+.Attr("segments_dx: int = 0")
+.Attr("locks: int = 0")
+.Attr("locks_dx: int = 0")
+.Attr("axis: int = 1")
+.Attr("C: int >=0")
+.Attr("K: int >=0")
+.Attr("shared: int = 0")
+.Attr("shared_dx: int = 0")
+.Attr("alpha: float = 1.0")
+.Attr("beta: float = 0.0")
+.Attr("gated_dw: bool = false")
+.Attr("gate_grad: bool = false")
+.Attr("bench: int = 0")
+.Attr("ngate: int >= 0")
+.SetShapeFn(XpropShape)  // the raw string below is emitted verbatim: keep its text flush-left
+.Doc(R"doc(
+Multiply the matrix "a" by the blocksparse matrix "b".
+)doc");


 typedef struct bsmm_params
 {
-    const int* Lut;
-    const float* Gate;
-    int* Lock;
-    //float4* Scratch;
-    int blocks;
-    int bsize;
-    int segments;
-    int locks;
-    int C;
-    int K;
-    int N;
-    int shared;
-    int pcount;
-    uint blk_a;
-    uint blk_A;
-    uint blk_b;
-    uint blk_B;
-    float alpha;
-    float beta;
-    CUstream stream;
+  const int* Lut;
+  const float* Gate;
+  int* Lock;
+  int blocks;
+  int bsize;
+  int segments;  // read from the "segments" attr; used by Compute's block-size heuristic
+  int locks;  // read from the "locks" attr; scales the lock tensor allocated in Compute
+  int C;  // forwarded to triton::dnn::blocksparse::dot in Compute
+  int K;  // size of the output along axis_ in Compute; also forwarded to blocksparse::dot
+  int N;
+  int shared;
+  int pcount;
+  uint blk_a;
+  uint blk_A;
+  uint blk_b;
+  uint blk_B;
+  float alpha;
+  float beta;
+  CUstream stream;
 } bsmm_params;

 class BlocksparseMatmulOp : public OpKernel {
- public:
+public:
  explicit BlocksparseMatmulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("segments", &params_.segments));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("locks",    &params_.locks   ));
@@ -147,6 +130,51 @@ class BlocksparseMatmulOp : public OpKernel {
  }

  void Compute(OpKernelContext* context){
+    // get device/stream: wrap the Eigen GPU device's stream in triton driver objects
+    GPUDevice device =  context->eigen_device<GPUDevice>();
+    triton::driver::cu_stream sstream(device.stream(), false);
+    triton::driver::context* ctx = sstream.context();
+    triton::driver::stream* stream = &sstream;
+    // get inputs: indices 0, 1, 2 are "x", "w" and "lut" in the op registration
+    const Tensor& a = context->input(0);
+    const Tensor& b = context->input(1);
+    const Tensor& lut = context->input(2);
+    // allocate c: shape of a with dimension axis_ replaced by K; N = product of a's other dimensions
+    TensorShape shape_c;
+    int N     = 1;
+    int rank_a = a.dims();
+    for (int i = 0; i < rank_a; i++)
+      if (i != axis_) {
+        shape_c.AddDim(a.dim_size(i));
+        N *= a.dim_size(i);
+      }
+      else
+        shape_c.AddDim(params_.K);
+    Tensor* c = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, shape_c, &c));
+    // grid and block: start from 128-wide blocks along N, fall back to 64-wide ones
+    int blkN = 128, gridN = (N + 127)/128, modN128 = N & 127;
+    if (axis_ == 1 || (modN128 > 0 && modN128 <= 64) || gridN * params_.segments < SMs_*4){
+      blkN  = 64;
+      gridN = (N + 63)/64;
+    }
+    // allocate locks (output 1); shape_l stays rank-0 when params_.locks == 0
+    Tensor* locks = nullptr;
+    TensorShape shape_l;
+    if (params_.locks > 0)
+      shape_l.AddDim(gridN * params_.locks * 2);
+    OP_REQUIRES_OK(context, context->allocate_output(1, shape_l, &locks));
+    // initialize default compute device
+    triton::runtime::jit jit(ctx);
+    // matrix multiplication parameters: non-owning views over the TensorFlow buffers
+    triton::driver::cu_buffer da(ctx, (CUdeviceptr)a.flat<float>().data(), false);
+    triton::driver::cu_buffer db(ctx, (CUdeviceptr)b.flat<float>().data(), false);
+    triton::driver::cu_buffer dc(ctx, (CUdeviceptr)c->flat<float>().data(), false);
+    // TODO: wrap `locks` in a cu_buffer (dlocks) and pass it on; output 1 is allocated but unused below
+    triton::driver::cu_buffer dlut(ctx, (CUdeviceptr)lut.flat<int64>().data(), false);
+    // blocksparse matmul
+    triton::dnn::blocksparse::dot dot(N, params_.K, params_.C);
+    dot.enqueue(stream, {&da, &db, &dc, &dlut}, triton::dnn::NO_TUNING);
  }

 private:
@@ -157,4 +185,4 @@ private:
  char bench_string_[256];
 };

-REGISTER_KERNEL_BUILDER(Name("BlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint<float>("T"), BlocksparseMatmulOp);
+REGISTER_KERNEL_BUILDER(Name("TritonBlocksparseMatmul").Device(DEVICE_GPU).TypeConstraint<float>("T"), BlocksparseMatmulOp);
--- a/examples/python/tensorflow/conv.cpp
+++ b/examples/python/tensorflow/conv.cpp
@@ -5,7 +5,6 @@
 #include "triton/driver/stream.h"
 #include "triton/runtime/jit.h"
 #include "triton/tools/bench.hpp"
-#include "triton/dnn/gemm.h"
 #include "triton/dnn/conv.h"

 #define EIGEN_USE_GPU
--- a/examples/python/tensorflow/dot.cpp
+++ b/examples/python/tensorflow/dot.cpp
@@ -5,7 +5,7 @@
 #include "triton/driver/stream.h"
 #include "triton/runtime/jit.h"
 #include "triton/tools/bench.hpp"
-#include "triton/dnn/gemm.h"
+#include "triton/dnn/dot.h"

 #define EIGEN_USE_GPU
 #include "tensorflow/core/framework/op.h"
--- a/examples/python/tensorflow/shift.cpp
+++ b/examples/python/tensorflow/shift.cpp
@@ -19,10 +19,10 @@
 using namespace tensorflow;
 using GPUDevice = Eigen::GpuDevice;

-template<triton::dnn::shift::op_t OP>
+template<triton::dnn::op_t OP>
 class ShiftConvOp : public OpKernel {
 public:
-  explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::shift::NCHW) {
+  explicit ShiftConvOp(OpKernelConstruction* context) : OpKernel(context), layout_(triton::dnn::NCHW) {
    context->GetAttr("shift_h", &h_shift_h_);
    context->GetAttr("shift_w", &h_shift_w_);
    context->GetAttr("stride_h", &stride_h_);
@@ -32,13 +32,13 @@ public:
  }

  void ExtractShapes(const Tensor &x, int64_t &C, int64_t &H, int64_t &W, int64_t &B) {
-    if(layout_ == triton::dnn::shift::CHWN){
+    if(layout_ == triton::dnn::CHWN){
      C  = x.dim_size(0);
      H  = x.dim_size(1);
      W  = x.dim_size(2);
      B  = x.dim_size(3);
    }
-    else if(layout_ == triton::dnn::shift::NCHW){
+    else if(layout_ == triton::dnn::NCHW){
      B  = x.dim_size(0);
      C  = x.dim_size(1);
      H  = x.dim_size(2);
@@ -52,7 +52,7 @@ public:
  void FillShapes(OpKernelContext* context,
                  int64_t &C, int64_t &H, int64_t &W, int64_t &B, int64_t &F,
                  const Tensor& tf_a, const Tensor& tf_b) {
-    if(OP == triton::dnn::shift::WGRAD) {
+    if(OP == triton::dnn::WGRAD) {
      int64_t Ha, Wa, Ba;
      int64_t Hb, Wb, Bb;
      ExtractShapes(tf_a, F, Ha, Wa, Ba);
@@ -68,19 +68,19 @@ public:
      // shapes for a
      int64_t Ca;
      ExtractShapes(tf_a, Ca, H, W, B);
-      if(OP == triton::dnn::shift::BPROP){
+      if(OP == triton::dnn::BPROP){
        H *= stride_h_;
        W *= stride_w_;
      }
      // shapes for b
      int64_t Cb  = tf_b.dim_size(0);
      F   = tf_b.dim_size(1);
-      if(OP == triton::dnn::shift::BPROP)
+      if(OP == triton::dnn::BPROP)
        std::swap(Cb, F);
      // checks
      OP_REQUIRES(context, Ca == Cb, tensorflow::errors::InvalidArgument("operands must have the same number of channels"));
      C = Ca;
-      if(OP == triton::dnn::shift::BPROP)
+      if(OP == triton::dnn::BPROP)
        std::swap(C, F);
    }
  }
@@ -122,7 +122,7 @@ public:
    triton::driver::cu_buffer da(ctx,      (CUdeviceptr)tf_a.flat<Eigen::half>().data(), false);
    triton::driver::cu_buffer db(ctx,      (CUdeviceptr)tf_b.flat<Eigen::half>().data(), false);
    triton::driver::cu_buffer dc(ctx,      (CUdeviceptr)tf_c->flat<Eigen::half>().data(), false);
-    shift.enqueue(stream, {&da, &db, &dc}, false);
+    shift.enqueue(stream, {&da, &db, &dc}, triton::dnn::PARTIAL_TUNING);
  }

 private:
@@ -132,10 +132,10 @@ private:
  int stride_w_;
  int R_;
  int S_;
-  triton::dnn::shift::layout_t layout_;
+  triton::dnn::layout_t layout_;
 };

-REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::FPROP>);
+REGISTER_KERNEL_BUILDER(Name("ShiftConv").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::FPROP>);
 REGISTER_OP("ShiftConv")
    .Input("a: float16")
    .Input("b: float16")
@@ -145,7 +145,7 @@ REGISTER_OP("ShiftConv")
    .Attr("stride_w: int")
    .Output("c: float16");

-REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::BPROP>);
+REGISTER_KERNEL_BUILDER(Name("ShiftConvDx").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::BPROP>);
 REGISTER_OP("ShiftConvDx")
    .Input("a: float16")
    .Input("b: float16")
@@ -155,7 +155,7 @@ REGISTER_OP("ShiftConvDx")
    .Attr("stride_w: int")
    .Output("c: float16");

-REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::shift::WGRAD>);
+REGISTER_KERNEL_BUILDER(Name("ShiftConvDw").Device(DEVICE_GPU), ShiftConvOp<triton::dnn::WGRAD>);
 REGISTER_OP("ShiftConvDw")
    .Input("a: float16")
    .Input("b: float16")