[lang] added templates for reductions

2019-09-10 15:54:16 -04:00
parent 41acac6ba1
commit 2781cdcf93
8 changed files with 124 additions and 15 deletions
--- a/include/triton/lang/token.h
+++ b/include/triton/lang/token.h
@@ -180,6 +180,9 @@ public:
    PLUS,
    MINUS,
    CAST,
+    REDUCE_ADD,
+    REDUCE_MAX,
+    REDUCE_MIN,

    // For preprocessor
    PP_IF,
--- a/include/triton/runtime/function.h
+++ b/include/triton/runtime/function.h
@@ -70,7 +70,7 @@ public:
  struct options_space_t {
    typedef std::pair<std::string, std::vector<std::string>> define_t;
    std::vector<define_t> defines;
-    std::vector<size_t> num_warps;
+    std::vector<int> num_warps;
  };

  struct options_t {
--- a/lib/lang/parser.cc
+++ b/lib/lang/parser.cc
@@ -453,14 +453,27 @@ Expr* Parser::ParseSubScripting(Expr* lhs) {
  TileType::ShapeInt shape;
  size_t i = 0;
  const Token* tok;
+  std::vector<std::pair<int, int>> redList;
  do {
    tok = ts_.Next();
-    if(tok->tag_ == ':')
-      shape.push_back(lhsShape[i++]);
-    else if(tok->tag_ == Token::NEWAXIS)
-      shape.push_back(1);
-    else
-      Error(tok, "only ':' and newaxis are supported in subscripts");
+    switch(tok->tag_) {
+      case ':':
+        shape.push_back(lhsShape[i++]);
+        break;
+
+      case Token::NEWAXIS:
+        shape.push_back(1);
+        break;
+
+//      case Token::ADD:
+//      case Token::SUB:
+//        redList.push_back({i, tok->tag_});
+//        break;
+
+      default:
+        Error(tok, "Unexpected subscript symbol encountered at dimension %d", i);
+        break;
+    }
  }while(ts_.Try(','));
  ts_.Expect(']');
  if(lhsShape.size() > i)
--- a/tests/common/src/reduce.h
+++ b/tests/common/src/reduce.h
@@ -0,0 +1,27 @@
+// Kernel sources used by the reduction unit test, stored as raw string literals.
+// TYPE, TM and TN are not defined in the strings themselves; the host test
+// supplies them through its "defines" option list.
+namespace src {
+
+    // 1D reduction kernel: signature only, the body is currently empty.
+    // NOTE(review): Y is marked __readonly here but __writeonly in reduce2d -- confirm intended.
+    const char *reduce1d =
+R"(
+void reduce1d(TYPE * X __noalias __readonly __aligned(16),
+              TYPE * Y __noalias __readonly __aligned(16),
+              int N) {
+}
+)";
+
+
+    // 2D reduction kernel: each program instance builds a TM x TN tile of
+    // pointers into X (row offsets rm, column offsets rn scaled by ldx), and
+    // stores (*px)[:, +] through py.
+    // NOTE(review): py is declared [TM, TN] while the [:, +] subscript presumably
+    // collapses the second axis -- confirm the shapes agree.
+    const char *reduce2d =
+R"(
+void reduce2d(TYPE * X __noalias __readonly __aligned(16),
+            TYPE * Y __noalias __writeonly __aligned(16),
+            int M, int N, int ldx) {
+  int ridm = get_program_id(0);
+  int ridn = get_program_id(1);
+  int rm[TM] = ridm * TM + 0 ... TM;
+  int rn[TN] = ridn * TN + 0 ... TN;
+  TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
+  TYPE* py[TM, TN] = Y + rm[:, newaxis];
+  *py = (*px)[:, +];
+}
+)";
+
+}
--- a/tests/common/util.h
+++ b/tests/common/util.h
@@ -31,6 +31,13 @@ enum order_t {
  COLMAJOR
 };

+// Overwrites every element of x with rand()/RAND_MAX converted to T.
+// The quotient is evaluated in double (range [0, 1]) before the cast to T.
+// Draws from the C rand() generator, so the sequence follows the caller's srand() seed.
+template<class T>
+void init_rand(std::vector<T>& x) {
+  for(size_t i = 0; i < x.size(); i++)
+    x[i] = static_cast<T>((double)rand()/RAND_MAX);
+}
+
+

 namespace aux{
 template<std::size_t...> struct seq{};
--- a/tests/unit/CMakeLists.txt
+++ b/tests/unit/CMakeLists.txt
@@ -1,4 +1,4 @@
-foreach(PROG dot)
+foreach(PROG dot reduce)
     set(TARGET unit_${PROG})
     add_executable(${TARGET} ${PROG}.cc)
     set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET})
--- a/tests/unit/dot.cc
+++ b/tests/unit/dot.cc
@@ -50,7 +50,7 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K,
 }


-bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){
+bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, int nwarp){
  typedef float NumericT;
  std::string ty = "float";
  size_t dt_nbytes = sizeof(NumericT);
@@ -62,12 +62,9 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_
  int32_t ldb = BT ? N : K;
  int32_t ldc = M;
  srand(0);
-  for(size_t i = 0; i < ha.size(); i++)
-    ha[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
-  for(size_t i = 0; i < hb.size(); i++)
-    hb[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
-  for(size_t i = 0; i < hc.size(); i++)
-    hc[i] = static_cast<NumericT>((double)0);
+  init_rand(ha);
+  init_rand(hb);
+  init_rand(hc);
  auto dc = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hc.size()*dt_nbytes));
  auto da = std::shared_ptr<drv::buffer>(drv::buffer::create(context, ha.size()*dt_nbytes));
  auto db = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hb.size()*dt_nbytes));
--- a/tests/unit/reduce.cc
+++ b/tests/unit/reduce.cc
@@ -0,0 +1,62 @@
+#include <iomanip>
+#include <cstring>
+#include <sstream>
+#include <cstdio>
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/tools/bench.hpp"
+#include "triton/external/half.hpp"
+#include "triton/runtime/function.h"
+#include "src/reduce.h"
+#include "cuda/cublas.h"
+#include "util.h"
+
+namespace drv = triton::driver;
+namespace rt = triton::runtime;
+
+
+// Launches the reduce2d kernel on a randomly initialised M x N input and waits
+// for the stream to synchronize.
+//   stream : stream used for the transfers and the launch; buffers are created
+//            in stream->context()
+//   M, N   : problem size, also forwarded to the kernel as the TM / TN defines
+//   op     : reduction operator requested by the caller (not used yet)
+//   nwarp  : the single entry placed in the num_warps option space
+// Returns true after stream->synchronize() returns.
+// NOTE(review): the device output is not compared against a host reference yet,
+// so the return value does not validate the numerical result.
+bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){
+  typedef float NumericT;
+  std::string ty = "float";
+  size_t dt_nbytes = sizeof(NumericT);
+  drv::context* context = stream->context();
+  // hy: M-element output vector, hx: M*N input matrix
+  std::vector<NumericT> hy(M);
+  std::vector<NumericT> hx(M*N);
+  srand(0);
+  init_rand(hy);
+  init_rand(hx);
+  auto dy = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hy.size()*dt_nbytes));
+  auto dx = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hx.size()*dt_nbytes));
+  stream->write(&*dy, true, 0, hy);
+  stream->write(&*dx, true, 0, hx);
+  rt::function::options_space_t opt;
+  opt.defines.push_back({"TYPE", {ty}});
+  opt.defines.push_back({"TM", {std::to_string(M)}});
+  opt.defines.push_back({"TN", {std::to_string(N)}});
+  opt.num_warps = {nwarp};
+  rt::function function(src::reduce2d, opt);
+  // reduce2d is declared as (X, Y, M, N, ldx): X is the M*N input and Y the
+  // M-element output, so dx must come first. Passing dy as X would make the
+  // kernel address a TM x TN tile inside an M-element buffer.
+  function({&*dx, &*dy, M, N, M}, grid2d(M, N), stream);
+  stream->synchronize();
+  // The function is declared bool and main() branches on the result; flowing
+  // off the end without a return statement is undefined behaviour.
+  return true;
+}
+
+// Test driver: runs do_test once per (M, N, op) configuration with nwarp = 1
+// and prints "Pass!" or "Fail!" for each.
+// NOTE(review): there is no return statement, so the process exit status is 0
+// even when a configuration reports "Fail!".
+int main() {
+  // initialize default compute device
+  auto context = triton::driver::backend::contexts::get_default();
+  triton::driver::stream* stream = triton::driver::stream::create(context);
+  // configurations to test: (M, N, reduction operator)
+  typedef std::tuple<int, int, std::string> config_t;
+  std::vector<config_t> configs = {
+    config_t{32, 32, "+"}
+  };
+  // does the work
+  int M, N;
+  std::string op;
+  for(const auto& c: configs){
+    // unpack the current configuration into the loop-local parameters
+    std::tie(M, N, op) = c;
+    // NOTE(review): streaming a std::tuple needs a user-provided operator<<;
+    // presumably it comes from util.h -- confirm.
+    std::cout << "Testing " << c << " ... " << std::flush;
+    if(do_test(stream, M, N, op, 1))
+      std::cout << " Pass! " << std::endl;
+    else
+      std::cout << " Fail! " << std::endl;
+  }
+}