[lang] added templates for reductions

This commit is contained in:
Philippe Tillet
2019-09-10 15:54:16 -04:00
parent 41acac6ba1
commit 2781cdcf93
8 changed files with 124 additions and 15 deletions

View File

@@ -180,6 +180,9 @@ public:
PLUS,
MINUS,
CAST,
REDUCE_ADD,
REDUCE_MAX,
REDUCE_MIN,
// For preprocessor
PP_IF,

View File

@@ -70,7 +70,7 @@ public:
// Describes a space of options: for every define, a name paired with a list
// of string values, plus a list of num_warps values.
// NOTE(review): presumably each list holds the candidates to try — confirm
// against the code that consumes this struct.
struct options_space_t {
  // (define name, values for that define)
  typedef std::pair<std::string, std::vector<std::string>> define_t;
  std::vector<define_t> defines;
  // Stored as int so callers can brace-initialize it from an int warp count
  // (e.g. `num_warps = {nwarp}`) without a narrowing conversion.
  std::vector<int> num_warps;
};
struct options_t {

View File

@@ -453,14 +453,27 @@ Expr* Parser::ParseSubScripting(Expr* lhs) {
TileType::ShapeInt shape;
size_t i = 0;
const Token* tok;
std::vector<std::pair<int, int>> redList;
do {
tok = ts_.Next();
if(tok->tag_ == ':')
shape.push_back(lhsShape[i++]);
else if(tok->tag_ == Token::NEWAXIS)
shape.push_back(1);
else
Error(tok, "only ':' and newaxis are supported in subscripts");
switch(tok->tag_) {
case ':':
shape.push_back(lhsShape[i++]);
break;
case Token::NEWAXIS:
shape.push_back(1);
break;
// case Token::ADD:
// case Token::SUB:
// redList.push_back({i, tok->tag_});
// break;
default:
Error(tok, "Unexpected subscript symbol encountered at dimension %d", i);
break;
}
}while(ts_.Try(','));
ts_.Expect(']');
if(lhsShape.size() > i)

27
tests/common/src/reduce.h Normal file
View File

@@ -0,0 +1,27 @@
// Kernel source strings, kept as raw string literals.
// TYPE, TM and TN are not defined inside the strings themselves; presumably
// the caller supplies them as defines when the source is compiled — confirm
// against the test driver.
namespace src {
// 1D reduction kernel: signature only, the body is currently empty.
// NOTE(review): both X and Y are marked __readonly here, whereas reduce2d
// marks Y __writeonly — confirm which is intended once a body is added.
const char *reduce1d =
R"(
void reduce1d(TYPE * X __noalias __readonly __aligned(16),
TYPE * Y __noalias __readonly __aligned(16),
int N) {
}
)";
// 2D reduction kernel.
// Each program instance derives TM row indices (rm) and TN column indices (rn)
// from its two program ids, builds a TM x TN block of pointers into X where
// the rn index is scaled by ldx, and stores `(*px)[:, +]` through py.
// NOTE(review): `[:, +]` is presumably an add-reduction over the second
// axis — confirm against the language front end.
const char *reduce2d =
R"(
void reduce2d(TYPE * X __noalias __readonly __aligned(16),
TYPE * Y __noalias __writeonly __aligned(16),
int M, int N, int ldx) {
int ridm = get_program_id(0);
int ridn = get_program_id(1);
int rm[TM] = ridm * TM + 0 ... TM;
int rn[TN] = ridn * TN + 0 ... TN;
TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
TYPE* py[TM, TN] = Y + rm[:, newaxis];
*py = (*px)[:, +];
}
)";
}

View File

@@ -31,6 +31,13 @@ enum order_t {
COLMAJOR
};
// Overwrites every element of `x` with rand() / RAND_MAX (computed in double
// precision, so the quotient lies in [0, 1]) converted to T.
// Elements are filled front to back, one rand() call per element, so the
// result is reproducible for a given srand() seed.
template<class T>
void init_rand(std::vector<T>& x) {
  for(auto&& elem : x) {
    const double unit = static_cast<double>(rand()) / RAND_MAX;
    elem = static_cast<T>(unit);
  }
}
namespace aux{
template<std::size_t...> struct seq{};

View File

@@ -1,4 +1,4 @@
foreach(PROG dot)
foreach(PROG dot reduce)
set(TARGET unit_${PROG})
add_executable(${TARGET} ${PROG}.cc)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME ${TARGET})

View File

@@ -50,7 +50,7 @@ void cpu_ref(bool AT_, bool BT_, size_t M, size_t N, size_t K,
}
bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, size_t nwarp){
bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_t K, int32_t TM, int32_t TN, int32_t TK, int nwarp){
typedef float NumericT;
std::string ty = "float";
size_t dt_nbytes = sizeof(NumericT);
@@ -62,12 +62,9 @@ bool do_test(drv::stream* stream, bool AT, bool BT, int32_t M, int32_t N, int32_
int32_t ldb = BT ? N : K;
int32_t ldc = M;
srand(0);
for(size_t i = 0; i < ha.size(); i++)
ha[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
for(size_t i = 0; i < hb.size(); i++)
hb[i] = static_cast<NumericT>((float)rand()/RAND_MAX);
for(size_t i = 0; i < hc.size(); i++)
hc[i] = static_cast<NumericT>((double)0);
init_rand(ha);
init_rand(hb);
init_rand(hc);
auto dc = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hc.size()*dt_nbytes));
auto da = std::shared_ptr<drv::buffer>(drv::buffer::create(context, ha.size()*dt_nbytes));
auto db = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hb.size()*dt_nbytes));

62
tests/unit/reduce.cc Normal file
View File

@@ -0,0 +1,62 @@
#include <iomanip>
#include <cstring>
#include <sstream>
#include <cstdio>
#include "triton/driver/backend.h"
#include "triton/driver/stream.h"
#include "triton/tools/bench.hpp"
#include "triton/external/half.hpp"
#include "triton/runtime/function.h"
#include "src/reduce.h"
#include "cuda/cublas.h"
#include "util.h"
namespace drv = triton::driver;
namespace rt = triton::runtime;
// Launches the src::reduce2d kernel once on an M x N input and waits for it
// to finish.
//
//   stream : driver stream used for the buffer uploads and the kernel launch
//   M, N   : input extents; also forwarded to the kernel source as the TM and
//            TN defines
//   op     : reduction operator tag; currently unused by this function
//   nwarp  : the single num_warps value handed to the runtime
//
// Returns true once the launch has been synchronized.
// TODO: read Y back and compare it against a host-side reference so that a
// wrong result is reported as a failure instead of a pass.
bool do_test(drv::stream* stream, int M, int N, std::string op, int nwarp){
  typedef float NumericT;
  (void)op;
  const std::string ty = "float";
  const size_t dt_nbytes = sizeof(NumericT);
  drv::context* context = stream->context();
  // host buffers: hx is the M*N input, hy the M-element output
  std::vector<NumericT> hy(M);
  std::vector<NumericT> hx(M*N);
  srand(0);
  init_rand(hy);
  init_rand(hx);
  // device buffers, uploaded from the host copies
  auto dy = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hy.size()*dt_nbytes));
  auto dx = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hx.size()*dt_nbytes));
  stream->write(dy.get(), true, 0, hy);
  stream->write(dx.get(), true, 0, hx);
  // compilation options: element type, tile sizes and warp count
  rt::function::options_space_t opt;
  opt.defines.push_back({"TYPE", {ty}});
  opt.defines.push_back({"TM", {std::to_string(M)}});
  opt.defines.push_back({"TN", {std::to_string(N)}});
  opt.num_warps = {nwarp};
  rt::function function(src::reduce2d, opt);
  // Argument order follows reduce2d(X, Y, M, N, ldx): X is the M*N input and
  // Y the M-element output. Passing them the other way round makes the kernel
  // read M*N elements from the M-element buffer.
  function({dx.get(), dy.get(), M, N, M}, grid2d(M, N), stream);
  stream->synchronize();
  // A bool function must return a value; falling off the end is undefined
  // behaviour and the caller branches on the result.
  return true;
}
int main() {
// initialize default compute device
auto context = triton::driver::backend::contexts::get_default();
triton::driver::stream* stream = triton::driver::stream::create(context);
// shapes to benchmark
typedef std::tuple<int, int, std::string> config_t;
std::vector<config_t> configs = {
config_t{32, 32, "+"}
};
// does the work
int M, N;
std::string op;
for(const auto& c: configs){
std::tie(M, N, op) = c;
std::cout << "Testing " << c << " ... " << std::flush;
if(do_test(stream, M, N, op, 1))
std::cout << " Pass! " << std::endl;
else
std::cout << " Fail! " << std::endl;
}
}