[tests] [common] added reduce.h to common headers
This commit is contained in:
@@ -380,7 +380,16 @@ void layout::run(ir::module &mod) {
|
|||||||
if(auto *red = dynamic_cast<ir::reduce_inst*>(i)) {
|
if(auto *red = dynamic_cast<ir::reduce_inst*>(i)) {
|
||||||
id++;
|
id++;
|
||||||
ir::value *arg = red->get_operand(0);
|
ir::value *arg = red->get_operand(0);
|
||||||
layouts_[id] = new layout_shared_t(get(arg), axes_->get(arg), arg->get_type()->get_tile_shapes(), {red}, red->get_type()->get_scalar_ty(), id, align_);
|
unsigned axis = red->get_axis();
|
||||||
|
// shape
|
||||||
|
auto shapes = arg->get_type()->get_tile_shapes();
|
||||||
|
unsigned shape_ax = shapes[axis];
|
||||||
|
const layout_t *layout = get(arg);
|
||||||
|
unsigned per_thread = layout->nts[axis];
|
||||||
|
unsigned depth = shape_ax / per_thread;
|
||||||
|
shapes[axis] = depth;
|
||||||
|
// create layout
|
||||||
|
layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_);
|
||||||
tmp_[red] = id;
|
tmp_[red] = id;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
@@ -784,18 +784,10 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
|
|||||||
partial[pidx] = accumulate(partial[pidx], current);
|
partial[pidx] = accumulate(partial[pidx], current);
|
||||||
});
|
});
|
||||||
|
|
||||||
// depth
|
|
||||||
unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis];
|
|
||||||
unsigned per_thread = arg_tile->axis(axis).values.size();
|
|
||||||
unsigned depth = shape_ax / per_thread;
|
|
||||||
|
|
||||||
// shapes
|
|
||||||
auto shared_shapes = arg_tile->get_shapes();
|
|
||||||
shared_shapes[axis] = depth;
|
|
||||||
|
|
||||||
// reduce within blocks
|
// reduce within blocks
|
||||||
machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x)));
|
machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x)));
|
||||||
shared_tile *stile = (shared_tile*)slayout->create(x);
|
shared_tile *stile = (shared_tile*)slayout->create(x);
|
||||||
|
unsigned depth = stile->get_shapes()[axis];
|
||||||
|
|
||||||
unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace();
|
unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace();
|
||||||
Type *res_ty = builder_->getFloatTy();
|
Type *res_ty = builder_->getFloatTy();
|
||||||
@@ -832,7 +824,7 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
tgt_->add_barrier(mod_, *builder_);
|
tgt_->add_barrier(mod_, *builder_);
|
||||||
|
// write back
|
||||||
distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
|
distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
|
||||||
x_tile->for_each([&](indices_t idx) {
|
x_tile->for_each([&](indices_t idx) {
|
||||||
indices_t red_idx = idx;
|
indices_t red_idx = idx;
|
||||||
|
148
tests/common/reduce.h
Normal file
148
tests/common/reduce.h
Normal file
@@ -0,0 +1,148 @@
|
|||||||
|
#include <iomanip>
|
||||||
|
#include <cstring>
|
||||||
|
#include <sstream>
|
||||||
|
#include <cstdio>
|
||||||
|
#include "triton/driver/backend.h"
|
||||||
|
#include "triton/driver/stream.h"
|
||||||
|
#include "triton/tools/bench.hpp"
|
||||||
|
#include "triton/external/half.hpp"
|
||||||
|
#include "triton/runtime/function.h"
|
||||||
|
#include "src/reduce.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
namespace drv = triton::driver;
|
||||||
|
namespace rt = triton::runtime;
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
void cc_reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
|
||||||
|
assert(axis <= shapes.size() - 1);
|
||||||
|
// remove shape at index axis to get outer dimensions
|
||||||
|
std::vector<int> outer = shapes;
|
||||||
|
outer.erase(outer.begin() + axis);
|
||||||
|
// retrieve shape at index axis to get inner dimension
|
||||||
|
int inner = shapes[axis];
|
||||||
|
// accumualtion function
|
||||||
|
auto acc = get_accumulator<T>(op);
|
||||||
|
// iterate over outer dimensions
|
||||||
|
_loop_nest(outer, [&](const std::vector<int>& y_idx) {
|
||||||
|
T ret = 0;
|
||||||
|
auto x_idx = y_idx;
|
||||||
|
x_idx.insert(x_idx.begin() + axis, 0);
|
||||||
|
// accumulate over inner dimensions
|
||||||
|
for(int z = 0; z < inner; z++){
|
||||||
|
x_idx[axis] = z;
|
||||||
|
ret = acc(ret, x[offset(x_idx, shapes)]);
|
||||||
|
}
|
||||||
|
y[offset(y_idx, outer)] = ret;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Selects what triton_reduce_nd does with the compiled kernel.
enum run_mode_t {
  BENCH = 0,  ///< time the kernel and report throughput
  TEST  = 1   ///< run once and compare against the host reference
};
|
||||||
|
|
||||||
|
void triton_reduce_nd(drv::stream* stream, const std::vector<int32_t>& shape,
|
||||||
|
int axis, reduce_op_t op,
|
||||||
|
const std::vector<int32_t>& x_order, const std::vector<int32_t>& y_order,
|
||||||
|
std::vector<std::vector<std::string>> TS,
|
||||||
|
run_mode_t mode, std::vector<double>& bench, bool &test) {
|
||||||
|
typedef float NumericT;
|
||||||
|
std::string ty = "float";
|
||||||
|
size_t dtsize = sizeof(NumericT);
|
||||||
|
drv::context* context = stream->context();
|
||||||
|
size_t axy = (axis == 0) ? 1 : 0;
|
||||||
|
|
||||||
|
// rank
|
||||||
|
size_t rank = shape.size();
|
||||||
|
// size
|
||||||
|
size_t size = 1;
|
||||||
|
for(int32_t d: shape)
|
||||||
|
size *= d;
|
||||||
|
std::vector<std::string> shapename = {"S0", "S1", "S2"};
|
||||||
|
// strides for x
|
||||||
|
std::vector<std::string> x_strides = {"1"};
|
||||||
|
for(size_t d = 0; d < rank - 1; d++)
|
||||||
|
x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
|
||||||
|
// strides for y
|
||||||
|
std::vector<std::string> y_strides = {"1"};
|
||||||
|
for(size_t d = 0; d < rank - 1; d++)
|
||||||
|
y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
|
||||||
|
|
||||||
|
// create inputs
|
||||||
|
auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
|
||||||
|
auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
|
||||||
|
// create options
|
||||||
|
rt::function::options_space_t opt;
|
||||||
|
|
||||||
|
// type
|
||||||
|
opt.defines.push_back({"TYPE", {ty}});
|
||||||
|
// x strides
|
||||||
|
for(size_t d = 0; d < rank; d++)
|
||||||
|
opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
|
||||||
|
// y strides
|
||||||
|
for(size_t d = 0; d < rank; d++)
|
||||||
|
opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
|
||||||
|
if(TS.empty())
|
||||||
|
TS = tile_nd(rank);
|
||||||
|
// tile size
|
||||||
|
for(size_t d = 0; d < rank; d++)
|
||||||
|
opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
|
||||||
|
// non-reduced axis
|
||||||
|
std::string RY = (axis == 0) ? "rn" : "rm";
|
||||||
|
opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
|
||||||
|
opt.defines.push_back({"RY", {RY}});
|
||||||
|
// reduction broadcasting
|
||||||
|
std::string RED = "";
|
||||||
|
for(int n = 0; n < 2; n++){
|
||||||
|
if(n > 0)
|
||||||
|
RED += ", ";
|
||||||
|
RED += (n==axis) ? to_str(op) : ":";
|
||||||
|
}
|
||||||
|
opt.defines.push_back({"RED", {RED}});
|
||||||
|
|
||||||
|
opt.num_warps = {4};
|
||||||
|
|
||||||
|
// kernel
|
||||||
|
rt::function function(src::reduce2d, opt);
|
||||||
|
|
||||||
|
// grid
|
||||||
|
std::vector<rt::arg> args = {&*dx, &*dy};
|
||||||
|
for(int32_t d: shape)
|
||||||
|
args.push_back(d);
|
||||||
|
args.push_back(shape[0]);
|
||||||
|
std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
|
||||||
|
auto grid = grid_nd(shape, ts);
|
||||||
|
|
||||||
|
// metrics
|
||||||
|
if(mode == BENCH){
|
||||||
|
auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
|
||||||
|
double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream);
|
||||||
|
bench.push_back(gbps(triton_ns));
|
||||||
|
}
|
||||||
|
|
||||||
|
// test triton
|
||||||
|
if(mode == TEST){
|
||||||
|
std::vector<NumericT> hy(shape[axy]);
|
||||||
|
std::vector<NumericT> ry(shape[axy]);
|
||||||
|
std::vector<NumericT> hx(shape[0]*shape[1]);
|
||||||
|
init_zeros(hy);
|
||||||
|
init_rand(hx);
|
||||||
|
stream->write(&*dx, true, 0, hx);
|
||||||
|
function(args, grid, stream);
|
||||||
|
stream->synchronize();
|
||||||
|
stream->read(&*dy, true, 0, hy);
|
||||||
|
cc_reduce_nd(ry, hx, op, axis, shape);
|
||||||
|
test = testing::diff(hy, ry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
|
||||||
|
std::vector<double> bench;
|
||||||
|
bool test;
|
||||||
|
std::vector<std::vector<std::string>> TSS;
|
||||||
|
for(int32_t d: shape)
|
||||||
|
TSS.push_back({std::to_string(d)});
|
||||||
|
triton_reduce_nd(stream, shape, axis, op, {0, 1}, {0, 1}, TSS, TEST, bench, test);
|
||||||
|
return test;
|
||||||
|
}
|
@@ -16,9 +16,9 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16),
|
|||||||
int M, int N, int ldx) {
|
int M, int N, int ldx) {
|
||||||
int ridm = get_program_id(0);
|
int ridm = get_program_id(0);
|
||||||
int ridn = get_program_id(1);
|
int ridn = get_program_id(1);
|
||||||
int rm[TM] = ridm * TM + 0 ... TM;
|
int rm[TS0] = ridm * TS0 + 0 ... TS0;
|
||||||
int rn[TN] = ridn * TN + 0 ... TN;
|
int rn[TS1] = ridn * TS1 + 0 ... TS1;
|
||||||
TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
|
TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
|
||||||
TYPE* py[TY] = Y + RY;
|
TYPE* py[TY] = Y + RY;
|
||||||
*py = (*px)[RED];
|
*py = (*px)[RED];
|
||||||
}
|
}
|
||||||
|
@@ -8,76 +8,10 @@
|
|||||||
#include "triton/tools/bench.hpp"
|
#include "triton/tools/bench.hpp"
|
||||||
#include "triton/external/half.hpp"
|
#include "triton/external/half.hpp"
|
||||||
#include "triton/runtime/function.h"
|
#include "triton/runtime/function.h"
|
||||||
#include "src/reduce.h"
|
|
||||||
#include "cuda/cublas.h"
|
#include "cuda/cublas.h"
|
||||||
|
#include "reduce.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
namespace drv = triton::driver;
|
|
||||||
namespace rt = triton::runtime;
|
|
||||||
|
|
||||||
template<class T>
|
|
||||||
void reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
|
|
||||||
assert(axis <= shapes.size() - 1);
|
|
||||||
// remove shape at index axis to get outer dimensions
|
|
||||||
std::vector<int> outer = shapes;
|
|
||||||
outer.erase(outer.begin() + axis);
|
|
||||||
// retrieve shape at index axis to get inner dimension
|
|
||||||
int inner = shapes[axis];
|
|
||||||
// accumualtion function
|
|
||||||
auto acc = get_accumulator<T>(op);
|
|
||||||
// iterate over outer dimensions
|
|
||||||
_loop_nest(outer, [&](const std::vector<int>& y_idx) {
|
|
||||||
T ret = 0;
|
|
||||||
auto x_idx = y_idx;
|
|
||||||
x_idx.insert(x_idx.begin() + axis, 0);
|
|
||||||
// accumulate over inner dimensions
|
|
||||||
for(int z = 0; z < inner; z++){
|
|
||||||
x_idx[axis] = z;
|
|
||||||
ret = acc(ret, x[offset(x_idx, shapes)]);
|
|
||||||
}
|
|
||||||
y[offset(y_idx, outer)] = ret;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/// Runs the 2-d reduction kernel once on `stream` and compares its output with
/// the host reference `reduce_nd`.
///
/// @param stream Stream to launch on; its context owns the device buffers.
/// @param shape  Extents of the 2-d input; also used as the tile sizes TM/TN.
/// @param axis   Dimension that is reduced.
/// @param op     Reduction operator.
/// @param nwarp  Warp count the kernel is compiled with.
/// @return The result of `testing::diff` between kernel output and reference.
bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
  using NumericT = float;
  std::string ty = "float";
  size_t dt_nbytes = sizeof(NumericT);
  drv::context* context = stream->context();

  // dimension that is kept, and the kernel range variable that indexes it
  size_t axy;
  std::string RY;
  if(axis == 0){
    axy = 1;
    RY = "rn";
  }
  else{
    axy = 0;
    RY = "rm";
  }

  // host buffers: kernel output, reference output, input
  std::vector<NumericT> hy(shape[axy]);
  std::vector<NumericT> ry(shape[axy]);
  std::vector<NumericT> hx(shape[0]*shape[1]);
  srand(0);
  init_zeros(hy);
  init_rand(hx);

  // device buffers, filled from the host copies
  std::shared_ptr<drv::buffer> dy(drv::buffer::create(context, hy.size()*dt_nbytes));
  std::shared_ptr<drv::buffer> dx(drv::buffer::create(context, hx.size()*dt_nbytes));
  stream->write(dy.get(), true, 0, hy);
  stream->write(dx.get(), true, 0, hx);

  // reduction subscript: the operator at `axis`, ":" on the other dimension
  std::string RED = "";
  for(int n = 0; n < 2; n++){
    if(n > 0)
      RED += ", ";
    if(n == axis)
      RED += to_str(op);
    else
      RED += ":";
  }

  // compile-time defines for the kernel
  rt::function::options_space_t opt;
  opt.defines.push_back({"TYPE", {ty}});
  opt.defines.push_back({"TM", {std::to_string(shape[0])}});
  opt.defines.push_back({"TN", {std::to_string(shape[1])}});
  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
  opt.defines.push_back({"RY", {RY}});
  opt.defines.push_back({"RED", {RED}});
  opt.num_warps = {nwarp};

  // launch, wait, and fetch the result
  rt::function function(src::reduce2d, opt);
  function({dx.get(), dy.get(), shape[0], shape[1], shape[0]}, grid2d(shape[0], shape[1]), stream);
  stream->synchronize();
  stream->read(dy.get(), true, 0, hy);

  // compare against the host reference
  reduce_nd(ry, hx, op, axis, shape);
  return testing::diff(hy, ry);
}
|
|
||||||
|
|
||||||
int main() {
|
int main() {
|
||||||
// initialize default compute device
|
// initialize default compute device
|
||||||
|
Reference in New Issue
Block a user