diff --git a/lib/codegen/analysis/layout.cc b/lib/codegen/analysis/layout.cc
index 1066a5cae..eda55e451 100644
--- a/lib/codegen/analysis/layout.cc
+++ b/lib/codegen/analysis/layout.cc
@@ -380,7 +380,16 @@ void layout::run(ir::module &mod) {
     if(auto *red = dynamic_cast<ir::reduce_inst*>(i)) {
       id++;
       ir::value *arg = red->get_operand(0);
-      layouts_[id] = new layout_shared_t(get(arg), axes_->get(arg), arg->get_type()->get_tile_shapes(), {red}, red->get_type()->get_scalar_ty(), id, align_);
+      unsigned axis = red->get_axis();
+      // shape
+      auto shapes = arg->get_type()->get_tile_shapes();
+      unsigned shape_ax = shapes[axis];
+      const layout_t *layout = get(arg);
+      unsigned per_thread = layout->nts[axis];
+      unsigned depth = shape_ax / per_thread;
+      shapes[axis] = depth;
+      // create layout
+      layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_);
       tmp_[red] = id;
     }
   });
diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc
index 8fbdbeded..d9d1e1cec 100644
--- a/lib/codegen/selection/generator.cc
+++ b/lib/codegen/selection/generator.cc
@@ -784,18 +784,10 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
     partial[pidx] = accumulate(partial[pidx], current);
   });
 
-  // depth
-  unsigned shape_ax = arg->get_type()->get_tile_shapes()[axis];
-  unsigned per_thread = arg_tile->axis(axis).values.size();
-  unsigned depth = shape_ax / per_thread;
-
-  // shapes
-  auto shared_shapes = arg_tile->get_shapes();
-  shared_shapes[axis] = depth;
-
   // reduce within blocks
   machine_layout_t *slayout = machine_layouts_.at(layouts_->get(layouts_->tmp(x)));
   shared_tile *stile = (shared_tile*)slayout->create(x);
+  unsigned depth = stile->get_shapes()[axis];
 
   unsigned addr_space = sh_mem_ptr_->getType()->getPointerAddressSpace();
   Type *res_ty = builder_->getFloatTy();
@@ -832,7 +824,7 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
     }
   }
   tgt_->add_barrier(mod_, *builder_);
-
+  // write back
  distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
  x_tile->for_each([&](indices_t idx) {
    indices_t red_idx = idx;
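
The substance of the codegen change above: the shape of the shared-memory scratch tile for a reduction is now decided once, in the layout pass, instead of being recomputed in the generator. Each thread first folds its `nts[axis]` private values into one partial result, so the scratch tile only needs `shape[axis] / nts[axis]` slots along the reduced axis. A minimal standalone sketch of that arithmetic (the tile shape and `nts` value below are illustrative, not taken from the patch):

    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<unsigned> shapes = {128, 64};  // distributed tile shape (illustrative)
      unsigned axis = 0;                         // axis being reduced
      unsigned nts = 4;                          // contiguous elements per thread on that axis (assumed)
      assert(shapes[axis] % nts == 0);
      unsigned depth = shapes[axis] / nts;       // partial results that must go through shared memory
      shapes[axis] = depth;                      // shared scratch shape becomes {32, 64}
      std::printf("scratch depth along axis %u: %u\n", axis, depth);
      return 0;
    }

diff --git a/tests/common/reduce.h b/tests/common/reduce.h
new file mode 100644
index 000000000..86e066638
--- /dev/null
+++ b/tests/common/reduce.h
@@ -0,0 +1,148 @@
+#include <vector>
+#include <string>
+#include <memory>
+#include <cassert>
+#include "triton/driver/backend.h"
+#include "triton/driver/stream.h"
+#include "triton/tools/bench.hpp"
+#include "triton/external/half.hpp"
+#include "triton/runtime/function.h"
+#include "src/reduce.h"
+#include "util.h"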
+
+namespace drv = triton::driver;
+namespace rt = triton::runtime;
+
+template<class T>
+void cc_reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
+  assert(axis <= shapes.size() - 1);
+  // remove shape at index axis to get outer dimensions
+  std::vector<int> outer = shapes;
+  outer.erase(outer.begin() + axis);
+  // retrieve shape at index axis to get inner dimension
+  int inner = shapes[axis];
+  // accumulation function
+  auto acc = get_accumulator<T>(op);
+  // iterate over outer dimensions
+  _loop_nest(outer, [&](const std::vector<int>& y_idx) {
+    T ret = 0;
+    auto x_idx = y_idx;
+    x_idx.insert(x_idx.begin() + axis, 0);
+    // accumulate over inner dimensions
+    for(int z = 0; z < inner; z++){
+      x_idx[axis] = z;
+      ret = acc(ret, x[offset(x_idx, shapes)]);
+    }
+    y[offset(y_idx, outer)] = ret;
+  });
+}
+
+enum run_mode_t {
+  BENCH,
+  TEST
+};
+
+void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
+                      int axis, reduce_op_t op,
+                      const std::vector<int>& x_order, const std::vector<int>& y_order,
+                      std::vector<std::vector<std::string>> TS,
+                      run_mode_t mode, std::vector<double>& bench, bool &test) {
+  typedef float NumericT;
+  std::string ty = "float";
+  size_t dtsize = sizeof(NumericT);
+  drv::context* context = stream->context();
+  size_t axy = (axis == 0) ? 1 : 0;
+
+  // rank
+  size_t rank = shape.size();
+  // size
+  size_t size = 1;
+  for(int32_t d: shape)
+    size *= d;
+  std::vector<std::string> shapename = {"S0", "S1", "S2"};
+  // strides for x
+  std::vector<std::string> x_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
+  // strides for y
+  std::vector<std::string> y_strides = {"1"};
+  for(size_t d = 0; d < rank - 1; d++)
+    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
+
+  // create inputs
+  auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
+  auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
+  // create options
+  rt::function::options_space_t opt;
+
+  // type
+  opt.defines.push_back({"TYPE", {ty}});
+  // x strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
+  // y strides
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
+  if(TS.empty())
+    TS = tile_nd(rank);
+  // tile size
+  for(size_t d = 0; d < rank; d++)
+    opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
+  // non-reduced axis
+  std::string RY = (axis == 0) ? "rn" : "rm";
+  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
+  opt.defines.push_back({"RY", {RY}});
+  // reduction broadcasting
+  std::string RED = "";
+  for(int n = 0; n < 2; n++){
+    if(n > 0)
+      RED += ", ";
+    RED += (n==axis) ? to_str(op) : ":";
+  }
+  opt.defines.push_back({"RED", {RED}});
+
+  opt.num_warps = {4};
+
+  // kernel
+  rt::function function(src::reduce2d, opt);
+
+  // grid
+  std::vector<rt::arg> args = {&*dx, &*dy};
+  for(int32_t d: shape)
+    args.push_back(d);
+  args.push_back(shape[0]);
+  std::vector<std::string> ts = {"TS0", "TS1", "TS2"};
+  auto grid = grid_nd(shape, ts);
+
+  // metrics
+  if(mode == BENCH){
+    auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; };
+    double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream);
+    bench.push_back(gbps(triton_ns));
+  }
+
+  // test triton
+  if(mode == TEST){
+    std::vector<NumericT> hy(shape[axy]);
+    std::vector<NumericT> ry(shape[axy]);
+    std::vector<NumericT> hx(shape[0]*shape[1]);
+    init_zeros(hy);
+    init_rand(hx);
+    stream->write(&*dx, true, 0, hx);
+    function(args, grid, stream);
+    stream->synchronize();
+    stream->read(&*dy, true, 0, hy);
+    cc_reduce_nd(ry, hx, op, axis, shape);
+    test = testing::diff(hy, ry);
+  }
+}
+
+bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
+  std::vector<double> bench;
+  bool test;
+  std::vector<std::vector<std::string>> TSS;
+  for(int32_t d: shape)
+    TSS.push_back({std::to_string(d)});
+  triton_reduce_nd(stream, shape, axis, op, {0, 1}, {0, 1}, TSS, TEST, bench, test);
+  return test;
+}
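
`cc_reduce_nd` above is the CPU reference that the harness compares the kernel against; the only subtle part is how it rebuilds an input index from an output index. Below is a self-contained sketch of the same idea on a concrete 2x3 ADD reduction along axis 1, with a hand-rolled row-major `offset` standing in for the `util.h` helper (whose real implementation is not shown in this patch):

    #include <cstdio>
    #include <vector>

    // Row-major stand-in for the util.h offset() helper (an assumption, not
    // the actual implementation): linearizes idx against shapes.
    static int offset(const std::vector<int>& idx, const std::vector<int>& shapes) {
      int off = 0;
      for(size_t d = 0; d < idx.size(); d++)
        off = off * shapes[d] + idx[d];
      return off;
    }

    int main() {
      // 2x3 input, row-major: {{1, 2, 3}, {4, 5, 6}}, reduced with ADD along axis 1
      std::vector<int> shapes = {2, 3};
      std::vector<float> x = {1, 2, 3, 4, 5, 6};
      size_t axis = 1;
      std::vector<int> outer = {shapes[0]};      // shapes with the reduced axis erased
      std::vector<float> y(outer[0]);
      for(int i = 0; i < outer[0]; i++) {        // 1D instance of the _loop_nest traversal
        float ret = 0;
        std::vector<int> x_idx = {i, 0};         // output index with 0 inserted at `axis`
        for(int z = 0; z < shapes[axis]; z++) {
          x_idx[axis] = z;
          ret += x[offset(x_idx, shapes)];       // ADD accumulator
        }
        y[i] = ret;
      }
      std::printf("y = {%g, %g}\n", y[0], y[1]); // expected: {6, 15}
      return 0;
    }

diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h
index 3a77e960e..cc44ca5fc 100644
--- a/tests/common/src/reduce.h
+++ b/tests/common/src/reduce.h
@@ -16,9 +16,9 @@ void reduce2d(TYPE * X __noalias __readonly __aligned(16),
               int M, int N, int ldx) {
   int ridm = get_program_id(0);
   int ridn = get_program_id(1);
-  int rm[TM] = ridm * TM + 0 ... TM;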
-  int rn[TN] = ridn * TN + 0 ... TN;
-  TYPE* px[TM, TN] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
+  int rm[TS0] = ridm * TS0 + 0 ... TS0;
+  int rn[TS1] = ridn * TS1 + 0 ... TS1;
+  TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx;
   TYPE* py[TY] = Y + RY;
   *py = (*px)[RED];
 }
diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc
index 3c3754133..0ee43cbc5 100644
--- a/tests/unit/reduce.cc
+++ b/tests/unit/reduce.cc
@@ -8,76 +8,10 @@
 #include "triton/tools/bench.hpp"
 #include "triton/external/half.hpp"
 #include "triton/runtime/function.h"
-#include "src/reduce.h"
 #include "cuda/cublas.h"
+#include "reduce.h"
 #include "util.h"
 
-namespace drv = triton::driver;
-namespace rt = triton::runtime;
-
-template<class T>
-void reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, size_t axis, const std::vector<int>& shapes) {
-  assert(axis <= shapes.size() - 1);
-  // remove shape at index axis to get outer dimensions
-  std::vector<int> outer = shapes;
-  outer.erase(outer.begin() + axis);
-  // retrieve shape at index axis to get inner dimension
-  int inner = shapes[axis];
-  // accumulation function
-  auto acc = get_accumulator<T>(op);
-  // iterate over outer dimensions
-  _loop_nest(outer, [&](const std::vector<int>& y_idx) {
-    T ret = 0;
-    auto x_idx = y_idx;
-    x_idx.insert(x_idx.begin() + axis, 0);
-    // accumulate over inner dimensions
-    for(int z = 0; z < inner; z++){
-      x_idx[axis] = z;
-      ret = acc(ret, x[offset(x_idx, shapes)]);
-    }
-    y[offset(y_idx, outer)] = ret;
-  });
-}
-
-
-bool do_test(drv::stream* stream, std::vector<int> shape, int axis, reduce_op_t op, int nwarp){
-  typedef float NumericT;
-  std::string ty = "float";
-  size_t dt_nbytes = sizeof(NumericT);
-  drv::context* context = stream->context();
-  size_t axy = (axis == 0) ? 1 : 0;
-  std::string RY = (axis == 0) ? "rn" : "rm";
-  std::vector<NumericT> hy(shape[axy]);
-  std::vector<NumericT> ry(shape[axy]);
-  std::vector<NumericT> hx(shape[0]*shape[1]);
-  srand(0);
-  init_zeros(hy);
-  init_rand(hx);
-  auto dy = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hy.size()*dt_nbytes));
-  auto dx = std::shared_ptr<drv::buffer>(drv::buffer::create(context, hx.size()*dt_nbytes));
-  stream->write(&*dy, true, 0, hy);
-  stream->write(&*dx, true, 0, hx);
-  rt::function::options_space_t opt;
-  opt.defines.push_back({"TYPE", {ty}});
-  opt.defines.push_back({"TM", {std::to_string(shape[0])}});
-  opt.defines.push_back({"TN", {std::to_string(shape[1])}});
-  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
-  opt.defines.push_back({"RY", {RY}});
-  std::string RED = "";
-  for(int n = 0; n < 2; n++){
-    if(n > 0)
-      RED += ", ";
-    RED += (n==axis) ? to_str(op) : ":";
-  }
-  opt.defines.push_back({"RED", {RED}});
-  opt.num_warps = {nwarp};
-  rt::function function(src::reduce2d, opt);
-  function({&*dx, &*dy, shape[0], shape[1], shape[0]}, grid2d(shape[0], shape[1]), stream);
-  stream->synchronize();
-  stream->read(&*dy, true, 0, hy);
-  reduce_nd(ry, hx, op, axis, shape);
-  return testing::diff(hy, ry);
-}
 
 int main() {
   // initialize default compute device
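
For reference, the `RED` define that both the old and the new harness assemble is what drives the kernel's `*py = (*px)[RED]` line: the reduced axis receives the reduction operator and every other axis receives `:` (keep). A standalone sketch of that assembly, assuming `to_str(ADD)` yields `"+"` (`to_str` is a helper from the tests' utilities and is not shown in this patch):

    #include <cstdio>
    #include <string>

    int main() {
      // Build the RED macro the same way the harness does, for both axes.
      for(int axis = 0; axis < 2; axis++) {
        std::string RED = "";
        for(int n = 0; n < 2; n++) {
          if(n > 0)
            RED += ", ";
          RED += (n == axis) ? "+" : ":";  // "+" assumed for to_str(ADD)
        }
        // axis 0 -> "+, :"  => *py = (*px)[+, :]  reduces over rows, one value per column
        // axis 1 -> ":, +"  => *py = (*px)[:, +]  reduces over columns, one value per row
        std::printf("axis %d: RED = \"%s\"\n", axis, RED.c_str());
      }
      return 0;
    }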