From abe3fbb480efb25662d501d220cbab4eebde9995 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Sun, 20 Oct 2019 01:01:53 -0400
Subject: [PATCH] [test] [reduce] added test for 1D reduction

---
 lib/codegen/analysis/axes.cc       |  2 +
 lib/codegen/selection/generator.cc |  5 +-
 lib/ir/instructions.cc             |  6 +--
 lib/lang/ast.cc                    |  7 ++-
 lib/lang/code_gen.cc               |  3 ++
 tests/common/reduce.h              | 85 ++++++++++++++++--------
 tests/common/src/reduce.h          | 25 ++++++---
 tests/unit/reduce.cc               |  1 +
 8 files changed, 79 insertions(+), 55 deletions(-)

diff --git a/lib/codegen/analysis/axes.cc b/lib/codegen/analysis/axes.cc
index c446558a8..6271e224b 100644
--- a/lib/codegen/analysis/axes.cc
+++ b/lib/codegen/analysis/axes.cc
@@ -106,6 +106,8 @@ void axes::update_graph_elementwise(ir::instruction *i) {
 }
 
 void axes::update_graph_no_edge(ir::instruction *i) {
+  if(!i->get_type()->is_tile_ty())
+    return;
   auto rank = i->get_type()->get_tile_rank();
   for(unsigned d = 0; d < rank; d++)
     graph_.add_edge({i, d}, {i, d});
diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc
index d9d1e1cec..d7b9bc6a3 100644
--- a/lib/codegen/selection/generator.cc
+++ b/lib/codegen/selection/generator.cc
@@ -825,13 +825,12 @@ void generator::visit_reduce_inst(ir::reduce_inst* x) {
   }
   tgt_->add_barrier(mod_, *builder_);
   // write back
-  distributed_tile* x_tile = (distributed_tile*)tmap_.at(x);
-  x_tile->for_each([&](indices_t idx) {
+  for_each(x, [&](indices_t idx) {
     indices_t red_idx = idx;
     red_idx.insert(red_idx.begin() + axis, builder_->getInt32(0));
     Value *read_offset = shared_tile::shared_offset(*builder_, stile->get_shapes(), stile->get_perm(), stile->get_order(), red_idx);
     Value *read_ptr = builder_->CreateGEP(base_ptr, read_offset);
-    x_tile->set_value(idx, builder_->CreateLoad(read_ptr));
+    set_value(x, idx, builder_->CreateLoad(read_ptr));
   });
 }
 
diff --git a/lib/ir/instructions.cc b/lib/ir/instructions.cc
index 01f961402..0be815a51 100644
--- a/lib/ir/instructions.cc
+++ b/lib/ir/instructions.cc
@@ -651,10 +651,10 @@ type* reduce_inst::get_res_type(value *arg, unsigned axis) {
   ir::tile_type::tile_shapes_t shapes = arg->get_type()->get_tile_shapes();
   shapes.erase(shapes.begin() + axis);
   type *scalar_ty = arg->get_type()->get_scalar_ty();
-  if(shapes.size() == 0)
+  if(shapes.empty())
+//    shapes.push_back(1);
     return scalar_ty;
-  else
-    return tile_type::get(scalar_ty, shapes);
+  return tile_type::get(scalar_ty, shapes);
 }
 
 reduce_inst::reduce_inst(value *arg, op_t op, unsigned axis, const std::string &name, instruction *next)
diff --git a/lib/lang/ast.cc b/lib/lang/ast.cc
index b0a50adc3..bf0c7e964 100644
--- a/lib/lang/ast.cc
+++ b/lib/lang/ast.cc
@@ -567,7 +567,7 @@ void BinaryOp::AssignOpTypeChecking() {
   // The other constraints are lefted to cast operator
   rhs_ = Expr::MayCast(rhs_, ScalarOrLikeTile(rhs_, lhsScalType));
   type_ = lhs_->Type();
-  Broadcast(this, lhs_, rhs_, type_);
+  rhs_ = UnaryOp::New(Token::CAST, rhs_, type_);
 }
 
 /*
@@ -688,7 +688,10 @@ void UnaryOp::ReduceOpTypeChecking() {
     Error(this, "array expected for reduction operation");
   auto shape = tileType->Shape();
   shape.erase(shape.begin() + ax);
-  type_ = TileType::New(shape, tileType->Derived());
+  if(shape.empty())
+    type_ = tileType->Derived();
+  else
+    type_ = TileType::New(shape, tileType->Derived());
 }
 
 void UnaryOp::TransOpTypeChecking() {
diff --git a/lib/lang/code_gen.cc b/lib/lang/code_gen.cc
index 4bf9d4a2c..aee604b4a 100644
--- a/lib/lang/code_gen.cc
+++ b/lib/lang/code_gen.cc
@@ -467,6 +467,9 @@ ir::value* Generator::GenBroadcastOp(ir::value* src, ir::type* dst_ty) {
       return bld_->create_broadcast(src, dst_shapes);
     }
   }
+  else if(src->get_type()->is_tile_ty() && src->get_type()->get_tile_num_elements() == 1){
+    return bld_->create_downcast(src);
+  }
   return src;
 }
 
diff --git a/tests/common/reduce.h b/tests/common/reduce.h
index 86e066638..ba4e6e470 100644
--- a/tests/common/reduce.h
+++ b/tests/common/reduce.h
@@ -19,6 +19,8 @@ void cc_reduce_nd(std::vector<T> &y, const std::vector<T> &x, reduce_op_t op, si
   // remove shape at index axis to get outer dimensions
   std::vector<int> outer = shapes;
   outer.erase(outer.begin() + axis);
+  if(outer.empty())
+    outer.push_back(1);
   // retrieve shape at index axis to get inner dimension
   int inner = shapes[axis];
   // accumualtion function
@@ -42,7 +44,7 @@ enum run_mode_t {
   BENCH,
   TEST
 };
-void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
+void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape_x,
                       int axis, reduce_op_t op,
                       const std::vector<int>& x_order, const std::vector<int>& y_order,
                       std::vector<std::vector<std::string>> TS,
@@ -53,86 +55,91 @@ void triton_reduce_nd(drv::stream* stream, const std::vector<int>& shape,
   drv::context* context = stream->context();
   size_t axy = (axis == 0) ? 1 : 0;
+  // shape
+  std::vector<int> shape_y = shape_x;
+  shape_y.erase(shape_y.begin() + axis);
+
   // rank
-  size_t rank = shape.size();
+  int rank_x = shape_x.size();
+  int rank_y = shape_y.size();
+
   // size
-  size_t size = 1;
-  for(int32_t d: shape)
-    size *= d;
-  std::vector<std::string> shapename = {"S0", "S1", "S2"};
+  size_t size_x = 1;
+  for(int32_t d: shape_x)
+    size_x *= d;
+  size_t size_y = 1;
+  for(int32_t d: shape_y)
+    size_y *= d;
+  // strides for x
+  std::vector<std::string> x_shapename = {"S0", "S1", "S2"};
   std::vector<std::string> x_strides = {"1"};
-  for(size_t d = 0; d < rank - 1; d++)
-    x_strides.push_back(x_strides[d] + " * " + shapename[x_order[d]]);
+  for(int d = 0; d < rank_x - 1; d++)
+    x_strides.push_back(x_strides[d] + " * " + x_shapename[x_order[d]]);
+  // strides for y
+  std::vector<std::string> y_shapename = x_shapename;
+  y_shapename.erase(y_shapename.begin() + axis);
   std::vector<std::string> y_strides = {"1"};
-  for(size_t d = 0; d < rank - 1; d++)
-    y_strides.push_back(y_strides[d] + " * " + shapename[y_order[d]]);
+  for(int d = 0; d < rank_y - 1; d++)
+    y_strides.push_back(y_strides[d] + " * " + y_shapename[y_order[d]]);
 
-  // create inputs
-  auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
-  auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, size*dtsize));
-  // create options
+  // options
   rt::function::options_space_t opt;
-
-  // type
   opt.defines.push_back({"TYPE", {ty}});
-  // x strides
-  for(size_t d = 0; d < rank; d++)
+  for(int d = 0; d < rank_x; d++)
     opt.defines.push_back({"STRIDE_XS" + std::to_string(x_order[d]), {x_strides[d]}});
-  // y strides
-  for(size_t d = 0; d < rank; d++)
+  for(int d = 0; d < rank_y; d++)
     opt.defines.push_back({"STRIDE_YS" + std::to_string(y_order[d]), {y_strides[d]}});
   if(TS.empty())
-    TS = tile_nd(rank);
-  // tile size
-  for(size_t d = 0; d < rank; d++)
+    TS = tile_nd(rank_x);
+  for(int d = 0; d < rank_x; d++)
     opt.defines.push_back({"TS" + std::to_string(d), TS[d]});
-  // non-reduced axis
-  std::string RY = (axis == 0) ? "rn" : "rm";
-  opt.defines.push_back({"TY", {std::to_string(shape[axy])}});
+  std::string RY = (axis == 0) ? "rs1" : "rs0";
"rs1" : "rs0"; + opt.defines.push_back({"TY", {std::to_string(shape_x[axy])}}); opt.defines.push_back({"RY", {RY}}); - // reduction broadcasting std::string RED = ""; - for(int n = 0; n < 2; n++){ + for(int n = 0; n < rank_x; n++){ if(n > 0) RED += ", "; RED += (n==axis) ? to_str(op) : ":"; } opt.defines.push_back({"RED", {RED}}); - - opt.num_warps = {4}; + opt.num_warps = {1}; // kernel - rt::function function(src::reduce2d, opt); + rt::function function(src::reduce_nd[rank_x - 1], opt); + + // input buffers + auto dx = std::unique_ptr(drv::buffer::create(context, size_x*dtsize)); + auto dy = std::unique_ptr(drv::buffer::create(context, size_y*dtsize)); // grid std::vector args = {&*dx, &*dy}; - for(int32_t d: shape) + for(int32_t d: shape_x) args.push_back(d); - args.push_back(shape[0]); std::vector ts = {"TS0", "TS1", "TS2"}; - auto grid = grid_nd(shape, ts); + auto grid = grid_nd(shape_x, ts); // metrics if(mode == BENCH){ - auto gbps = [&](double ns) { return 2 * size * dtsize / (ns * 1e-9) * 1e-9; }; + auto gbps = [&](double ns) { return 2 * size_x * dtsize / (ns * 1e-9) * 1e-9; }; double triton_ns = triton::tools::bench([&]() { function(args, grid, stream);}, stream); bench.push_back(gbps(triton_ns)); } // test triton if(mode == TEST){ - std::vector hy(shape[axy]); - std::vector ry(shape[axy]); - std::vector hx(shape[0]*shape[1]); + std::vector hy(size_y); + std::vector ry(size_y); + std::vector hx(size_x); init_zeros(hy); init_rand(hx); stream->write(&*dx, true, 0, hx); function(args, grid, stream); stream->synchronize(); stream->read(&*dy, true, 0, hy); - cc_reduce_nd(ry, hx, op, axis, shape); + cc_reduce_nd(ry, hx, op, axis, shape_x); test = testing::diff(hy, ry); } } diff --git a/tests/common/src/reduce.h b/tests/common/src/reduce.h index cc44ca5fc..158e68567 100644 --- a/tests/common/src/reduce.h +++ b/tests/common/src/reduce.h @@ -4,7 +4,11 @@ namespace src { R"( void reduce1d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __readonly __aligned(16), - int N) { + int S0) { + int pid0 = get_program_id(0); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + TYPE* px[TS0] = X + rs0; + *Y = (*px)[RED]; } )"; @@ -13,15 +17,20 @@ void reduce1d(TYPE * X __noalias __readonly __aligned(16), R"( void reduce2d(TYPE * X __noalias __readonly __aligned(16), TYPE * Y __noalias __writeonly __aligned(16), - int M, int N, int ldx) { - int ridm = get_program_id(0); - int ridn = get_program_id(1); - int rm[TS0] = ridm * TS0 + 0 ... TS0; - int rn[TS1] = ridn * TS1 + 0 ... TS1; - TYPE* px[TS0, TS1] = X + rm[:, newaxis] + rn[newaxis, :] * ldx; - TYPE* py[TY] = Y + RY; + int S0, int S1) { + int pid0 = get_program_id(0); + int pid1 = get_program_id(1); + int rs0[TS0] = pid0 * TS0 + 0 ... TS0; + int rs1[TS1] = pid1 * TS1 + 0 ... TS1; + TYPE* px[TS0, TS1] = X + rs0[:, newaxis] * STRIDE_XS0 + + rs1[newaxis, :] * STRIDE_XS1; + TYPE* py[TY] = Y + RY * STRIDE_YS0; *py = (*px)[RED]; } )"; + + const char* reduce_nd[] = {reduce1d, reduce2d}; + + } diff --git a/tests/unit/reduce.cc b/tests/unit/reduce.cc index 0ee43cbc5..5a311686c 100644 --- a/tests/unit/reduce.cc +++ b/tests/unit/reduce.cc @@ -20,6 +20,7 @@ int main() { // shapes to benchmark typedef std::tuple, int, reduce_op_t> config_t; std::vector configs = { + config_t{{32}, 0, ADD}, config_t{{32, 32}, 0, MAX}, config_t{{32, 32}, 1, ADD}, config_t{{32, 64}, 0, ADD},