From 0970fe12dd9f8d4f8b1a3cd953905f18358db179 Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Sun, 18 Aug 2019 15:39:36 -0700 Subject: [PATCH] [general] cleaned tensorflow source code generation --- .../codegen/analysis/shmem/allocation.h | 6 +- include/triton/codegen/analysis/tune.h | 5 +- include/triton/codegen/selection/selection.h | 6 +- .../triton/codegen/transform/reassociate.h | 6 +- include/triton/codegen/transform/vectorize.h | 6 +- include/triton/runtime/function.h | 2 +- lib/codegen/analysis/shmem/allocation.cpp | 4 +- lib/codegen/analysis/tune.cpp | 26 +- lib/codegen/selection/selection.cpp | 4 +- lib/codegen/transform/reassociate.cpp | 2 +- lib/runtime/function.cpp | 4 +- python/src/tensorflow.cpp | 243 ++++++++++-------- 12 files changed, 162 insertions(+), 152 deletions(-) diff --git a/include/triton/codegen/analysis/shmem/allocation.h b/include/triton/codegen/analysis/shmem/allocation.h index 024c3cf68..243d78352 100644 --- a/include/triton/codegen/analysis/shmem/allocation.h +++ b/include/triton/codegen/analysis/shmem/allocation.h @@ -15,7 +15,7 @@ namespace ir{ namespace codegen{ namespace analysis{ -class tune; +class grids; namespace shmem{ @@ -24,7 +24,7 @@ class info; class allocation { public: - allocation(liveness *live, info *buffer_info, tune *params) + allocation(liveness *live, info *buffer_info, grids *params) : liveness_(live), buffer_info_(buffer_info), params_(params){ } // utilities @@ -45,7 +45,7 @@ private: // dependences liveness *liveness_; info *buffer_info_; - tune *params_; + grids *params_; }; } diff --git a/include/triton/codegen/analysis/tune.h b/include/triton/codegen/analysis/tune.h index 373b20c03..26331c786 100644 --- a/include/triton/codegen/analysis/tune.h +++ b/include/triton/codegen/analysis/tune.h @@ -19,7 +19,7 @@ namespace ir{ namespace codegen{ namespace analysis{ -class tune { +class grids { typedef std::pair node_t; typedef std::map > graph_t; @@ -41,12 +41,11 @@ private: public: - tune(size_t num_warps); + grids(size_t num_warps); ir::metaparameter* get_param(ir::value *value, const std::string &key) { return params_[value][key]; } unsigned get_param_group(ir::value *value, unsigned ax); fragment_t get_fragment(ir::value *value, unsigned ax) { return fragments_.at({value, ax}); } void copy(ir::value *dst, ir::value *src); - bool check_constraints(std::map> &errors); void run(ir::module &mod); unsigned get_num_threads(); diff --git a/include/triton/codegen/selection/selection.h b/include/triton/codegen/selection/selection.h index 3f118d47a..2610fefc3 100644 --- a/include/triton/codegen/selection/selection.h +++ b/include/triton/codegen/selection/selection.h @@ -44,7 +44,7 @@ namespace codegen{ namespace analysis{ -class tune; +class grids; class alignment_info; namespace shmem{ @@ -196,7 +196,7 @@ private: public: - selection(analysis::shmem::allocation *alloc, analysis::tune *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) + selection(analysis::shmem::allocation *alloc, analysis::grids *params, analysis::shmem::info *buffer_info, analysis::alignment_info *alignment, target *tgt) : alloc_(alloc), params_(params), buffer_info_(buffer_info), alignment_(alignment), tgt_(tgt){ } void run(ir::module &src, Module &dst); @@ -205,7 +205,7 @@ private: vmap_t vmap_; tmap_t tmap_; analysis::shmem::allocation *alloc_; - analysis::tune *params_; + analysis::grids *params_; analysis::shmem::info *buffer_info_; analysis::alignment_info *alignment_; target *tgt_; diff --git a/include/triton/codegen/transform/reassociate.h b/include/triton/codegen/transform/reassociate.h index ce7ab476a..f7b843846 100644 --- a/include/triton/codegen/transform/reassociate.h +++ b/include/triton/codegen/transform/reassociate.h @@ -19,7 +19,7 @@ class getelementptr_inst; namespace codegen{ namespace analysis{ -class tune; +class grids; class alignment_info; } @@ -37,11 +37,11 @@ private: ir::value *reassociate_ptr(ir::getelementptr_inst* pz, ir::builder &builder, std::map &offsets); public: - reassociate(analysis::tune *params); + reassociate(analysis::grids *params); void run(ir::module& module); private: - analysis::tune* params_; + analysis::grids* params_; }; } diff --git a/include/triton/codegen/transform/vectorize.h b/include/triton/codegen/transform/vectorize.h index 09fb48000..bf08eb46f 100644 --- a/include/triton/codegen/transform/vectorize.h +++ b/include/triton/codegen/transform/vectorize.h @@ -10,18 +10,18 @@ namespace ir { namespace codegen{ namespace analysis{ - class tune; + class grids; } namespace transform{ class vectorize { public: - vectorize(analysis::tune *params): params_(params){} + vectorize(analysis::grids *params): params_(params){} void run(ir::module &mod); private: - analysis::tune *params_; + analysis::grids *params_; }; } diff --git a/include/triton/runtime/function.h b/include/triton/runtime/function.h index af849448b..2880a4e54 100644 --- a/include/triton/runtime/function.h +++ b/include/triton/runtime/function.h @@ -42,7 +42,7 @@ class translation_unit; namespace codegen{ namespace analysis{ -class tune; +class grids; } } diff --git a/lib/codegen/analysis/shmem/allocation.cpp b/lib/codegen/analysis/shmem/allocation.cpp index ead6143b3..00e90d4a6 100644 --- a/lib/codegen/analysis/shmem/allocation.cpp +++ b/lib/codegen/analysis/shmem/allocation.cpp @@ -21,7 +21,7 @@ unsigned allocation::is_ld_padded(ir::value *x) { } for(ir::user* user: x->get_users()) if(auto dot = dynamic_cast(user)){ - bool is_hmma = params_->get_fragment(user, 0) == tune::HMMA_FRAGMENT_C; + bool is_hmma = params_->get_fragment(user, 0) == grids::HMMA_FRAGMENT_C; bool is_op_0 = x == dot->get_operand(0); bool is_op_1 = x == dot->get_operand(1); if(is_hmma && is_op_0){ @@ -57,7 +57,7 @@ unsigned allocation::get_num_bytes(ir::value *x) { for(auto x: shapes) num_elements *= x->get_value(); size_t depth; - if(params_->get_fragment(x, 0) == tune::HMMA_FRAGMENT_C) + if(params_->get_fragment(x, 0) == grids::HMMA_FRAGMENT_C) depth = params_->get_param(op, "wpt.d" + std::to_string(axis))->get_value(); else depth = params_->get_param(op, "mts.d" + std::to_string(axis))->get_value(); diff --git a/lib/codegen/analysis/tune.cpp b/lib/codegen/analysis/tune.cpp index 2f00d0eb6..9e6c499a2 100644 --- a/lib/codegen/analysis/tune.cpp +++ b/lib/codegen/analysis/tune.cpp @@ -15,7 +15,7 @@ namespace triton{ namespace codegen{ namespace analysis{ -tune::tune(size_t num_warps): num_warps_(num_warps){ +grids::grids(size_t num_warps): num_warps_(num_warps){ } bool is_hmma(ir::value *v){ @@ -32,14 +32,14 @@ bool is_hmma(ir::value *v){ return result; } -void tune::add_constraint(node_t x, node_t y) { +void grids::add_constraint(node_t x, node_t y) { dependencies_[x].insert(y); dependencies_[y].insert(x); nodes_.insert(x); nodes_.insert(y); } -void tune::init_c_phi(ir::instruction *v) { +void grids::init_c_phi(ir::instruction *v) { // Phi Nodes: all the incoming value share the result layout if(auto *phi = dynamic_cast(v)) for(ir::value *op: phi->ops()) @@ -50,7 +50,7 @@ void tune::init_c_phi(ir::instruction *v) { } } -void tune::init_c_graph(ir::instruction *v) { +void grids::init_c_graph(ir::instruction *v) { // Reference shape ir::type::tile_shapes_t::value_type one = ir::tile_type::make_one(v->get_parent()->get_context()); ir::type::tile_shapes_t shapes; @@ -142,7 +142,7 @@ void tune::init_c_graph(ir::instruction *v) { } } -tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ +grids::fragment_t grids::get_fragmentation_type(node_t x, graph_t &graph){ std::list work; std::set seen; work.push_back(x); @@ -160,7 +160,7 @@ tune::fragment_t tune::get_fragmentation_type(node_t x, graph_t &graph){ return STRIDED_SCAN; } -void tune::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { +void grids::connected_components(node_t x, const std::vector mps, const std::vector prefixes, std::set &nodes, graph_t &graph, unsigned group_id) { // std::cout << "connected component: " << x.first->get_name() << " " << x.second << std::endl; groups_[x.first].insert({x.second, group_id}); if(nodes.find(x) != nodes.end()){ @@ -183,20 +183,20 @@ void tune::connected_components(node_t x, const std::vector } } -unsigned tune::get_param_group(ir::value *value, unsigned ax) { +unsigned grids::get_param_group(ir::value *value, unsigned ax) { unsigned result = groups_.at(value).at(ax); return result; } //TODO: This shouldn't exist! -void tune::copy(ir::value *dst, ir::value *src) { +void grids::copy(ir::value *dst, ir::value *src) { params_[dst] = params_[src]; groups_[dst] = groups_[src]; fragments_[{dst, 0}] = fragments_[{src, 0}]; } -void tune::run(ir::module &mod) { +void grids::run(ir::module &mod) { ir::context &ctx = mod.get_context(); // Create metaparameters for(ir::function *fn: mod.get_function_list()){ @@ -318,7 +318,7 @@ void tune::run(ir::module &mod) { } -void tune::create_grids(std::vector &grids, +void grids::create_grids(std::vector &grids, std::map &references, ir::function *fn) { // get number of dimensions greater than 1 @@ -363,11 +363,7 @@ void tune::create_grids(std::vector &grids, } -bool tune::check_constraints(std::map> &errors) { - return errors.empty(); -} - -unsigned tune::get_num_threads() { +unsigned grids::get_num_threads() { return num_warps_*32; } diff --git a/lib/codegen/selection/selection.cpp b/lib/codegen/selection/selection.cpp index 4b31dce52..99b18e568 100644 --- a/lib/codegen/selection/selection.cpp +++ b/lib/codegen/selection/selection.cpp @@ -573,7 +573,7 @@ inline void to_warps(const std::vector &bs, std::vector &nw, void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id, Value *u_warp_id) { const auto& shapes = v->get_type()->get_tile_shapes(); size_t dim = shapes.size(); - if(params_->get_fragment(v, 0) == analysis::tune::STRIDED_SCAN){ + if(params_->get_fragment(v, 0) == analysis::grids::STRIDED_SCAN){ std::vector contiguous(dim); std::vector block_size(dim); std::vector warp_size(dim); @@ -1278,7 +1278,7 @@ void selection::lower_dot(ir::dot_inst *dot, LLVMContext &ctx, Function *fn, IRB if(NK != 1) { shared_tile *TA = (shared_tile*)tmap_.at(A); shared_tile *TB = (shared_tile*)tmap_.at(B); - if(params_->get_fragment(dot, 0) == analysis::tune::STRIDED_SCAN) + if(params_->get_fragment(dot, 0) == analysis::grids::STRIDED_SCAN) lower_scanline_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK, c_ty, f_mul_add); else lower_hmma_dot(dot, ctx, fn, builder, TC, TA, TB, TD, NK); diff --git a/lib/codegen/transform/reassociate.cpp b/lib/codegen/transform/reassociate.cpp index c411ccf12..c5e76f18a 100644 --- a/lib/codegen/transform/reassociate.cpp +++ b/lib/codegen/transform/reassociate.cpp @@ -155,7 +155,7 @@ ir::value *reassociate::reassociate_idx(ir::value *old_value, return new_value; } -reassociate::reassociate(analysis::tune* params) +reassociate::reassociate(analysis::grids* params) : params_(params) { } diff --git a/lib/runtime/function.cpp b/lib/runtime/function.cpp index 250e53243..1e7de730b 100644 --- a/lib/runtime/function.cpp +++ b/lib/runtime/function.cpp @@ -147,7 +147,7 @@ options function::autotune(lang::translation_unit *ast, driver::stream* stream, double ts; std::vector params; }; - profile_t best = { INFINITY }; + profile_t best = { INFINITY, {} }; std::function)> benchmark = [&](std::vector params) { // options @@ -184,7 +184,7 @@ std::unique_ptr function::make_bin(ir::module &module, driver::c if(auto* mp = dynamic_cast(module.globals().at(x.first))) mp->set_value(x.second); // create passes - codegen::analysis::tune tune(opt.num_warps); + codegen::analysis::grids tune(opt.num_warps); codegen::analysis::shmem::info shmem_info; codegen::analysis::shmem::liveness shmem_liveness(&shmem_info); codegen::analysis::shmem::allocation shmem_allocation(&shmem_liveness, &shmem_info, &tune); diff --git a/python/src/tensorflow.cpp b/python/src/tensorflow.cpp index 0e98f6636..ef7de24ff 100644 --- a/python/src/tensorflow.cpp +++ b/python/src/tensorflow.cpp @@ -74,49 +74,118 @@ inline std::unique_ptr make_ir(ir::context& ctx, triton::lang::trans return std::unique_ptr(module); } + +void gen_extract_inputs(std::ostream &os, const std::vector& args) { + for(unsigned i = 0; i < args.size(); i++){ + ir::value *arg = args[i]; + std::string suffix = ""; + ir::type *tr_ty = arg->get_type(); + std::string tf_ty = ref_to_tf_ty(tr_ty); + if(!tr_ty->is_pointer_ty()) + suffix = ".scalar<" + tf_ty + ">()()"; + os << " " << tf_ty << " " << arg->get_name() << " = context->input(" << i << ")" << suffix << ";\n "; + } +} + +void gen_set_outputs(std::ostream &os, const std::vector& outputs) { + for(unsigned i = 0; i < outputs.size(); i++) + os << " context->set_output(" << i << ", " << outputs[i] << ");\n "; +} + +void gen_make_handles(std::ostream &os, const std::vector& args) { + for(unsigned i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + if(!arg->get_type()->is_pointer_ty()) + continue; + const std::string& name = arg->get_name(); + os << " drv::cu_buffer cu_" + name + "(ctx, " + name + ".tensor_data().size(), (CUdeviceptr)" + name + ".tensor_data().data(), false);\n "; + } +} + +void gen_make_spmd_grid(std::ostream &os, const std::vector& macros) { + std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); + std::vector grids = macros; + for(size_t i = grids.size(); i < 3; i++) + grids.push_back("1"); + std::string grid = "rt::grid_t{"; + for(size_t i = 0; i < grids.size(); i++){ + if(i > 0) + grid += ", "; + grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); + } + grid += "}"; + + os << " auto grid = [&](const rt::params_t& x) { return " << grid << "; };\n "; +} + +void gen_make_launch_function(std::ostream &os, const std::vector& args) { + os << " fn_({"; + for(unsigned i = 0; i < args.size() ; i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + if(arg->get_type()->is_pointer_ty()) + name = "&cu_" + name; + if(i > 0) + os << ", "; + os << name; + } + os << "}, grid, stream); \n"; +} + +void gen_register_kernel_builder(std::ostream &os, const std::string &name, + const std::string &classname, + const std::vector& args){ + os << "REGISTER_KERNEL_BUILDER(Name(\"" + name + "\").Device(DEVICE_GPU)"; + for(size_t i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + auto tolower = [](char c) { return std::tolower(c);}; + std::transform(name.begin(), name.end(), name.begin(), tolower); + if(!arg->get_type()->is_pointer_ty()) + os << ".HostMemory(\"" + name + "\")"; + } + os << ", " + classname << ");\n"; +} + +void gen_register_op(std::ostream &os, const std::string &name, + const std::vector& args, + const std::vector& outputs){ + os << "REGISTER_OP(\"" << name << "\")\n"; + for(size_t i = 0; i < args.size(); i++){ + ir::argument *arg = args[i]; + std::string name = arg->get_name(); + auto tolower = [](char c) { return std::tolower(c);}; + std::transform(name.begin(), name.end(), name.begin(), tolower); + os << " .Input(\"" << name << ": " << to_tf_scalar_ty(arg->get_type()) << "\")\n"; + } + for(size_t i = 0; i < outputs.size(); i++){ + std::string name = outputs[i]; + size_t idx; + for(idx = 0; idx < args.size(); idx++) + if(args[idx]->get_name() == name) + break; + if(idx == args.size()) + throw std::runtime_error("unknown output"); + os << " .Output(\"out" << i << ": " << to_tf_scalar_ty(args[idx]->get_type()) << "\")\n"; + } + os << ";\n"; +} + std::string make_tensorflow_src(const std::string src, const std::vector& outputs, const std::vector& macros) { triton::lang::translation_unit *ast = make_ast(src.c_str()); triton::ir::context context; std::unique_ptr ir = make_ir(context, ast); - // extract function signature + // function ir::function* fn = ir->get_function_list().front(); - ir::function_type* fn_ty = fn->get_fn_type(); - // numberof arguments - size_t n_args = fn_ty->get_num_params(); - size_t n_outputs = outputs.size(); - // extract function name std::string name = fn->get_name(); name[0] = static_cast(std::toupper(name[0])); std::string classname = name + "Op"; - // extract argument name - std::vector arg_names; - for(ir::argument *arg: fn->args()) - arg_names.push_back(arg->get_name()); - // cached int to str - std::vector str_i; - for(size_t i = 0; i < fn_ty->get_num_params(); i++) - str_i.push_back(std::to_string(i)); - // index of tensors - std::vector ptr_idx; - for(unsigned i = 0; i < fn_ty->get_num_params(); i++) - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - ptr_idx.push_back(i); - // extract tensorflow types - std::vector tf_scalar_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_scalar_tys), to_tf_scalar_ty); - std::vector tf_cref_tys; - std::transform(fn_ty->params_begin(), fn_ty->params_end(), std::back_inserter(tf_cref_tys), ref_to_tf_ty); - // output indices - std::vector out_idx; - for(const std::string &name : outputs){ - auto it = std::find(arg_names.begin(), arg_names.end(), name); - out_idx.push_back(std::distance(arg_names.begin(), it)); - } + std::ostringstream oss; - std::string result = R"( + oss << R"( #include "triton/driver/buffer.h" #include "triton/driver/backend.h" #include "triton/driver/stream.h" @@ -138,106 +207,52 @@ namespace drv = triton::driver; std::string src = R"TTKERNSRC( )" + src + ")TTKERNSRC\";" + R"( -class )" + classname + R"(: public OpKernel { +class )" << classname << R"(: public OpKernel { public: - explicit )" + classname + R"((OpKernelConstruction* context) + explicit )" << classname << R"((OpKernelConstruction* context) : OpKernel(context), fn_(src) { } void Compute(OpKernelContext* context){ - // get device/stream GPUDevice device = context->eigen_device(); drv::cu_stream sstream(device.stream(), false); drv::context* ctx = sstream.context(); drv::stream* stream = &sstream; - - // extract inputs)"; -for(unsigned i = 0; i < n_args; i++){ - std::string suffix = ""; - std::string ty = tf_cref_tys[i]; - if(!fn_ty->get_param_ty(i)->is_pointer_ty()) - suffix = ".scalar<" + ty + ">()()"; - result += R"( - )" + ty + " " + arg_names[i] + " = context->input(" + str_i[i] + ")" + suffix + ";"; -} - -result += R"( - - // extract outputs)"; -for(unsigned i = 0; i < n_outputs; i++) - result += R"( - context->set_output()" + str_i[i] + ", " + outputs[i] + ");"; - -result += R"( - - // wrap tensors)"; -for(size_t i: ptr_idx) -result += R"( - drv::cu_buffer cu_)" + arg_names[i] + "(ctx, " + arg_names[i] + ".tensor_data().size(), (CUdeviceptr)" + arg_names[i] + R"(.tensor_data().data(), false);)"; - - -std::regex regex("#([a-zA-Z]([a-zA-Z]|[0-9])*)"); -std::vector grids = macros; -for(size_t i = grids.size(); i < 3; i++) - grids.push_back("1"); -std::string grid = "rt::grid_t{"; -for(size_t i = 0; i < grids.size(); i++){ - if(i > 0) - grid += ", "; - grid += std::regex_replace(grids[i], regex, "x.at(\"$1\")"); -} -grid += "}"; - -result += R"( - - // create launch grid; - auto grid = [&](const rt::params_t& x) { return )" + grid + R"(; };)"; - -result += R"( - - // execute function - fn_({ + // extract inputs )"; -for(unsigned i = 0; i < n_args; i++){ - std::string arg = arg_names[i]; - if(fn_ty->get_param_ty(i)->is_pointer_ty()) - arg = "&cu_" + arg; - if(i > 0) - result += ", "; - result += arg; -} -result += R"( - }, grid, stream); - +gen_extract_inputs(oss, fn->args()); +oss << R"( + // set outputs + )"; +gen_set_outputs(oss, outputs); +oss << R"( + // wrap tensors + )"; +gen_make_handles(oss, fn->args()); +oss << R"( + // create spmd grid + )"; +gen_make_spmd_grid(oss, macros); +oss << R"( + // launch function + )"; +gen_make_launch_function(oss, fn->args()); +oss << R"( } private: rt::function fn_; }; -REGISTER_KERNEL_BUILDER(Name(")" + name + "\").Device(DEVICE_GPU)"; -for(size_t i = 0; i < tf_scalar_tys.size(); i++){ - std::string arg_name = arg_names[i]; - std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); - if(!fn_ty->get_param_ty(i)->is_pointer_ty()) - result += ".HostMemory(\"" + arg_name + "\")"; -} -result += ", " + classname + R"(); +// register kernel builder +)"; +gen_register_kernel_builder(oss, name, classname, fn->args()); +oss << R"( +// register op +)"; +gen_register_op(oss, name, fn->args(), outputs); - -REGISTER_OP(")" + name + "\")\n"; -for(size_t i = 0; i < tf_scalar_tys.size(); i++){ - std::string arg_name = arg_names[i]; - std::transform(arg_name.begin(), arg_name.end(), arg_name.begin(), [](char c) { return std::tolower(c);}); - result += " .Input(\"" + arg_name + ": " + tf_scalar_tys[i] + "\")\n"; -} -for(size_t i = 0; i < outputs.size(); i++){ - result += " .Output(\"out" + std::to_string(i) + ": " + tf_scalar_tys[out_idx[i]] + "\")\n"; -} -result += ";\n"; - - - return result; + return oss.str(); }