diff --git a/include/atidlas/backend/templates/base.h b/include/atidlas/backend/templates/base.h index 29393189c..9c51060a7 100644 --- a/include/atidlas/backend/templates/base.h +++ b/include/atidlas/backend/templates/base.h @@ -11,7 +11,7 @@ #include "atidlas/backend/stream.h" #include "atidlas/cl_ext/lazy_compiler.h" #include "atidlas/symbolic/expression.h" - +#include "atidlas/tools/to_string.hpp" namespace atidlas { @@ -119,8 +119,34 @@ protected: static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size); - static void element_wise_loop_1D(kernel_generation_stream & stream, loop_body_base const & loop_body, - fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size); + + template + static void element_wise_loop_1D(kernel_generation_stream & stream, fetching_policy_type fetch, unsigned int simd_width, + std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size, Fun const & generate_body) + { + std::string strwidth = tools::to_string(simd_width); + std::string boundround = bound + "/" + strwidth; + + std::string init, upper_bound, inc; + fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size); + stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + generate_body(simd_width); + stream.dec_tab(); + stream << "}" << std::endl; + + if (simd_width>1) + { + stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + generate_body(1); + stream.dec_tab(); + stream << "}" << std::endl; + } + } + static void compute_reduction(kernel_generation_stream & os, std::string acc, std::string cur, op_element const & op); static void compute_index_reduction(kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, op_element const & op); static void process_all(std::string const & type_key, std::string const & str, diff --git a/include/atidlas/backend/templates/mreduction.h b/include/atidlas/backend/templates/mreduction.h index c98a6ae8f..a4889ee55 100644 --- a/include/atidlas/backend/templates/mreduction.h +++ b/include/atidlas/backend/templates/mreduction.h @@ -15,6 +15,7 @@ struct mreduction_parameters : public base::parameters_type unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, fetching_policy_type _fetch_policy); unsigned int num_groups_0; + unsigned int num_groups_1; fetching_policy_type fetch_policy; }; diff --git a/lib/backend/templates/base.cpp b/lib/backend/templates/base.cpp index 96d579b66..427da6b91 100644 --- a/lib/backend/templates/base.cpp +++ b/lib/backend/templates/base.cpp @@ -407,32 +407,6 @@ std::pair base::matrix_size(array_expression::node const & node) return std::make_pair(node.lhs.array.shape1,node.lhs.array.shape2); } -void base::element_wise_loop_1D(kernel_generation_stream & stream, loop_body_base const & loop_body, - fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size) -{ - std::string strwidth = tools::to_string(simd_width); - std::string boundround = bound + "/" + strwidth; - - std::string init, upper_bound, inc; - fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size); - stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - loop_body(stream, simd_width); - stream.dec_tab(); - stream << "}" << std::endl; - - if (simd_width>1) - { - stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl; - stream << "{" << std::endl; - stream.inc_tab(); - loop_body(stream, 1); - stream.dec_tab(); - stream << "}" << std::endl; - } -} - bool base::is_reduction(array_expression::node const & node) { return node.op.type_family==OPERATOR_VECTOR_REDUCTION_TYPE_FAMILY diff --git a/lib/backend/templates/mreduction.cpp b/lib/backend/templates/mreduction.cpp index 2303296ad..60629e0e7 100644 --- a/lib/backend/templates/mreduction.cpp +++ b/lib/backend/templates/mreduction.cpp @@ -11,7 +11,7 @@ namespace atidlas mreduction_parameters::mreduction_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, fetching_policy_type _fetch_policy): base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), -num_groups_0(_num_groups_0), fetch_policy(_fetch_policy) { } +num_groups_0(_num_groups_0), num_groups_1(2), fetch_policy(_fetch_policy) { } int mreduction::check_invalid_impl(cl::Device const &, expressions_tuple const &) const @@ -29,29 +29,40 @@ unsigned int mreduction::lmem_usage() const std::string mreduction::generate_impl(unsigned int label, expressions_tuple const & expressions, std::vector const & mappings, unsigned int simd_width, std::vector const & exprs) const { using tools::to_string; - - unsigned int lsize0 = p_.local_size_0; - unsigned int lsize1 = p_.local_size_1+1; - std::string lsize1str = to_string(lsize1); + unsigned int local_size_1_ld = p_.local_size_1+1; + std::string local_size_1_ld_str = to_string(local_size_1_ld); kernel_generation_stream stream; char kprefix[10]; fill_kernel_name(kprefix, label, "d"); + std::string arguments = "unsigned int M, unsigned int N, " ; + for (const auto & e : exprs) + { + std::string numeric_type = numeric_type_to_string(lhs_most(e->array_expression().tree(), e->array_expression().root()).lhs.dtype); + if (e->is_index_reduction()) + { + arguments += e->process("__global unsigned int* #name_temp, "); + arguments += e->process("__global " + to_string(numeric_type) + "* #name_temp_value,"); + } + else + arguments += e->process("__global " + to_string(numeric_type) + "* #name_temp, "); + } + stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; - stream << "__kernel void " << kprefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, expressions) << ")" << std::endl; + stream << "__kernel void " << kprefix << "0(" << arguments << generate_arguments("#scalartype", mappings, expressions) << ")" << std::endl; stream << "{" << std::endl; stream.inc_tab(); process(stream, PARENT_NODE_TYPE, - tools::make_map >("array0", "#scalartype #namereg = #pointer[#start];") - ("array1", "#pointer += #start;") - ("array2", "#pointer += #start1 + #start2*#ld; " - "#ld *= #nldstride; "), expressions, mappings); + {{"array0", "#scalartype #namereg = #pointer[#start];"}, + {"array1", "#pointer += #start;"}, + {"array2", "#pointer += #start1 + #start2*#ld; " + "#ld *= #nldstride; "}}, expressions, mappings); - for (const auto & expr : exprs) - stream << (expr)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl; + for (const auto & e : exprs) + stream << e->process("__local #scalartype #name_buf[" + to_string(p_.local_size_0*local_size_1_ld) + "];") << std::endl; stream << "unsigned int lid0 = get_local_id(0);" << std::endl; stream << "unsigned int lid1 = get_local_id(1);" << std::endl; @@ -59,75 +70,58 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl; stream.inc_tab(); - for (const auto & expr : exprs) - stream << (expr)->process("#scalartype #name_acc = " + neutral_element((expr)->root_op()) + ";") << std::endl; + for (const auto & e : exprs) + stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op()) + ";") << std::endl; stream << "if (r < M)" << std::endl; stream << "{" << std::endl; stream.inc_tab(); - class loop_body : public loop_body_base + element_wise_loop_1D(stream, p_.fetch_policy, simd_width, "c", "N", "get_global_id(1)", "get_global_size(1)", [&](unsigned int simd_width) { - public: - loop_body(std::vector const & _exprs, reduction_type _reduction_type) : exprs(_exprs), reduction(_reduction_type){ } + std::string data_type = append_width("#scalartype",simd_width); - void operator()(kernel_generation_stream & stream, unsigned int simd_width) const + for (const auto & e : exprs) { - std::string data_type = append_width("#scalartype",simd_width); - - for (const auto & elem : exprs) + std::map accessors; + if(reduction_type_==REDUCE_COLUMNS) { - std::map accessors; - if(reduction==REDUCE_COLUMNS) - { - accessors["array2"] = data_type + " #namereg = " + vload(simd_width, "c*#stride1", "#pointer + r*#ld")+";"; - accessors["repeat"] = data_type + " #namereg = " + vload(simd_width, "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ")+";"; - } - else - { - accessors["array2"] = "#scalartype #namereg = #pointer[r*#stride1 + c*#ld];"; - accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};"; - } - (elem)->process_recursive(stream, PARENT_NODE_TYPE, accessors); + accessors["array2"] = data_type + " #namereg = " + vload(simd_width, "c*#stride1", "#pointer + r*#ld")+";"; + accessors["repeat"] = data_type + " #namereg = " + vload(simd_width, "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ")+";"; } - - - //Update accumulators - std::vector str(simd_width); - if (simd_width==1) - str[0] = "#namereg"; else - for (unsigned int a = 0; a < simd_width; ++a) - str[a] = append_simd_suffix("#namereg.s",a); - - - for (auto & elem : exprs) { - for (unsigned int a = 0; a < simd_width; ++a) - { - std::map accessors; - accessors["array2"] = str[a]; - accessors["repeat"] = str[a]; - accessors["array0"] = "#namereg"; - std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors); - if (elem->is_index_reduction()) - compute_index_reduction(stream, elem->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), elem->process("#name_acc_value"), value,elem->root_op()); - else - compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op()); - } + accessors["array2"] = "#scalartype #namereg = #pointer[r*#stride1 + c*#ld];"; + accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};"; } + e->process_recursive(stream, PARENT_NODE_TYPE, accessors); } - private: - std::vector exprs; - reduction_type reduction; - }; - element_wise_loop_1D(stream, loop_body(exprs, reduction_type_), p_.fetch_policy, simd_width, "c", "N", "get_local_id(1)", "get_local_size(1)"); + + //Update accumulators + std::vector str(simd_width); + if (simd_width==1) + str[0] = "#namereg"; + else + for (unsigned int a = 0; a < simd_width; ++a) + str[a] = append_simd_suffix("#namereg.s",a); + + + for (auto & elem : exprs) + for (unsigned int a = 0; a < simd_width; ++a) + { + std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, {{"array2", str[a]}, {"repeat", str[a]}, {"array0", "#namereg"}}); + if (elem->is_index_reduction()) + compute_index_reduction(stream, elem->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), elem->process("#name_acc_value"), value, elem->root_op()); + else + compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op()); + } + }); stream.dec_tab(); stream << "}" << std::endl; for (auto & expr : exprs) - stream << expr->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl; + stream << expr->process("#name_buf[lid0*" + local_size_1_ld_str + "+ lid1] = #name_acc;") << std::endl; stream << "#pragma unroll" << std::endl; stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl; @@ -139,13 +133,13 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons stream << "{" << std::endl; stream.inc_tab(); - for (auto & expr : exprs) - if (expr->is_index_reduction()) - compute_index_reduction(stream, expr->process("#name_buf[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]") - , expr->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"), - expr->root_op()); + for (auto & e : exprs) + if (e->is_index_reduction()) + compute_index_reduction(stream, e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1 + stride]") + , e->process("#name_buf_value[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf_value[lid0*" + local_size_1_ld_str + " + lid1 + stride]") + , e->root_op()); else - compute_reduction(stream,expr->process("#name_buf[lid0*" + lsize1str + " + lid1]"), expr->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), expr->root_op()); + compute_reduction(stream,e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1 + stride]"), e->root_op()); stream.dec_tab(); stream << "}" << std::endl; @@ -157,10 +151,104 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons stream << "if (lid1 == 0 && r < M)"; stream << "{" << std::endl; stream.inc_tab(); + + for (mapped_reduction const * e : exprs) + { + if (e->is_index_reduction()) + stream << e->process("#name_temp_value[r + M*get_group_id(1)] = #name_buf_value[lid0*" + local_size_1_ld_str + "];") << std::endl; + stream << e->process("#name_temp[r + M*get_group_id(1)] = #name_buf[lid0*" + local_size_1_ld_str + "];") << std::endl; + } + stream.dec_tab(); + stream << "}" << std::endl; + + + stream.dec_tab(); + stream << "}" << std::endl; + + stream.dec_tab(); + stream << "}" << std::endl; + + ///////////////////////////////////////// + ////////////// Kernel 2 + //////////////////////////////////////// + + stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; + stream << "__kernel void " << kprefix << "1(" << arguments << generate_arguments("#scalartype", mappings, expressions) << ")" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + + process(stream, PARENT_NODE_TYPE, + {{"array0", "#scalartype #namereg = #pointer[#start];"}, + {"array1", "#pointer += #start;"}, + {"array2", "#pointer += #start1 + #start2*#ld; " + "#ld *= #nldstride; "}}, expressions, mappings); + + for (const auto & e : exprs) + stream << e->process("__local #scalartype #name_buf[" + to_string(p_.local_size_0*local_size_1_ld) + "];") << std::endl; + + stream << "unsigned int lid0 = get_local_id(0);" << std::endl; + stream << "unsigned int lid1 = get_local_id(1);" << std::endl; + stream << "unsigned int upper_bound_0 = ( M +" << p_.local_size_0 - 1 << ")/" << p_.local_size_0 << "*" << p_.local_size_0 << ";" << std::endl; + stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl; + stream.inc_tab(); + + for (const auto & e : exprs) + stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op()) + ";") << std::endl; + + stream << "if (r < M)" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + + stream << "for(unsigned int c = get_local_id(1); c < " << p_.num_groups_1 << "; c += get_local_size(1)){" << std::endl; + stream.inc_tab(); + + for (mapped_reduction* e: exprs) + compute_reduction(stream, e->process("#name_acc"), e->process("#name_temp[r + M*c]"), e->root_op()); + + stream.dec_tab(); + stream << "}" << std::endl; + + + stream.dec_tab(); + stream << "}" << std::endl; + + for (auto & expr : exprs) + stream << expr->process("#name_buf[lid0*" + local_size_1_ld_str + "+ lid1] = #name_acc;") << std::endl; + + stream << "#pragma unroll" << std::endl; + stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + + stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl; + stream << "if (lid1 < stride)" << std::endl; + stream << "{" << std::endl; + stream.inc_tab(); + + for (auto & e : exprs) + if (e->is_index_reduction()) + compute_index_reduction(stream, e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1 + stride]") + , e->process("#name_buf_value[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf_value[lid0*" + local_size_1_ld_str + " + lid1 + stride]") + , e->root_op()); + else + compute_reduction(stream,e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1]"), e->process("#name_buf[lid0*" + local_size_1_ld_str + " + lid1 + stride]"), e->root_op()); + + stream.dec_tab(); + stream << "}" << std::endl; + + stream.dec_tab(); + stream << "}" << std::endl; + + + stream << "if (lid1 == 0 && r < M)"; + stream << "{" << std::endl; + stream.inc_tab(); + std::map accessors; - accessors["mreduction"] = "#name_buf[lid0*" + lsize1str + "]"; + accessors["mreduction"] = "#name_buf[lid0*" + local_size_1_ld_str + "]"; accessors["array1"] = "#pointer[r*#stride]"; evaluate(stream, PARENT_NODE_TYPE, accessors, expressions, mappings); + stream.dec_tab(); stream << "}" << std::endl; @@ -171,12 +259,14 @@ std::string mreduction::generate_impl(unsigned int label, expressions_tuple cons stream.dec_tab(); stream << "}" << std::endl; + +// std::cout << stream.str() << std::endl; return stream.str(); } std::vector mreduction::generate_impl(unsigned int label, expressions_tuple const & expressions, std::vector const & mappings) const { - std::vector exprs; + std::vector reductions; expressions_tuple::data_type::const_iterator sit; std::vector::const_iterator mit; for (mit = mappings.begin(), sit = expressions.data().begin(); mit != mappings.end(); ++mit, ++sit) @@ -184,17 +274,17 @@ std::vector mreduction::generate_impl(unsigned int label, expressio array_expression const & first_expression = *expressions.data().front(); std::vector idx = filter_nodes(&is_reduction, first_expression, false); for (auto & elem : idx) - exprs.push_back((mapped_mreduction*)(mit->at(mapping_key(elem, PARENT_NODE_TYPE)).get())); + reductions.push_back((mapped_mreduction*)(mit->at(mapping_key(elem, PARENT_NODE_TYPE)).get())); } std::vector res; if (reduction_type_ && p_.simd_width>1) { - res.push_back(generate_impl(label, expressions, mappings, p_.simd_width, exprs)); - res.push_back(generate_impl(label, expressions, mappings, 1, exprs)); + res.push_back(generate_impl(label, expressions, mappings, p_.simd_width, reductions)); + res.push_back(generate_impl(label, expressions, mappings, 1, reductions)); } else - res.push_back(generate_impl(label, expressions, mappings, 1, exprs)); + res.push_back(generate_impl(label, expressions, mappings, 1, reductions)); return res; } @@ -217,28 +307,67 @@ std::vector mreduction::input_sizes(expressions_tuple const & expressions void mreduction::enqueue(cl::CommandQueue & queue, std::vector & programs, unsigned int label, controller const & controller) { expressions_tuple const & expressions = controller.x(); + cl::Context const & context = expressions.context(); + + char k0[10]; + char k1[10]; + fill_kernel_name(k0, label, "d0"); + fill_kernel_name(k1, label, "d1"); - char kname[10]; - fill_kernel_name(kname, label, "d"); std::vector MN = input_sizes(expressions); + std::vector reductions; + for (const auto & e : expressions.data()) + { + std::vector reductions_idx = filter_nodes(&is_reduction, *e, false); + for (auto & r : reductions_idx) + reductions.push_back(&(e)->tree()[r]); + } + //Kernel int idx = 0; if(reduction_type_==REDUCE_COLUMNS && p_.simd_width>1 && requires_fallback(expressions)) idx = 1; cl::Program & program = programs[idx].program(); - cl::Kernel kernel(program, kname); //NDRange - cl::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1); - cl::NDRange local(p_.local_size_0, p_.local_size_1); + cl::Kernel kernels[2] = { cl::Kernel(program, k0), cl::Kernel(program, k1)}; + cl::NDRange global[2] = { cl::NDRange(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1), cl::NDRange(p_.local_size_0*p_.num_groups_0, p_.local_size_1) }; + cl::NDRange local[2] = { cl::NDRange(p_.local_size_0, p_.local_size_1), cl::NDRange(p_.local_size_0, p_.local_size_1) }; - unsigned int current_arg = 0; - kernel.setArg(current_arg++, cl_uint(MN[0])); - kernel.setArg(current_arg++, cl_uint(MN[1])); - set_arguments(expressions, kernel, current_arg); + std::vector< cl::Buffer > tmp; + std::vector< cl::Buffer > tmpidx; + unsigned int dtype_size = size_of(lhs_most(expressions.data().front()->tree(), expressions.data().front()->root()).lhs.dtype); + for (auto & k : kernels) + { + unsigned int n_arg = 0; + int_t M = MN[0]; + int_t N = MN[1]; + k.setArg(n_arg++, cl_uint(M)); + k.setArg(n_arg++, cl_uint(N)); - controller.execution_options().enqueue_cache(queue, kernel, cl::NullRange, global, local); + //Temporary buffers + unsigned int i = 0; + unsigned int j = 0; + for (auto const & r : reductions) + { + if (is_index_reduction(r->op)) + { + if (tmpidx.size() <= j) + tmpidx.push_back(cl::Buffer(context, CL_MEM_READ_WRITE, p_.num_groups_1*M*4)); + k.setArg(n_arg++, tmpidx[j]); + j++; + } + if (tmp.size() <= i) + tmp.push_back(cl::Buffer(context, CL_MEM_READ_WRITE, p_.num_groups_1*M*dtype_size)); + k.setArg(n_arg++, tmp[i]); + i++; + } + set_arguments(expressions, k, n_arg); + } + + for(unsigned int i = 0 ; i < 2 ; ++i) + controller.execution_options().enqueue_cache(queue, kernels[i], cl::NullRange, global[i], local[i]); } mreduction_rows::mreduction_rows(mreduction_parameters const & parameters, diff --git a/lib/backend/templates/reduction.cpp b/lib/backend/templates/reduction.cpp index 7ecaa729a..e4d5835cf 100644 --- a/lib/backend/templates/reduction.cpp +++ b/lib/backend/templates/reduction.cpp @@ -94,8 +94,9 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr stream.inc_tab(); stream << "unsigned int lid = get_local_id(0);" << std::endl; - process(stream, PARENT_NODE_TYPE, tools::make_map >("array0", "#scalartype #namereg = #pointer[#start];") - ("array1", "#pointer += #start;"), expressions, mappings); + process(stream, PARENT_NODE_TYPE, {{"array0", "#scalartype #namereg = #pointer[#start];"}, + {"array1", "#pointer += #start;"}}, + expressions, mappings); for (unsigned int k = 0; k < N; ++k) { @@ -113,55 +114,44 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr } } - class loop_body : public loop_body_base + + element_wise_loop_1D(stream, p_.fetching_policy, simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)", [&](unsigned int simd_width) { - public: - loop_body(std::vector const & _exprs) : exprs(_exprs){ } + std::string i = (simd_width==1)?"i*#stride":"i"; + //Fetch vector entry + for (const auto & elem : exprs) + (elem)->process_recursive(stream, PARENT_NODE_TYPE, {{"array1", append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";"}, + {"matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride, i*#stride2}];"}, + {"matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride,#column*#stride2}];"}, + {"matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride, i*#stride2}:$OFFSET{i*#stride, (i + #diag_offset)*#stride2}];"}}); - void operator()(kernel_generation_stream & stream, unsigned int simd_width) const + //Update accumulators + std::vector str(simd_width); + if (simd_width==1) + str[0] = "#namereg"; + else + for (unsigned int a = 0; a < simd_width; ++a) + str[a] = append_simd_suffix("#namereg.s", a); + + for (auto & elem : exprs) { - std::string i = (simd_width==1)?"i*#stride":"i"; - //Fetch vector entry - for (const auto & elem : exprs) - (elem)->process_recursive(stream, PARENT_NODE_TYPE, tools::make_map >("array1", append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";") - ("matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride, i*#stride2}];") - ("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride,#column*#stride2}];") - ("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride, i*#stride2}:$OFFSET{i*#stride, (i + #diag_offset)*#stride2}];")); - - - //Update accumulators - std::vector str(simd_width); - if (simd_width==1) - str[0] = "#namereg"; - else - for (unsigned int a = 0; a < simd_width; ++a) - str[a] = append_simd_suffix("#namereg.s", a); - - for (auto & elem : exprs) + for (unsigned int a = 0; a < simd_width; ++a) { - for (unsigned int a = 0; a < simd_width; ++a) - { - std::map accessors; - accessors["array1"] = str[a]; - accessors["matrix_row"] = str[a]; - accessors["matrix_column"] = str[a]; - accessors["matrix_diag"] = str[a]; - accessors["array0"] = "#namereg"; - std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors); - if (elem->is_index_reduction()) - compute_index_reduction(stream, elem->process("#name_acc"), "i*" + tools::to_string(simd_width) + "+" - + tools::to_string(a), elem->process("#name_acc_value"), value,elem->root_op()); - else - compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op()); - } + std::map accessors; + accessors["array1"] = str[a]; + accessors["matrix_row"] = str[a]; + accessors["matrix_column"] = str[a]; + accessors["matrix_diag"] = str[a]; + accessors["array0"] = "#namereg"; + std::string value = elem->evaluate_recursive(LHS_NODE_TYPE, accessors); + if (elem->is_index_reduction()) + compute_index_reduction(stream, elem->process("#name_acc"), "i*" + tools::to_string(simd_width) + "+" + + tools::to_string(a), elem->process("#name_acc_value"), value,elem->root_op()); + else + compute_reduction(stream, elem->process("#name_acc"), value,elem->root_op()); } } - - private: - std::vector exprs; - }; - - element_wise_loop_1D(stream, loop_body(exprs), p_.fetching_policy, simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)"); + }); //Fills local memory for (unsigned int k = 0; k < N; ++k) @@ -200,31 +190,30 @@ std::string reduction::generate_impl(unsigned int label, const char * type, expr stream << "unsigned int lid = get_local_id(0);" << std::endl; - for (unsigned int k = 0; k < N; ++k) + for (mapped_scalar_reduction* e: exprs) { - if (exprs[k]->is_index_reduction()) + if (e->is_index_reduction()) { - stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];"); - stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl; - stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl; - stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";"); + stream << e->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];"); + stream << e->process("unsigned int #name_acc = 0;") << std::endl; + stream << e->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl; + stream << e->process("#scalartype #name_acc_value = " + neutral_element(e->root_op()) + ";"); } else { - stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl; - stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";"); + stream << e->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl; + stream << e->process("#scalartype #name_acc = " + neutral_element(e->root_op()) + ";"); } } stream << "for(unsigned int i = lid; i < " << p_.num_groups << "; i += get_local_size(0))" << std::endl; stream << "{" << std::endl; stream.inc_tab(); - for (unsigned int k = 0; k < N; ++k) - if (exprs[k]->is_index_reduction()) - compute_index_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"), - exprs[k]->process("#name_acc_value"),exprs[k]->process("#name_temp_value[i]"),exprs[k]->root_op()); + for (mapped_scalar_reduction* e: exprs) + if (e->is_index_reduction()) + compute_index_reduction(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->process("#name_acc_value"),e->process("#name_temp_value[i]"),e->root_op()); else - compute_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"), exprs[k]->root_op()); + compute_reduction(stream, e->process("#name_acc"), e->process("#name_temp[i]"), e->root_op()); stream.dec_tab(); stream << "}" << std::endl; diff --git a/lib/model/model.cpp b/lib/model/model.cpp index 4dbe4410f..b70604453 100644 --- a/lib/model/model.cpp +++ b/lib/model/model.cpp @@ -254,8 +254,8 @@ model_map_t init_models(cl::CommandQueue & queue) res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new model(mproduct_nt(1, 8, 8, 8, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8), queue)); res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new model(mproduct_tt(1, 8, 8, 8, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8), queue)); } - if(const char * homepath = std::getenv("HOME")) - import(std::string(homepath) + "/.atidlas/devices/device0.json", queue, res); +// if(const char * homepath = std::getenv("HOME")) +// import(std::string(homepath) + "/.atidlas/devices/device0.json", queue, res); return res; }