CUDA: Many fixes in the backend
This commit is contained in:
@@ -240,21 +240,24 @@ void base::base::process_all_at(std::string const & type_key, std::string const
|
||||
}
|
||||
}
|
||||
|
||||
std::string base::neutral_element(op_element const & op)
|
||||
std::string base::neutral_element(op_element const & op, driver::backend_type backend, std::string const & dtype)
|
||||
{
|
||||
std::string INF = Infinity(backend, dtype).get();
|
||||
std::string N_INF = "-" + INF;
|
||||
|
||||
switch (op.type)
|
||||
{
|
||||
case OPERATOR_ADD_TYPE : return "0";
|
||||
case OPERATOR_MULT_TYPE : return "1";
|
||||
case OPERATOR_DIV_TYPE : return "1";
|
||||
case OPERATOR_ELEMENT_FMAX_TYPE : return "-INFINITY";
|
||||
case OPERATOR_ELEMENT_ARGFMAX_TYPE : return "-INFINITY";
|
||||
case OPERATOR_ELEMENT_MAX_TYPE : return "-INFINITY";
|
||||
case OPERATOR_ELEMENT_ARGMAX_TYPE : return "-INFINITY";
|
||||
case OPERATOR_ELEMENT_FMIN_TYPE : return "INFINITY";
|
||||
case OPERATOR_ELEMENT_ARGFMIN_TYPE : return "INFINITY";
|
||||
case OPERATOR_ELEMENT_MIN_TYPE : return "INFINITY";
|
||||
case OPERATOR_ELEMENT_ARGMIN_TYPE : return "INFINITY";
|
||||
case OPERATOR_ELEMENT_FMAX_TYPE : return N_INF;
|
||||
case OPERATOR_ELEMENT_ARGFMAX_TYPE : return N_INF;
|
||||
case OPERATOR_ELEMENT_MAX_TYPE : return N_INF;
|
||||
case OPERATOR_ELEMENT_ARGMAX_TYPE : return N_INF;
|
||||
case OPERATOR_ELEMENT_FMIN_TYPE : return INF;
|
||||
case OPERATOR_ELEMENT_ARGFMIN_TYPE : return INF;
|
||||
case OPERATOR_ELEMENT_MIN_TYPE : return INF;
|
||||
case OPERATOR_ELEMENT_ARGMIN_TYPE : return INF;
|
||||
|
||||
default: throw operation_not_supported_exception("Unsupported reduction operator : no neutral element known");
|
||||
}
|
||||
@@ -292,7 +295,7 @@ void base::set_arguments(expressions_tuple const & expressions, driver::Kernel &
|
||||
base::invalid_exception::invalid_exception() : message_() {}
|
||||
|
||||
base::invalid_exception::invalid_exception(std::string message) :
|
||||
message_("ViennaCL: Internal error: The generator cannot apply the given template to the given array_expression: " + message + "\n"
|
||||
message_("ISAAC: Internal error: The generator cannot apply the given template to the given array_expression: " + message + "\n"
|
||||
"If you are using a builtin template, please report on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible\n"
|
||||
"If you are using your own template, please try using other parameters") {}
|
||||
|
||||
|
@@ -30,9 +30,17 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
std::string _size_t = size_type(device);
|
||||
std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;
|
||||
std::string data_type = append_width("#scalartype",p_.simd_width);
|
||||
driver::backend_type backend = device.backend();
|
||||
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
|
||||
stream << "__kernel void axpy" << suffix << "(" << _size_t << " M, " << _size_t << " N, " << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
|
||||
switch(backend)
|
||||
{
|
||||
#ifdef ISAAC_WITH_CUDA
|
||||
case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
|
||||
#endif
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
|
||||
}
|
||||
|
||||
stream << KernelPrefix(backend) << " void axpy" << suffix << "(" << _size_t << " M, " << _size_t << " N, " << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
@@ -40,11 +48,11 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
("array1", "#pointer += #start;")
|
||||
("array2", "#pointer = &$VALUE{#start1, #start2};"), expressions, mappings);
|
||||
|
||||
fetching_loop_info(p_.fetching_policy, "M", stream, init0, upper_bound0, inc0, "get_global_id(0)", "get_global_size(0)", device);
|
||||
fetching_loop_info(p_.fetching_policy, "M", stream, init0, upper_bound0, inc0, GlobalIdx0(backend).get(), GlobalSize0(backend).get(), device);
|
||||
stream << "for(" << _size_t << " i = " << init0 << "; i < " << upper_bound0 << "; i += " << inc0 << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
fetching_loop_info(p_.fetching_policy, "N", stream, init1, upper_bound1, inc1, "get_global_id(1)", "get_global_size(1)", device);
|
||||
fetching_loop_info(p_.fetching_policy, "N", stream, init1, upper_bound1, inc1, GlobalIdx1(backend).get(), GlobalSize1(backend).get(), device);
|
||||
stream << "for(" << _size_t << " j = " << init1 << "; j < " << upper_bound1 << "; j += " << inc1 << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
@@ -62,7 +70,8 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
("repeat", "#namereg")
|
||||
("array0", "#namereg")
|
||||
("outer", "#namereg")
|
||||
("cast", "convert_"+data_type)
|
||||
("cast", CastPrefix(backend, data_type).get())
|
||||
("host_scalar", p_.simd_width==1?"#name": InitPrefix(backend, data_type).get() + "(#name)")
|
||||
, expressions, mappings);
|
||||
|
||||
process(stream, LHS_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array2", "$VALUE{i*#stride1,j*#stride2} = #namereg;")
|
||||
@@ -76,7 +85,6 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
// std::cout << stream.str() << std::endl;
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
|
@@ -129,8 +129,14 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
strcat(gemm_name, suffix);
|
||||
strcat(reduce_name, suffix);
|
||||
|
||||
if(backend==driver::OPENCL)
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
|
||||
switch(backend)
|
||||
{
|
||||
#ifdef ISAAC_WITH_CUDA
|
||||
case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
|
||||
#endif
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
|
||||
}
|
||||
|
||||
stream << KernelPrefix(backend) << " void " << gemm_name << "(" << _size_t << " M, " << _size_t << " N, " << _size_t << " K, "
|
||||
<< Global(backend) << " " << sdtype << "* C, " << _size_t << " Cld," << _size_t << " Coff," << _size_t << " Cstride1, "
|
||||
<< sdtype << " alpha,"
|
||||
|
@@ -65,8 +65,13 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
arguments += e->process(Global(backend).get() + " " + to_string(numeric_type) + "* #name_temp, ");
|
||||
}
|
||||
|
||||
if(backend==driver::OPENCL)
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
|
||||
switch(backend)
|
||||
{
|
||||
#ifdef ISAAC_WITH_CUDA
|
||||
case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
|
||||
#endif
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
|
||||
}
|
||||
|
||||
stream << KernelPrefix(backend) << " void " << name[0] << "(" << arguments << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
@@ -99,7 +104,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
stream.inc_tab();
|
||||
|
||||
for (const auto & e : reductions)
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op()) + ";") << std::endl;
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op(), backend, "#scalartype") + ";") << std::endl;
|
||||
|
||||
stream << "if (r < M)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
@@ -240,7 +245,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
stream.inc_tab();
|
||||
|
||||
for (const auto & e : reductions)
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op()) + ";") << std::endl;
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element((e)->root_op(), backend, "#scalartype") + ";") << std::endl;
|
||||
|
||||
stream << "if (r < M)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
|
@@ -72,8 +72,7 @@ std::string reduction::generate_impl(const char * suffix, expressions_tuple cons
|
||||
std::string arguments = _size_t + " N, ";
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
std::string numeric_type = numeric_type_to_string(lhs_most(exprs[k]->array_expression().tree(),
|
||||
exprs[k]->array_expression().root()).lhs.dtype);
|
||||
std::string numeric_type = numeric_type_to_string(lhs_most(exprs[k]->array_expression().tree(), exprs[k]->array_expression().root()).lhs.dtype);
|
||||
if (exprs[k]->is_index_reduction())
|
||||
{
|
||||
arguments += exprs[k]->process(Global(backend).get() + " unsigned int* #name_temp, ");
|
||||
@@ -90,8 +89,14 @@ std::string reduction::generate_impl(const char * suffix, expressions_tuple cons
|
||||
/* ------------------------
|
||||
* First Kernel
|
||||
* -----------------------*/
|
||||
if(backend==driver::OPENCL)
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
|
||||
switch(backend)
|
||||
{
|
||||
#ifdef ISAAC_WITH_CUDA
|
||||
case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
|
||||
#endif
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl; break;
|
||||
}
|
||||
|
||||
stream << KernelPrefix(backend) << " void " << name[0] << "(" << arguments << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
@@ -110,19 +115,19 @@ std::string reduction::generate_impl(const char * suffix, expressions_tuple cons
|
||||
if (exprs[k]->is_index_reduction())
|
||||
{
|
||||
stream << exprs[k]->process(Local(backend).get() + " #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op(), backend, "#scalartype") + ";") << std::endl;
|
||||
stream << exprs[k]->process(Local(backend).get() + " unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
stream << exprs[k]->process(Local(backend).get() + " #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op(), backend, "#scalartype") + ";") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
element_wise_loop_1D(stream, p_.fetching_policy, p_.simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)", device, [&](unsigned int simd_width)
|
||||
element_wise_loop_1D(stream, p_.fetching_policy, p_.simd_width, "i", "N", GlobalIdx0(backend).get(), GlobalSize0(backend).get(), device, [&](unsigned int simd_width)
|
||||
{
|
||||
std::string i = (simd_width==1)?"i*#stride":"i";
|
||||
//Fetch vector entry
|
||||
@@ -190,8 +195,9 @@ std::string reduction::generate_impl(const char * suffix, expressions_tuple cons
|
||||
/* ------------------------
|
||||
* Second kernel
|
||||
* -----------------------*/
|
||||
if(backend==driver::OPENCL)
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
|
||||
|
||||
|
||||
|
||||
stream << KernelPrefix(backend) << " void " << name[1] << "(" << arguments << generate_arguments("#scalartype", device, mappings, expressions) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
@@ -206,12 +212,12 @@ std::string reduction::generate_impl(const char * suffix, expressions_tuple cons
|
||||
stream << e->process(Local(backend).get() + " unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];");
|
||||
stream << e->process("unsigned int #name_acc = 0;") << std::endl;
|
||||
stream << e->process(Local(backend).get() + " #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << e->process("#scalartype #name_acc_value = " + neutral_element(e->root_op()) + ";");
|
||||
stream << e->process("#scalartype #name_acc_value = " + neutral_element(e->root_op(), backend, "#scalartype") + ";");
|
||||
}
|
||||
else
|
||||
{
|
||||
stream << e->process(Local(backend).get() + " #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element(e->root_op()) + ";");
|
||||
stream << e->process("#scalartype #name_acc = " + neutral_element(e->root_op(), backend, "#scalartype") + ";");
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -38,7 +38,7 @@ std::string vaxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
#ifdef ISAAC_WITH_CUDA
|
||||
case driver::CUDA: stream << "#include \"helper_math.h\"" << std::endl; break;
|
||||
#endif
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl; break;
|
||||
case driver::OPENCL: stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break;
|
||||
}
|
||||
|
||||
stream << KernelPrefix(backend) << " void " << "axpy" << suffix << "(" << _size_t << " N," << generate_arguments(dtype, device, mappings, expressions) << ")" << std::endl;
|
||||
@@ -87,7 +87,6 @@ std::string vaxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user