Code quality: shortened parameter names in JIT code generator

This commit is contained in:
Philippe Tillet
2016-07-02 12:06:05 -07:00
parent 35260a49d4
commit 1e178dab22
15 changed files with 222 additions and 225 deletions

View File

@@ -39,7 +39,7 @@ namespace isaac
namespace templates namespace templates
{ {
enum fetching_policy_type enum fetch_type
{ {
FETCH_FROM_LOCAL, FETCH_FROM_LOCAL,
FETCH_FROM_GLOBAL_STRIDED, FETCH_FROM_GLOBAL_STRIDED,
@@ -75,10 +75,10 @@ class base
public: public:
struct parameters_type struct parameters_type
{ {
parameters_type(unsigned int _simd_width, int_t _local_size_1, int_t _local_size_2, int_t _num_kernels); parameters_type(unsigned int _vwidth, int_t _ls0, int_t _ls1, int_t _num_kernels);
unsigned int simd_width; unsigned int vwidth;
unsigned int local_size_0; unsigned int ls0;
unsigned int local_size_1; unsigned int ls1;
unsigned int num_kernels; unsigned int num_kernels;
}; };
private: private:
@@ -107,8 +107,8 @@ private:
public: public:
typedef ParametersType parameters_type; typedef ParametersType parameters_type;
base_impl(parameters_type const & parameters, fusion_policy_t fusion_policy); base_impl(parameters_type const & parameters, fusion_policy_t fusion_policy);
unsigned int local_size_0() const; unsigned int ls0() const;
unsigned int local_size_1() const; unsigned int ls1() const;
std::shared_ptr<base> clone() const; std::shared_ptr<base> clone() const;
/** @brief returns whether or not the profile has undefined behavior on particular device */ /** @brief returns whether or not the profile has undefined behavior on particular device */
int is_invalid(expression_tree const & expressions, driver::Device const & device) const; int is_invalid(expression_tree const & expressions, driver::Device const & device) const;

View File

@@ -32,9 +32,9 @@ namespace templates
class elementwise_1d_parameters : public base::parameters_type class elementwise_1d_parameters : public base::parameters_type
{ {
public: public:
elementwise_1d_parameters(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy); elementwise_1d_parameters(unsigned int _vwidth, unsigned int _group_size, unsigned int _num_groups, fetch_type _fetch);
unsigned int num_groups; unsigned int num_groups;
fetching_policy_type fetching_policy; fetch_type fetch;
}; };
class elementwise_1d : public base_impl<elementwise_1d, elementwise_1d_parameters> class elementwise_1d : public base_impl<elementwise_1d, elementwise_1d_parameters>
@@ -44,7 +44,7 @@ private:
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & symbols) const; std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & symbols) const;
public: public:
elementwise_1d(elementwise_1d::parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); elementwise_1d(elementwise_1d::parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
elementwise_1d(unsigned int _simd_width, unsigned int _group_size, unsigned int _num_groups, fetching_policy_type _fetching_policy, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); elementwise_1d(unsigned int _vwidth, unsigned int _group_size, unsigned int _num_groups, fetch_type _fetch, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
std::vector<int_t> input_sizes(expression_tree const & expressions) const; std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &); void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
}; };

View File

@@ -33,11 +33,11 @@ namespace templates
class elementwise_2d_parameters : public base::parameters_type class elementwise_2d_parameters : public base::parameters_type
{ {
public: public:
elementwise_2d_parameters(unsigned int _simd_width, unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetching_policy); elementwise_2d_parameters(unsigned int _vwidth, unsigned int _ls0, unsigned int _ls1, unsigned int _num_groups_0, unsigned int _num_groups_1, fetch_type _fetch);
unsigned int num_groups_0; unsigned int num_groups_0;
unsigned int num_groups_1; unsigned int num_groups_1;
fetching_policy_type fetching_policy; fetch_type fetch;
}; };
class elementwise_2d : public base_impl<elementwise_2d, elementwise_2d_parameters> class elementwise_2d : public base_impl<elementwise_2d, elementwise_2d_parameters>
@@ -47,7 +47,7 @@ private:
std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const; std::string generate_impl(std::string const & suffix, expression_tree const & expressions, driver::Device const & device, symbolic::symbols_table const & mapping) const;
public: public:
elementwise_2d(parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); elementwise_2d(parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT); elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetch_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT);
std::vector<int_t> input_sizes(expression_tree const & expressions) const; std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &); void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
}; };

View File

@@ -33,11 +33,11 @@ namespace templates
struct matrix_product_parameters : public base::parameters_type struct matrix_product_parameters : public base::parameters_type
{ {
matrix_product_parameters(unsigned int simd_width matrix_product_parameters(unsigned int vwidth
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1, unsigned int D , unsigned int ls0, unsigned int KL, unsigned int ls1, unsigned int D
, unsigned int ms, unsigned int ks, unsigned int ns , unsigned int ms, unsigned int ks, unsigned int ns
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy , fetch_type Afetch, fetch_type Bfetch
, unsigned int local_fetch_0, unsigned int local_fetch_1); , unsigned int lf0, unsigned int lf1);
unsigned int kL; unsigned int kL;
unsigned int depth; unsigned int depth;
@@ -46,11 +46,11 @@ struct matrix_product_parameters : public base::parameters_type
unsigned int kS; unsigned int kS;
unsigned int nS; unsigned int nS;
fetching_policy_type A_fetching_policy; fetch_type Afetch;
fetching_policy_type B_fetching_policy; fetch_type Bfetch;
unsigned int local_fetch_0; unsigned int lf0;
unsigned int local_fetch_1; unsigned int lf1;
unsigned int mL; unsigned int mL;
unsigned int nL; unsigned int nL;
@@ -84,7 +84,7 @@ class matrix_product_nn : public matrix_product
{ {
public: public:
matrix_product_nn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D matrix_product_nn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1); , int_t lfetch0, int_t lfetch1);
}; };
@@ -92,7 +92,7 @@ class matrix_product_tn : public matrix_product
{ {
public: public:
matrix_product_tn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D matrix_product_tn(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1); , int_t lfetch0, int_t lfetch1);
}; };
@@ -101,7 +101,7 @@ class matrix_product_nt : public matrix_product
{ {
public: public:
matrix_product_nt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D matrix_product_nt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1); , int_t lfetch0, int_t lfetch1);
}; };
@@ -110,7 +110,7 @@ class matrix_product_tt : public matrix_product
{ {
public: public:
matrix_product_tt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D matrix_product_tt(unsigned int simd, int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns, fetching_policy_type Afetch , fetching_policy_type Bfetch , int_t ms, int_t ks, int_t ns, fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1); , int_t lfetch0, int_t lfetch1);
}; };

View File

@@ -31,11 +31,11 @@ namespace templates
struct reduce_1d_parameters : public base::parameters_type struct reduce_1d_parameters : public base::parameters_type
{ {
reduce_1d_parameters(unsigned int _simd_width, reduce_1d_parameters(unsigned int _vwidth,
unsigned int _group_size, unsigned int _num_groups, unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy); fetch_type _fetch);
unsigned int num_groups; unsigned int num_groups;
fetching_policy_type fetching_policy; fetch_type fetch;
}; };
class reduce_1d : public base_impl<reduce_1d, reduce_1d_parameters> class reduce_1d : public base_impl<reduce_1d, reduce_1d_parameters>
@@ -50,7 +50,7 @@ private:
public: public:
reduce_1d(reduce_1d::parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); reduce_1d(reduce_1d::parameters_type const & parameters, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng, fetching_policy_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT); reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng, fetch_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT);
std::vector<int_t> input_sizes(expression_tree const & expressions) const; std::vector<int_t> input_sizes(expression_tree const & expressions) const;
void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &); void enqueue(driver::CommandQueue & queue, driver::Program const & program, std::string const & suffix, runtime::execution_handler const &);
private: private:

View File

@@ -33,12 +33,12 @@ namespace templates
{ {
struct reduce_2d_parameters : public base::parameters_type struct reduce_2d_parameters : public base::parameters_type
{ {
reduce_2d_parameters(unsigned int _simd_width, reduce_2d_parameters(unsigned int _vwidth,
unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _ls0, unsigned int _ls1,
unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetch_policy); unsigned int _num_groups_0, unsigned int _num_groups_1, fetch_type _fetch_policy);
unsigned int num_groups_0; unsigned int num_groups_0;
unsigned int num_groups_1; unsigned int num_groups_1;
fetching_policy_type fetch_policy; fetch_type fetch_policy;
}; };
@@ -62,14 +62,14 @@ class reduce_2d_rows : public reduce_2d
{ {
public: public:
reduce_2d_rows(reduce_2d::parameters_type const &, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); reduce_2d_rows(reduce_2d::parameters_type const &, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
reduce_2d_rows(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT); reduce_2d_rows(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetch_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT);
}; };
class reduce_2d_cols : public reduce_2d class reduce_2d_cols : public reduce_2d
{ {
public: public:
reduce_2d_cols(reduce_2d::parameters_type const &, fusion_policy_t fusion_policy = FUSE_INDEPENDENT); reduce_2d_cols(reduce_2d::parameters_type const &, fusion_policy_t fusion_policy = FUSE_INDEPENDENT);
reduce_2d_cols(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT); reduce_2d_cols(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, fetch_type fetch, fusion_policy_t bind = FUSE_INDEPENDENT);
}; };
} }

View File

@@ -40,7 +40,7 @@ namespace isaac
namespace templates namespace templates
{ {
base::parameters_type::parameters_type(unsigned int _simd_width, int_t _local_size_1, int_t _local_size_2, int_t _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels) base::parameters_type::parameters_type(unsigned int _vwidth, int_t _ls0, int_t _ls1, int_t _num_kernels) : vwidth(_vwidth), ls0(_ls0), ls1(_ls1), num_kernels(_num_kernels)
{ } { }
base::base(fusion_policy_t fusion_policy) : fusion_policy_(fusion_policy) base::base(fusion_policy_t fusion_policy) : fusion_policy_(fusion_policy)
@@ -79,12 +79,12 @@ base_impl<TType, PType>::base_impl(parameters_type const & parameters, fusion_po
{ } { }
template<class TType, class PType> template<class TType, class PType>
unsigned int base_impl<TType, PType>::local_size_0() const unsigned int base_impl<TType, PType>::ls0() const
{ return p_.local_size_0; } { return p_.ls0; }
template<class TType, class PType> template<class TType, class PType>
unsigned int base_impl<TType, PType>::local_size_1() const unsigned int base_impl<TType, PType>::ls1() const
{ return p_.local_size_1; } { return p_.ls1; }
template<class TType, class PType> template<class TType, class PType>
std::shared_ptr<base> base_impl<TType, PType>::clone() const std::shared_ptr<base> base_impl<TType, PType>::clone() const
@@ -102,16 +102,16 @@ int base_impl<TType, PType>::is_invalid(expression_tree const & expressions, dr
//Invalid work group size //Invalid work group size
size_t max_workgroup_size = device.max_work_group_size(); size_t max_workgroup_size = device.max_work_group_size();
std::vector<size_t> max_work_item_sizes = device.max_work_item_sizes(); std::vector<size_t> max_work_item_sizes = device.max_work_item_sizes();
if (p_.local_size_0*p_.local_size_1 > max_workgroup_size) if (p_.ls0*p_.ls1 > max_workgroup_size)
return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW; return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
if (p_.local_size_0 > max_work_item_sizes[0]) if (p_.ls0 > max_work_item_sizes[0])
return TEMPLATE_LOCAL_SIZE_0_OVERFLOW; return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
if (p_.local_size_1 > max_work_item_sizes[1]) if (p_.ls1 > max_work_item_sizes[1])
return TEMPLATE_LOCAL_SIZE_1_OVERFLOW; return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
//Invalid SIMD Width //Invalid SIMD Width
if (p_.simd_width!=1 && p_.simd_width!=2 && p_.simd_width!=3 && p_.simd_width!=4) if (p_.vwidth!=1 && p_.vwidth!=2 && p_.vwidth!=3 && p_.vwidth!=4)
return TEMPLATE_INVALID_SIMD_WIDTH; return TEMPLATE_INVALID_SIMD_WIDTH;
return is_invalid_impl(device, expressions); return is_invalid_impl(device, expressions);

View File

@@ -36,17 +36,17 @@ namespace isaac
namespace templates namespace templates
{ {
elementwise_1d_parameters::elementwise_1d_parameters(unsigned int _simd_width, elementwise_1d_parameters::elementwise_1d_parameters(unsigned int _vwidth,
unsigned int _group_size, unsigned int _num_groups, unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) : fetch_type _fetch) :
base::parameters_type(_simd_width, _group_size, 1, 1), num_groups(_num_groups), fetching_policy(_fetching_policy) base::parameters_type(_vwidth, _group_size, 1, 1), num_groups(_num_groups), fetch(_fetch)
{ {
} }
int elementwise_1d::is_invalid_impl(driver::Device const &, expression_tree const &) const int elementwise_1d::is_invalid_impl(driver::Device const &, expression_tree const &) const
{ {
if (p_.fetching_policy==FETCH_FROM_LOCAL) if (p_.fetch==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE; return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID; return TEMPLATE_VALID;
} }
@@ -65,7 +65,7 @@ std::string elementwise_1d::generate_impl(std::string const & suffix, expression
case driver::CUDA: case driver::CUDA:
stream << "#include \"vector.h\"" << std::endl; break; stream << "#include \"vector.h\"" << std::endl; break;
case driver::OPENCL: case driver::OPENCL:
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << "," << p_.ls1 << ",1)))" << std::endl; break;
} }
stream << "$KERNEL void elementwise_1d" << suffix << "($SIZE_T N, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")"; stream << "$KERNEL void elementwise_1d" << suffix << "($SIZE_T N, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")";
@@ -83,9 +83,9 @@ std::string elementwise_1d::generate_impl(std::string const & suffix, expression
stream.inc_tab(); stream.inc_tab();
} }
element_wise_loop_1D(stream, p_.fetching_policy, p_.simd_width, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int simd_width) element_wise_loop_1D(stream, p_.fetch, p_.vwidth, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int vwidth)
{ {
std::string dtype = append_width("#scalartype",simd_width); std::string dtype = append_width("#scalartype",vwidth);
//Declares register to store results //Declares register to store results
for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_lhs, false)) for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_lhs, false))
@@ -93,17 +93,17 @@ std::string elementwise_1d::generate_impl(std::string const & suffix, expression
//Load to registers //Load to registers
for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_rhs, false)) for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_rhs, false))
stream << sym->process(dtype + " #name = " + append_width("loadv", simd_width) + "(i);") << std::endl; stream << sym->process(dtype + " #name = " + append_width("loadv", vwidth) + "(i);") << std::endl;
//Compute //Compute
for(size_t idx: assignments) for(size_t idx: assignments)
for(unsigned int s = 0 ; s < simd_width ; ++s) for(unsigned int s = 0 ; s < vwidth ; ++s)
stream << symbols.at(idx)->evaluate({{"leaf", access_vector_type("#name", s, simd_width)}}) << ";" << std::endl; stream << symbols.at(idx)->evaluate({{"leaf", access_vector_type("#name", s, vwidth)}}) << ";" << std::endl;
//Writes back //Writes back
for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_lhs, false)) for(symbolic::leaf* sym: symbolic::extract<symbolic::leaf>(tree, symbols, assignments_lhs, false))
for(unsigned int s = 0 ; s < simd_width ; ++s) for(unsigned int s = 0 ; s < vwidth ; ++s)
stream << sym->process("at(i+" + tools::to_string(s)+") = " + access_vector_type("#name", s, simd_width) + ";") << std::endl; stream << sym->process("at(i+" + tools::to_string(s)+") = " + access_vector_type("#name", s, vwidth) + ";") << std::endl;
}); });
//Close user-provided for-loops //Close user-provided for-loops
if(sfors.size()){ if(sfors.size()){
@@ -124,7 +124,7 @@ elementwise_1d::elementwise_1d(elementwise_1d_parameters const & parameters,
{} {}
elementwise_1d::elementwise_1d(unsigned int simd, unsigned int ls, unsigned int ng, elementwise_1d::elementwise_1d(unsigned int simd, unsigned int ls, unsigned int ng,
fetching_policy_type fetch, fusion_policy_t bind): fetch_type fetch, fusion_policy_t bind):
base_impl<elementwise_1d, elementwise_1d_parameters>(elementwise_1d_parameters(simd,ls,ng,fetch), bind) base_impl<elementwise_1d, elementwise_1d_parameters>(elementwise_1d_parameters(simd,ls,ng,fetch), bind)
{} {}
@@ -144,8 +144,8 @@ void elementwise_1d::enqueue(driver::CommandQueue &, driver::Program const & pro
name += suffix; name += suffix;
driver::Kernel kernel(program, name.c_str()); driver::Kernel kernel(program, name.c_str());
//NDRange //NDRange
driver::NDRange global(p_.local_size_0*p_.num_groups); driver::NDRange global(p_.ls0*p_.num_groups);
driver::NDRange local(p_.local_size_0); driver::NDRange local(p_.ls0);
//Arguments //Arguments
unsigned int current_arg = 0; unsigned int current_arg = 0;
kernel.setSizeArg(current_arg++, size); kernel.setSizeArg(current_arg++, size);

View File

@@ -33,18 +33,18 @@ namespace isaac
namespace templates namespace templates
{ {
elementwise_2d_parameters::elementwise_2d_parameters(unsigned int _simd_width, elementwise_2d_parameters::elementwise_2d_parameters(unsigned int _vwidth,
unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _ls0, unsigned int _ls1,
unsigned int _num_groups_0, unsigned int _num_groups_1, unsigned int _num_groups_0, unsigned int _num_groups_1,
fetching_policy_type _fetching_policy) : base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetching_policy(_fetching_policy){ } fetch_type _fetch) : base::parameters_type(_vwidth, _ls0, _ls1, 1), num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetch(_fetch){ }
int elementwise_2d::is_invalid_impl(driver::Device const &, expression_tree const &) const int elementwise_2d::is_invalid_impl(driver::Device const &, expression_tree const &) const
{ {
if (p_.simd_width>1) if (p_.vwidth>1)
return TEMPLATE_INVALID_SIMD_WIDTH; return TEMPLATE_INVALID_SIMD_WIDTH;
if(p_.fetching_policy==FETCH_FROM_LOCAL) if(p_.fetch==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE; return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID; return TEMPLATE_VALID;
} }
@@ -67,7 +67,7 @@ std::string elementwise_2d::generate_impl(std::string const & suffix, expression
case driver::CUDA: case driver::CUDA:
stream << "#include \"vector.h\"" << std::endl; break; stream << "#include \"vector.h\"" << std::endl; break;
case driver::OPENCL: case driver::OPENCL:
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; break; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << "," << p_.ls1 << ",1)))" << std::endl; break;
} }
stream << "$KERNEL void elementwise_2d" << suffix << "($SIZE_T M, $SIZE_T N, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl; stream << "$KERNEL void elementwise_2d" << suffix << "($SIZE_T M, $SIZE_T N, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl;
@@ -75,11 +75,11 @@ std::string elementwise_2d::generate_impl(std::string const & suffix, expression
stream.inc_tab(); stream.inc_tab();
fetching_loop_info(p_.fetching_policy, "M", stream, init0, upper_bound0, inc0, "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device); fetching_loop_info(p_.fetch, "M", stream, init0, upper_bound0, inc0, "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device);
stream << "for($SIZE_T i = " << init0 << "; i < " << upper_bound0 << "; i += " << inc0 << ")" << std::endl; stream << "for($SIZE_T i = " << init0 << "; i < " << upper_bound0 << "; i += " << inc0 << ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
fetching_loop_info(p_.fetching_policy, "N", stream, init1, upper_bound1, inc1, "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", device); fetching_loop_info(p_.fetch, "N", stream, init1, upper_bound1, inc1, "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", device);
stream << "for($SIZE_T j = " << init1 << "; j < " << upper_bound1 << "; j += " << inc1 << ")" << std::endl; stream << "for($SIZE_T j = " << init1 << "; j < " << upper_bound1 << "; j += " << inc1 << ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
@@ -115,7 +115,7 @@ elementwise_2d::elementwise_2d(parameters_type const & parameters, fusion_policy
base_impl<elementwise_2d, elementwise_2d_parameters>(parameters, fusion_policy){ } base_impl<elementwise_2d, elementwise_2d_parameters>(parameters, fusion_policy){ }
elementwise_2d::elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2, elementwise_2d::elementwise_2d(unsigned int simd, unsigned int ls1, unsigned int ls2,
unsigned int ng1, unsigned int ng2, fetching_policy_type fetch, unsigned int ng1, unsigned int ng2, fetch_type fetch,
fusion_policy_t bind): fusion_policy_t bind):
base_impl<elementwise_2d, elementwise_2d_parameters>(elementwise_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind) base_impl<elementwise_2d, elementwise_2d_parameters>(elementwise_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), bind)
{} {}
@@ -130,8 +130,8 @@ void elementwise_2d::enqueue(driver::CommandQueue & /*queue*/, driver::Program c
std::string name = "elementwise_2d"; std::string name = "elementwise_2d";
name +=suffix; name +=suffix;
driver::Kernel kernel(program, name.c_str()); driver::Kernel kernel(program, name.c_str());
driver::NDRange global(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1); driver::NDRange global(p_.ls0*p_.num_groups_0, p_.ls1*p_.num_groups_1);
driver::NDRange local(p_.local_size_0, p_.local_size_1); driver::NDRange local(p_.ls0, p_.ls1);
unsigned int current_arg = 0; unsigned int current_arg = 0;
std::vector<int_t> MN = input_sizes(expressions); std::vector<int_t> MN = input_sizes(expressions);
kernel.setSizeArg(current_arg++, MN[0]); kernel.setSizeArg(current_arg++, MN[0]);

View File

@@ -37,14 +37,14 @@ namespace isaac
namespace templates namespace templates
{ {
matrix_product_parameters::matrix_product_parameters(unsigned int simd_width matrix_product_parameters::matrix_product_parameters(unsigned int vwidth
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1, unsigned int D , unsigned int ls0, unsigned int KL, unsigned int ls1, unsigned int D
, unsigned int ms, unsigned int ks, unsigned int ns , unsigned int ms, unsigned int ks, unsigned int ns
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy , fetch_type Afetch, fetch_type Bfetch
, unsigned int local_fetch_0, unsigned int local_fetch_1): base::parameters_type(simd_width, local_size_0, local_size_1, 1), , unsigned int lf0, unsigned int lf1): base::parameters_type(vwidth, ls0, ls1, 1),
kL(KL), depth(D), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy), B_fetching_policy(B_fetching_policy), kL(KL), depth(D), mS(ms), kS(ks), nS(ns), Afetch(Afetch), Bfetch(Bfetch),
local_fetch_0(local_fetch_0), local_fetch_1(local_fetch_1), lf0(lf0), lf1(lf1),
mL(ms*local_size_0), nL(ns*local_size_1) mL(ms*ls0), nL(ns*ls1)
{ {
} }
@@ -74,13 +74,10 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
int matrix_product::is_invalid_impl(driver::Device const &, expression_tree const &) const int matrix_product::is_invalid_impl(driver::Device const &, expression_tree const &) const
{ {
// if(device.vendor()==driver::Device::Vendor::NVIDIA && p_.simd_width > 1) if(p_.Afetch!=FETCH_FROM_LOCAL || p_.Bfetch!=FETCH_FROM_LOCAL)
// return TEMPLATE_INVALID_SIMD_WIDTH;
if(p_.A_fetching_policy!=FETCH_FROM_LOCAL || p_.B_fetching_policy!=FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE; return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0) if ((p_.mS % p_.vwidth) > 0 || (p_.nS % p_.vwidth) > 0)
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE; return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
if(p_.mL > 256 || p_.nL > 256) if(p_.mL > 256 || p_.nL > 256)
@@ -89,32 +86,32 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
if ( p_.kS % p_.kL == 0) if ( p_.kS % p_.kL == 0)
return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL; return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL){ if (p_.Afetch==FETCH_FROM_LOCAL || p_.Bfetch==FETCH_FROM_LOCAL){
if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1)) if ((p_.lf0*p_.lf1) !=(p_.ls0*p_.ls1))
return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT; return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
} }
if (p_.A_fetching_policy==FETCH_FROM_LOCAL) if (p_.Afetch==FETCH_FROM_LOCAL)
{ {
unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL; unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL; unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0) if (p_.lf1>0 && (bound1 % p_.lf1)> 0)
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0) if (p_.lf0>0 && (bound0 % (p_.lf0*p_.vwidth)) > 0)
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE; return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
} }
if (p_.B_fetching_policy==FETCH_FROM_LOCAL) if (p_.Bfetch==FETCH_FROM_LOCAL)
{ {
unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL; unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL; unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0) if (p_.lf1>0 && (bound1 % p_.lf1)> 0)
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0) if (p_.lf0>0 && (bound0 % (p_.lf0*p_.vwidth)) > 0)
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE; return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
} }
@@ -129,9 +126,9 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
driver::backend_type backend = device.backend(); driver::backend_type backend = device.backend();
bool has_depth = p_.depth > 1; bool has_depth = p_.depth > 1;
#define VLOAD(offset, ptr) vload(p_.simd_width, sdtype, offset, ptr, "1", backend, true) #define VLOAD(offset, ptr) vload(p_.vwidth, sdtype, offset, ptr, "1", backend, true)
#define VLOAD_MISALIGNED(offset, ptr) vload(p_.simd_width, sdtype, offset, ptr, "1", backend, false) #define VLOAD_MISALIGNED(offset, ptr) vload(p_.vwidth, sdtype, offset, ptr, "1", backend, false)
#define VSTORE(value, offset, ptr) vstore(p_.simd_width, sdtype, value, offset, ptr, "1", backend) #define VSTORE(value, offset, ptr) vstore(p_.vwidth, sdtype, value, offset, ptr, "1", backend)
symbolic::preset::matrix_product::args args; symbolic::preset::matrix_product::args args;
infos(tree, args); infos(tree, args);
@@ -145,7 +142,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
kernel_generation_stream stream(backend); kernel_generation_stream stream(backend);
numeric_type dtype = tree.dtype(); numeric_type dtype = tree.dtype();
std::string sdtype = to_string(dtype); std::string sdtype = to_string(dtype);
std::string vdtype = append_width(sdtype, p_.simd_width); std::string vdtype = append_width(sdtype, p_.vwidth);
////////////////// //////////////////
/// DECLARATIONS /// DECLARATIONS
@@ -159,7 +156,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
switch(backend) switch(backend)
{ {
case driver::OPENCL: case driver::OPENCL:
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << "," << p_.ls1 << ",1)))" << std::endl;
break; break;
default: default:
break; break;
@@ -178,8 +175,8 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
///Declare ///Declare
stream << "//blocks" << std::endl; stream << "//blocks" << std::endl;
stream << sdtype << " rC[" << p_.mS << "][" << p_.nS << "] = {{0}};" << std::endl; stream << sdtype << " rC[" << p_.mS << "][" << p_.nS << "] = {{0}};" << std::endl;
stream << vdtype << " rA[" << p_.kS << "][" << p_.mS/p_.simd_width << "];" << std::endl; stream << vdtype << " rA[" << p_.kS << "][" << p_.mS/p_.vwidth << "];" << std::endl;
stream << vdtype << " rB[" << p_.kS << "][" << p_.nS/p_.simd_width << "];" << std::endl; stream << vdtype << " rB[" << p_.kS << "][" << p_.nS/p_.vwidth << "];" << std::endl;
stream << std::endl; stream << std::endl;
stream << "//pointers" << std::endl; stream << "//pointers" << std::endl;
@@ -187,8 +184,8 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
size_t lldb = (B_trans_=='T')?p_.nL:p_.kL; size_t lldb = (B_trans_=='T')?p_.nL:p_.kL;
stream << "$LOCAL " << sdtype << " lA[" << p_.kL*p_.mL << "];" << std::endl; stream << "$LOCAL " << sdtype << " lA[" << p_.kL*p_.mL << "];" << std::endl;
stream << "$LOCAL " << sdtype << " lB[" << p_.kL*p_.nL << "];" << std::endl; stream << "$LOCAL " << sdtype << " lB[" << p_.kL*p_.nL << "];" << std::endl;
unsigned int npA = p_.mL/(A_trans_=='N'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1); unsigned int npA = p_.mL/(A_trans_=='N'?p_.lf0*p_.vwidth:p_.lf1);
unsigned int npB = p_.nL/(B_trans_=='T'?p_.local_fetch_0*p_.simd_width:p_.local_fetch_1); unsigned int npB = p_.nL/(B_trans_=='T'?p_.lf0*p_.vwidth:p_.lf1);
stream << "$GLOBAL " << sdtype << "* Ai[" << npA << "];" << std::endl; stream << "$GLOBAL " << sdtype << "* Ai[" << npA << "];" << std::endl;
stream << "$GLOBAL " << sdtype << "* Bi[" << npB << "];" << std::endl; stream << "$GLOBAL " << sdtype << "* Bi[" << npB << "];" << std::endl;
stream << std::endl; stream << std::endl;
@@ -218,15 +215,15 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "K = min(K - div*gidz, ($SIZE_T)div);" << std::endl; stream << "K = min(K - div*gidz, ($SIZE_T)div);" << std::endl;
} }
stream << "idt = " << p_.local_size_0 << "*ids.w + ids.z;" << std::endl; stream << "idt = " << p_.ls0 << "*ids.w + ids.z;" << std::endl;
stream << "idT.y = idt/" << p_.local_fetch_0 << ";" << std::endl; stream << "idT.y = idt/" << p_.lf0 << ";" << std::endl;
stream << "idT.x = idt - " << p_.local_fetch_0 << "*idT.y;" << std::endl; stream << "idT.x = idt - " << p_.lf0 << "*idT.y;" << std::endl;
stream << std::endl; stream << std::endl;
stream << "//Adjust pointers and bounds per work-item" << std::endl; stream << "//Adjust pointers and bounds per work-item" << std::endl;
stream << "ids.x *= " << p_.mL << ";" << std::endl; stream << "ids.x *= " << p_.mL << ";" << std::endl;
stream << "ids.y *= " << p_.nL << ";" << std::endl; stream << "ids.y *= " << p_.nL << ";" << std::endl;
stream << "idT.x *= " << p_.simd_width << ";" << std::endl; stream << "idT.x *= " << p_.vwidth << ";" << std::endl;
stream << "M -= ids.x;" << std::endl; stream << "M -= ids.x;" << std::endl;
if(A_trans_=='N') if(A_trans_=='N')
@@ -289,15 +286,15 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
for(unsigned int i = 0 ; i < npA ; i++ ) for(unsigned int i = 0 ; i < npA ; i++ )
if (A_trans_=='N') if (A_trans_=='N')
stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < M", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + ASTRIDE1 + ")", "0") << ";" << std::endl; stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.lf0*p_.vwidth) + " < M", "(int)((idT.x + " + to_string(i*p_.lf0*p_.vwidth) + ")" + ASTRIDE1 + ")", "0") << ";" << std::endl;
else else
stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < M", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*lda)", "0") << ";" << std::endl; stream << "Ai[" << i << "] += " << Select(backend, to_string(i*p_.lf1) + " < M", "(int)((idT.y + " + to_string(i*p_.lf1) + ")*lda)", "0") << ";" << std::endl;
for(unsigned int i = 0 ; i < npB ; i++ ) for(unsigned int i = 0 ; i < npB ; i++ )
if (B_trans_=='T') if (B_trans_=='T')
stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_0*p_.simd_width) + " < N", "(int)((idT.x + " + to_string(i*p_.local_fetch_0*p_.simd_width) + ")" + BSTRIDE1 + ")", "0") << ";" << std::endl; stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.lf0*p_.vwidth) + " < N", "(int)((idT.x + " + to_string(i*p_.lf0*p_.vwidth) + ")" + BSTRIDE1 + ")", "0") << ";" << std::endl;
else else
stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.local_fetch_1) + " < N", "(int)((idT.y + " + to_string(i*p_.local_fetch_1) + ")*ldb)", "0") << ";" << std::endl; stream << "Bi[" << i << "] += " << Select(backend, to_string(i*p_.lf1) + " < N", "(int)((idT.y + " + to_string(i*p_.lf1) + ")*ldb)", "0") << ";" << std::endl;
stream << std::endl; stream << std::endl;
stream << "//Outer loop" << std::endl; stream << "//Outer loop" << std::endl;
@@ -315,13 +312,13 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "//Fetch A to local memory" << std::endl; stream << "//Fetch A to local memory" << std::endl;
if (A_trans_=='N') if (A_trans_=='N')
{ {
for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1) for(unsigned int k = 0; k < p_.kL; k += p_.lf1)
for(unsigned int m = 0; m < p_.mL; m += p_.local_fetch_0*p_.simd_width) for(unsigned int m = 0; m < p_.mL; m += p_.lf0*p_.vwidth)
{ {
std::string mm = to_string(m/(p_.simd_width*p_.local_fetch_0)); std::string mm = to_string(m/(p_.vwidth*p_.lf0));
std::string kk = to_string(k); std::string kk = to_string(k);
if(last_iteration) if(last_iteration)
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << "ldsA[" << k*llda + m + s << "] = (condy" << k << " && " << s << "< M)? Ai[" << mm << "][" << k << "*lda + " << s << "] : 0;" << std::endl; stream << "ldsA[" << k*llda + m + s << "] = (condy" << k << " && " << s << "< M)? Ai[" << mm << "][" << k << "*lda + " << s << "] : 0;" << std::endl;
else else
stream << VSTORE(VLOAD_MISALIGNED("0" ,"&Ai[" + mm +"][" + kk + "*lda]"), "0", "ldsA + " + to_string(k*llda+m)) << ";" << std::endl; stream << VSTORE(VLOAD_MISALIGNED("0" ,"&Ai[" + mm +"][" + kk + "*lda]"), "0", "ldsA + " + to_string(k*llda+m)) << ";" << std::endl;
@@ -329,13 +326,13 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
} }
else else
{ {
for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_0*p_.simd_width) for(unsigned int k = 0; k < p_.kL; k += p_.lf0*p_.vwidth)
for(unsigned int m = 0; m < p_.mL; m += p_.local_fetch_1) for(unsigned int m = 0; m < p_.mL; m += p_.lf1)
{ {
std::string mm = to_string(m/p_.local_fetch_1); std::string mm = to_string(m/p_.lf1);
std::string kk = to_string(k); std::string kk = to_string(k);
if(last_iteration) if(last_iteration)
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << "ldsA[" << m*llda + k + s << "] = condx" << k + s << "? Ai[" << mm << "][" << k + s << ASTRIDE1 << "] : 0;" << std::endl; stream << "ldsA[" << m*llda + k + s << "] = condx" << k + s << "? Ai[" << mm << "][" << k + s << ASTRIDE1 << "] : 0;" << std::endl;
else else
@@ -346,13 +343,13 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "//Fetch B to local memory" << std::endl; stream << "//Fetch B to local memory" << std::endl;
if (B_trans_=='T') if (B_trans_=='T')
{ {
for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1) for(unsigned int k = 0; k < p_.kL; k += p_.lf1)
for(unsigned int n = 0; n < p_.nL; n += p_.local_fetch_0*p_.simd_width) for(unsigned int n = 0; n < p_.nL; n += p_.lf0*p_.vwidth)
{ {
std::string nn = to_string(n/(p_.simd_width*p_.local_fetch_0)); std::string nn = to_string(n/(p_.vwidth*p_.lf0));
std::string kk = to_string(k); std::string kk = to_string(k);
if(last_iteration) if(last_iteration)
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << "ldsB[" << k*lldb + n + s << "] = (condy" << k << " && " << s << "< N)? Bi[" << nn << "][" << kk << "*ldb +" << s << "] : 0;" << std::endl; stream << "ldsB[" << k*lldb + n + s << "] = (condy" << k << " && " << s << "< N)? Bi[" << nn << "][" << kk << "*ldb +" << s << "] : 0;" << std::endl;
else else
stream << VSTORE(VLOAD_MISALIGNED("0" ,"&Bi[" + nn +"][" + kk + "*ldb]"), "0", "ldsB + " + to_string(k*lldb+n)) << ";" << std::endl; stream << VSTORE(VLOAD_MISALIGNED("0" ,"&Bi[" + nn +"][" + kk + "*ldb]"), "0", "ldsB + " + to_string(k*lldb+n)) << ";" << std::endl;
@@ -360,13 +357,13 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
} }
else else
{ {
for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_0*p_.simd_width) for(unsigned int k = 0; k < p_.kL; k += p_.lf0*p_.vwidth)
for(unsigned int n = 0; n < p_.nL; n += p_.local_fetch_1) for(unsigned int n = 0; n < p_.nL; n += p_.lf1)
{ {
std::string nn = to_string(n/p_.local_fetch_1); std::string nn = to_string(n/p_.lf1);
std::string kk = to_string(k); std::string kk = to_string(k);
if(last_iteration) if(last_iteration)
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << "ldsB[" << n*lldb + k + s << "] = condx" << k + s << "? Bi[" << nn << "][" << k + s << BSTRIDE1 << "] : 0;" << std::endl; stream << "ldsB[" << n*lldb + k + s << "] = condx" << k + s << "? Bi[" << nn << "][" << k + s << BSTRIDE1 << "] : 0;" << std::endl;
else else
@@ -375,14 +372,14 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
} }
if(A_trans_=='N') if(A_trans_=='N')
stream << "ldsA = lA + ids.z*" << p_.simd_width << ";" << std::endl; stream << "ldsA = lA + ids.z*" << p_.vwidth << ";" << std::endl;
else else
stream << "ldsA = lA + ids.z*" << llda*p_.simd_width << ";" << std::endl; stream << "ldsA = lA + ids.z*" << llda*p_.vwidth << ";" << std::endl;
if(B_trans_=='T') if(B_trans_=='T')
stream << "ldsB = lB + ids.w*" << p_.simd_width << ";" << std::endl; stream << "ldsB = lB + ids.w*" << p_.vwidth << ";" << std::endl;
else else
stream << "ldsB = lB + ids.w*" << lldb*p_.simd_width << ";" << std::endl; stream << "ldsB = lB + ids.w*" << lldb*p_.vwidth << ";" << std::endl;
stream << "$LOCAL_BARRIER;" << std::endl; stream << "$LOCAL_BARRIER;" << std::endl;
@@ -393,19 +390,19 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "//Fetch A to registers" << std::endl; stream << "//Fetch A to registers" << std::endl;
stream << "#pragma unroll" << std::endl; stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int kk = 0; kk < " << p_.kS << "; kk++)" << std::endl; stream << "for(unsigned int kk = 0; kk < " << p_.kS << "; kk++)" << std::endl;
stream << "#pragma unroll " << p_.mS/p_.simd_width << std::endl; stream << "#pragma unroll " << p_.mS/p_.vwidth << std::endl;
stream << "for(unsigned int mm = 0; mm < " << p_.mS/p_.simd_width << "; mm++)" << std::endl; stream << "for(unsigned int mm = 0; mm < " << p_.mS/p_.vwidth << "; mm++)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
if(A_trans_=='N') if(A_trans_=='N')
stream << "rA[kk][mm] = " << VLOAD("0", "ldsA + k*" + to_string(llda) + " + mm*" + to_string(p_.local_size_0*p_.simd_width) + "+ kk*" + to_string(llda)) << ";" << std::endl; stream << "rA[kk][mm] = " << VLOAD("0", "ldsA + k*" + to_string(llda) + " + mm*" + to_string(p_.ls0*p_.vwidth) + "+ kk*" + to_string(llda)) << ";" << std::endl;
else else
{ {
if(p_.simd_width==1) if(p_.vwidth==1)
stream << "rA[kk][mm] = ldsA[k + mm*" << p_.local_size_0*llda << "+ kk" << "];" << std::endl; stream << "rA[kk][mm] = ldsA[k + mm*" << p_.ls0*llda << "+ kk" << "];" << std::endl;
else else
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << access_vector_type("rA[kk][mm]", s) << " = ldsA[k + (mm*" << p_.simd_width*p_.local_size_0 << " + " << s << ")*" << llda << "+ kk];" << std::endl; stream << access_vector_type("rA[kk][mm]", s) << " = ldsA[k + (mm*" << p_.vwidth*p_.ls0 << " + " << s << ")*" << llda << "+ kk];" << std::endl;
} }
stream.dec_tab(); stream.dec_tab();
@@ -414,19 +411,19 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "//Fetch B to registers" << std::endl; stream << "//Fetch B to registers" << std::endl;
stream << "#pragma unroll " << p_.kS << std::endl; stream << "#pragma unroll " << p_.kS << std::endl;
stream << "for(unsigned int kk = 0; kk < " << p_.kS << "; kk++)" << std::endl; stream << "for(unsigned int kk = 0; kk < " << p_.kS << "; kk++)" << std::endl;
stream << "#pragma unroll " << p_.nS/p_.simd_width << std::endl; stream << "#pragma unroll " << p_.nS/p_.vwidth << std::endl;
stream << "for(unsigned int nn = 0; nn < " << p_.nS/p_.simd_width << "; nn++)" << std::endl; stream << "for(unsigned int nn = 0; nn < " << p_.nS/p_.vwidth << "; nn++)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
if(B_trans_=='T') if(B_trans_=='T')
stream << "rB[kk][nn] = " << VLOAD("0", "ldsB + k*" + to_string(lldb) + " + nn*" + to_string(p_.local_size_1*p_.simd_width) + "+ kk*" + to_string(lldb)) << ";" << std::endl; stream << "rB[kk][nn] = " << VLOAD("0", "ldsB + k*" + to_string(lldb) + " + nn*" + to_string(p_.ls1*p_.vwidth) + "+ kk*" + to_string(lldb)) << ";" << std::endl;
else else
{ {
if(p_.simd_width==1) if(p_.vwidth==1)
stream << "rB[kk][nn] = ldsB[k" << " + nn*" << p_.local_size_1*lldb << "+ kk" << "];" << std::endl; stream << "rB[kk][nn] = ldsB[k" << " + nn*" << p_.ls1*lldb << "+ kk" << "];" << std::endl;
else else
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << access_vector_type("rB[kk][nn]", s) << " = ldsB[k" << " + (nn*" << p_.simd_width*p_.local_size_1 << " + " << s << ")*" << lldb << "+ kk];" << std::endl; stream << access_vector_type("rB[kk][nn]", s) << " = ldsB[k" << " + (nn*" << p_.vwidth*p_.ls1 << " + " << s << ")*" << lldb << "+ kk];" << std::endl;
} }
stream.dec_tab(); stream.dec_tab();
stream << "}" << std::endl; stream << "}" << std::endl;
@@ -437,14 +434,14 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
for(unsigned int mm=0; mm < p_.mS; ++mm){ for(unsigned int mm=0; mm < p_.mS; ++mm){
string res_str, lhs_str, rhs_str; string res_str, lhs_str, rhs_str;
res_str = "rC[" + to_string(mm) + "][" + to_string(nn) + "]"; res_str = "rC[" + to_string(mm) + "][" + to_string(nn) + "]";
if (p_.simd_width==1) if (p_.vwidth==1)
lhs_str = "rA[" + to_string(kk) + "][" + to_string(mm) + "]"; lhs_str = "rA[" + to_string(kk) + "][" + to_string(mm) + "]";
else else
lhs_str = access_vector_type("rA[" + to_string(kk) + "][" + to_string(mm/p_.simd_width) + "]", mm%p_.simd_width); lhs_str = access_vector_type("rA[" + to_string(kk) + "][" + to_string(mm/p_.vwidth) + "]", mm%p_.vwidth);
if (p_.simd_width==1) if (p_.vwidth==1)
rhs_str = "rB[" + to_string(kk) + "]["+to_string(nn)+"]"; rhs_str = "rB[" + to_string(kk) + "]["+to_string(nn)+"]";
else else
rhs_str = access_vector_type("rB[" + to_string(kk) + "]["+to_string(nn/p_.simd_width)+"]", nn%p_.simd_width); rhs_str = access_vector_type("rB[" + to_string(kk) + "]["+to_string(nn/p_.vwidth)+"]", nn%p_.vwidth);
stream << res_str << "= $MAD(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl; stream << res_str << "= $MAD(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl;
} }
@@ -476,15 +473,15 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
if(A_trans_=='N' || B_trans_=='T') if(A_trans_=='N' || B_trans_=='T')
{ {
stream << "int Ky = K - idT.y;" << std::endl; stream << "int Ky = K - idT.y;" << std::endl;
for(unsigned int k = 0; k < p_.kL; k += p_.local_fetch_1) for(unsigned int k = 0; k < p_.kL; k += p_.lf1)
stream << "int condy" << k << " = " << k << " < Ky;" << std::endl; stream << "int condy" << k << " = " << k << " < Ky;" << std::endl;
} }
if(A_trans_=='T' || B_trans_=='N') if(A_trans_=='T' || B_trans_=='N')
{ {
stream << "int Kx = K - idT.x;" << std::endl; stream << "int Kx = K - idT.x;" << std::endl;
for(unsigned int k = 0 ; k < p_.kL ; k += p_.local_fetch_0*p_.simd_width) for(unsigned int k = 0 ; k < p_.kL ; k += p_.lf0*p_.vwidth)
for(unsigned int s = 0 ; s < p_.simd_width ; ++s) for(unsigned int s = 0 ; s < p_.vwidth ; ++s)
stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl; stream << "int condx" << k + s << " = " << k + s << " < Kx;" << std::endl;
} }
fetch_to_lds(true); fetch_to_lds(true);
@@ -503,35 +500,35 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
stream << "N += ids.y;" << std::endl; stream << "N += ids.y;" << std::endl;
stream << "C += ids.x" << CSTRIDE1 << ";" << std::endl; stream << "C += ids.x" << CSTRIDE1 << ";" << std::endl;
stream << "C += ids.z*" << p_.simd_width << CSTRIDE1 << ";" << std::endl; stream << "C += ids.z*" << p_.vwidth << CSTRIDE1 << ";" << std::endl;
stream << "C += ids.y*ldc;" << std::endl; stream << "C += ids.y*ldc;" << std::endl;
stream << "C += ids.w*" << p_.simd_width << "*ldc;" << std::endl; stream << "C += ids.w*" << p_.vwidth << "*ldc;" << std::endl;
if(has_depth) if(has_depth)
stream << "C += gidz*ldc*N;" << std::endl; stream << "C += gidz*ldc*N;" << std::endl;
stream << "M -= ids.x;" << std::endl; stream << "M -= ids.x;" << std::endl;
stream << "M -= ids.z*" << p_.simd_width << ";" << std::endl; stream << "M -= ids.z*" << p_.vwidth << ";" << std::endl;
stream << "N -= ids.y;" << std::endl; stream << "N -= ids.y;" << std::endl;
stream << "N -= ids.w*" << p_.simd_width << ";" << std::endl; stream << "N -= ids.w*" << p_.vwidth << ";" << std::endl;
for(unsigned int n=0; n < p_.nS; ++n) for(unsigned int n=0; n < p_.nS; ++n)
{ {
string Cj = to_string((n/p_.simd_width)*(p_.local_size_1*p_.simd_width) + n%p_.simd_width); string Cj = to_string((n/p_.vwidth)*(p_.ls1*p_.vwidth) + n%p_.vwidth);
stream << "if(" << Cj << " >= N) return;" << std::endl; stream << "if(" << Cj << " >= N) return;" << std::endl;
for(unsigned int m=0; m < p_.mS; ++m) for(unsigned int m=0; m < p_.mS; ++m)
stream << "rC[" << m << "][" << n << "] *= alpha;" << std::endl; stream << "rC[" << m << "][" << n << "] *= alpha;" << std::endl;
for(unsigned int m=0; m < p_.mS; ++m) for(unsigned int m=0; m < p_.mS; ++m)
{ {
string Ci = to_string((m/p_.simd_width)*(p_.local_size_0*p_.simd_width) + m%p_.simd_width); string Ci = to_string((m/p_.vwidth)*(p_.ls0*p_.vwidth) + m%p_.vwidth);
stream << "if(" << Ci << "< M) "; stream << "if(" << Ci << "< M) ";
if(has_depth) if(has_depth)
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl; stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "];" << std::endl;
else else
stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + ((beta != (" << sdtype << ")0)?(beta*" << "C[" << Ci << CSTRIDE1 << "]):0);" << std::endl; stream << "C[" << Ci << CSTRIDE1 << "] = rC[" << m << "][" << n << "] + ((beta != (" << sdtype << ")0)?(beta*" << "C[" << Ci << CSTRIDE1 << "]):0);" << std::endl;
} }
if((n+1)%p_.simd_width==0){ if((n+1)%p_.vwidth==0){
stream << "C += ldc*" << p_.local_size_1*p_.simd_width - p_.simd_width + 1 << ";" << std::endl; stream << "C += ldc*" << p_.ls1*p_.vwidth - p_.vwidth + 1 << ";" << std::endl;
} }
else{ else{
stream << "C += ldc;" << std::endl; stream << "C += ldc;" << std::endl;
@@ -599,8 +596,8 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
reduce_name += suffix; reduce_name += suffix;
driver::Kernel matrix_product(program, matrix_product_name.c_str()); driver::Kernel matrix_product(program, matrix_product_name.c_str());
driver::NDRange local(p_.local_size_0, p_.local_size_1, 1); driver::NDRange local(p_.ls0, p_.ls1, 1);
driver::NDRange global(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth); driver::NDRange global(align(align(M,p_.mS)/p_.mS, p_.ls0), align(align(N,p_.nS)/p_.nS, p_.ls1), p_.depth);
unsigned int current_arg = 0; unsigned int current_arg = 0;
@@ -651,8 +648,8 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
{ {
unsigned int current_arg = 0; unsigned int current_arg = 0;
driver::Kernel reduce(program, reduce_name.c_str()); driver::Kernel reduce(program, reduce_name.c_str());
driver::NDRange local(p_.local_size_0, p_.local_size_1); driver::NDRange local(p_.ls0, p_.ls1);
driver::NDRange global(align(M, p_.local_size_0), align(N, p_.local_size_1)); driver::NDRange global(align(M, p_.ls0), align(N, p_.ls1));
reduce.setSizeArg(current_arg++, M); reduce.setSizeArg(current_arg++, M);
reduce.setSizeArg(current_arg++, N); reduce.setSizeArg(current_arg++, N);
reduce.setSizeArg(current_arg++, p_.depth); reduce.setSizeArg(current_arg++, p_.depth);
@@ -717,7 +714,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
matrix_product_nn::matrix_product_nn(unsigned int simd matrix_product_nn::matrix_product_nn(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D , int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns , int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch , fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1) : , int_t lfetch0, int_t lfetch1) :
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'N', 'N') matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'N', 'N')
{ {
@@ -727,7 +724,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
matrix_product_tn::matrix_product_tn(unsigned int simd matrix_product_tn::matrix_product_tn(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D , int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns , int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch , fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1) : , int_t lfetch0, int_t lfetch1) :
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'T', 'N') matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'T', 'N')
{ } { }
@@ -736,7 +733,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
matrix_product_nt::matrix_product_nt(unsigned int simd matrix_product_nt::matrix_product_nt(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D , int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns , int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch , fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1) : , int_t lfetch0, int_t lfetch1) :
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'N', 'T') matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'N', 'T')
{ } { }
@@ -745,7 +742,7 @@ matrix_product_parameters::matrix_product_parameters(unsigned int simd_width
matrix_product_tt::matrix_product_tt(unsigned int simd matrix_product_tt::matrix_product_tt(unsigned int simd
, int_t ls0, int_t KL, int_t ls1, int_t D , int_t ls0, int_t KL, int_t ls1, int_t D
, int_t ms, int_t ks, int_t ns , int_t ms, int_t ks, int_t ns
, fetching_policy_type Afetch , fetching_policy_type Bfetch , fetch_type Afetch , fetch_type Bfetch
, int_t lfetch0, int_t lfetch1) : , int_t lfetch0, int_t lfetch1) :
matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'T', 'T') matrix_product(matrix_product_parameters(simd, ls0, KL, ls1, D, ms, ks, ns, Afetch, Bfetch, lfetch0, lfetch1), 'T', 'T')
{ } { }

View File

@@ -35,19 +35,19 @@ namespace isaac
{ {
namespace templates namespace templates
{ {
reduce_1d_parameters::reduce_1d_parameters(unsigned int _simd_width, reduce_1d_parameters::reduce_1d_parameters(unsigned int _vwidth,
unsigned int _group_size, unsigned int _num_groups, unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) : base::parameters_type(_simd_width, _group_size, 1, 2), num_groups(_num_groups), fetching_policy(_fetching_policy) fetch_type _fetch) : base::parameters_type(_vwidth, _group_size, 1, 2), num_groups(_num_groups), fetch(_fetch)
{ } { }
unsigned int reduce_1d::lmem_usage(expression_tree const & x) const unsigned int reduce_1d::lmem_usage(expression_tree const & x) const
{ {
return p_.local_size_0*size_of(x.dtype()); return p_.ls0*size_of(x.dtype());
} }
int reduce_1d::is_invalid_impl(driver::Device const &, expression_tree const &) const int reduce_1d::is_invalid_impl(driver::Device const &, expression_tree const &) const
{ {
if (p_.fetching_policy==FETCH_FROM_LOCAL) if (p_.fetch==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE; return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID; return TEMPLATE_VALID;
} }
@@ -122,7 +122,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree
case driver::CUDA: case driver::CUDA:
stream << "#include \"vector.h\"" << std::endl; break; stream << "#include \"vector.h\"" << std::endl; break;
case driver::OPENCL: case driver::OPENCL:
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl; break; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << ",1,1)))" << std::endl; break;
} }
stream << "$KERNEL void prod" << suffix << "($SIZE_T N, $GLOBAL char* tmp," << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl; stream << "$KERNEL void prod" << suffix << "($SIZE_T N, $GLOBAL char* tmp," << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
@@ -139,33 +139,33 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree
{ {
if(is_indexing(rd->op().type)) if(is_indexing(rd->op().type))
{ {
stream << rd->process("$LOCAL #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl; stream << rd->process("$LOCAL #scalartype #name_buf_value[" + tools::to_string(p_.ls0) + "];") << std::endl;
stream << rd->process("#scalartype #name_acc_value = " + neutral_element(rd->op(), backend, "#scalartype") + ";") << std::endl; stream << rd->process("#scalartype #name_acc_value = " + neutral_element(rd->op(), backend, "#scalartype") + ";") << std::endl;
stream << rd->process("$LOCAL unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl; stream << rd->process("$LOCAL unsigned int #name_buf[" + tools::to_string(p_.ls0) + "];") << std::endl;
stream << rd->process("unsigned int #name_acc = 0;") << std::endl; stream << rd->process("unsigned int #name_acc = 0;") << std::endl;
} }
else else
{ {
stream << rd->process("$LOCAL #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl; stream << rd->process("$LOCAL #scalartype #name_buf[" + tools::to_string(p_.ls0) + "];") << std::endl;
stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";") << std::endl; stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";") << std::endl;
} }
} }
element_wise_loop_1D(stream, p_.fetching_policy, p_.simd_width, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int simd_width) element_wise_loop_1D(stream, p_.fetch, p_.vwidth, "i", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int vwidth)
{ {
std::string dtype = append_width("#scalartype",simd_width); std::string dtype = append_width("#scalartype",vwidth);
//Fetch vector entry //Fetch vector entry
std::set<std::string> fetched; std::set<std::string> fetched;
for (symbolic::reduce_1d* rd : reductions) for (symbolic::reduce_1d* rd : reductions)
for(symbolic::leaf* leaf: symbolic::extract<symbolic::leaf>(tree, symbols, rd->root(), false)) for(symbolic::leaf* leaf: symbolic::extract<symbolic::leaf>(tree, symbols, rd->root(), false))
if(fetched.insert(leaf->process("#name")).second) if(fetched.insert(leaf->process("#name")).second)
stream << leaf->process(dtype + " #name = " + append_width("loadv", simd_width) + "(i);") << std::endl; stream << leaf->process(dtype + " #name = " + append_width("loadv", vwidth) + "(i);") << std::endl;
//Update accumulators //Update accumulators
for (symbolic::reduce_1d* rd : reductions) for (symbolic::reduce_1d* rd : reductions)
for (unsigned int s = 0; s < simd_width; ++s) for (unsigned int s = 0; s < vwidth; ++s)
{ {
std::string value = rd->lhs()->evaluate({{"leaf", access_vector_type("#name", s, simd_width)}}); std::string value = rd->lhs()->evaluate({{"leaf", access_vector_type("#name", s, vwidth)}});
if (is_indexing(rd->op().type)) if (is_indexing(rd->op().type))
compute_index_reduce_1d(stream, rd->process("#name_acc"), "i*" + tools::to_string(simd_width) + "+" + tools::to_string(s), rd->process("#name_acc_value"), value,rd->op()); compute_index_reduce_1d(stream, rd->process("#name_acc"), "i*" + tools::to_string(vwidth) + "+" + tools::to_string(s), rd->process("#name_acc_value"), value,rd->op());
else else
compute_reduce_1d(stream, rd->process("#name_acc"), value,rd->op()); compute_reduce_1d(stream, rd->process("#name_acc"), value,rd->op());
} }
@@ -178,7 +178,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree
stream << rd->process("#name_buf[lid] = #name_acc;") << std::endl; stream << rd->process("#name_buf[lid] = #name_acc;") << std::endl;
} }
//Reduce local memory //Reduce local memory
reduce_1d_local_memory(stream, p_.local_size_0, reductions, "#name_buf", "#name_buf_value", backend); reduce_1d_local_memory(stream, p_.ls0, reductions, "#name_buf", "#name_buf_value", backend);
//Write to temporary buffers //Write to temporary buffers
stream << "if (lid==0)" << std::endl; stream << "if (lid==0)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
@@ -209,14 +209,14 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree
{ {
if (is_indexing(rd->op().type)) if (is_indexing(rd->op().type))
{ {
stream << rd->process("$LOCAL unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];"); stream << rd->process("$LOCAL unsigned int #name_buf[" + tools::to_string(p_.ls0) + "];");
stream << rd->process("unsigned int #name_acc = 0;") << std::endl; stream << rd->process("unsigned int #name_acc = 0;") << std::endl;
stream << rd->process("$LOCAL #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl; stream << rd->process("$LOCAL #scalartype #name_buf_value[" + tools::to_string(p_.ls0) + "];") << std::endl;
stream << rd->process("#scalartype #name_acc_value = " + neutral_element(rd->op(), backend, "#scalartype") + ";"); stream << rd->process("#scalartype #name_acc_value = " + neutral_element(rd->op(), backend, "#scalartype") + ";");
} }
else else
{ {
stream << rd->process("$LOCAL #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl; stream << rd->process("$LOCAL #scalartype #name_buf[" + tools::to_string(p_.ls0) + "];") << std::endl;
stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";"); stream << rd->process("#scalartype #name_acc = " + neutral_element(rd->op(), backend, "#scalartype") + ";");
} }
} }
@@ -238,7 +238,7 @@ std::string reduce_1d::generate_impl(std::string const & suffix, expression_tree
stream << rd->process("#name_buf[lid] = #name_acc;") << std::endl; stream << rd->process("#name_buf[lid] = #name_acc;") << std::endl;
} }
//Local reduction //Local reduction
reduce_1d_local_memory(stream, p_.local_size_0, reductions, "#name_buf", "#name_buf_value", backend); reduce_1d_local_memory(stream, p_.ls0, reductions, "#name_buf", "#name_buf_value", backend);
//Write //Write
stream << "if (lid==0)" << std::endl; stream << "if (lid==0)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
@@ -258,7 +258,7 @@ reduce_1d::reduce_1d(reduce_1d::parameters_type const & parameters,
{ } { }
reduce_1d::reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng, reduce_1d::reduce_1d(unsigned int simd, unsigned int ls, unsigned int ng,
fetching_policy_type fetch, fusion_policy_t bind): fetch_type fetch, fusion_policy_t bind):
base_impl<reduce_1d, reduce_1d_parameters>(reduce_1d_parameters(simd,ls,ng,fetch), bind) base_impl<reduce_1d, reduce_1d_parameters>(reduce_1d_parameters(simd,ls,ng,fetch), bind)
{} {}
@@ -284,8 +284,8 @@ void reduce_1d::enqueue(driver::CommandQueue & queue, driver::Program const & pr
driver::Kernel kernels[2] = { driver::Kernel(program,name[0].c_str()), driver::Kernel(program,name[1].c_str()) }; driver::Kernel kernels[2] = { driver::Kernel(program,name[0].c_str()), driver::Kernel(program,name[1].c_str()) };
//NDRange //NDRange
driver::NDRange global[2] = { driver::NDRange(p_.local_size_0*p_.num_groups), driver::NDRange(p_.local_size_0) }; driver::NDRange global[2] = { driver::NDRange(p_.ls0*p_.num_groups), driver::NDRange(p_.ls0) };
driver::NDRange local[2] = { driver::NDRange(p_.local_size_0), driver::NDRange(p_.local_size_0) }; driver::NDRange local[2] = { driver::NDRange(p_.ls0), driver::NDRange(p_.ls0) };
//Arguments //Arguments
for (auto & kernel : kernels) for (auto & kernel : kernels)
{ {

View File

@@ -39,9 +39,9 @@ namespace isaac
namespace templates namespace templates
{ {
reduce_2d_parameters::reduce_2d_parameters(unsigned int _simd_width, reduce_2d_parameters::reduce_2d_parameters(unsigned int _vwidth,
unsigned int _local_size_0, unsigned int _local_size_1, unsigned int _ls0, unsigned int _ls1,
unsigned int _num_groups_0, unsigned int _num_groups_1, fetching_policy_type _fetch_policy): base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), unsigned int _num_groups_0, unsigned int _num_groups_1, fetch_type _fetch_policy): base::parameters_type(_vwidth, _ls0, _ls1, 1),
num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetch_policy(_fetch_policy) { } num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetch_policy(_fetch_policy) { }
@@ -54,7 +54,7 @@ int reduce_2d::is_invalid_impl(driver::Device const &, expression_tree const &)
unsigned int reduce_2d::lmem_usage(const expression_tree&) const unsigned int reduce_2d::lmem_usage(const expression_tree&) const
{ {
return (p_.local_size_0+1)*p_.local_size_1; return (p_.ls0+1)*p_.ls1;
} }
unsigned int reduce_2d::temporary_workspace(expression_tree const & expressions) const unsigned int reduce_2d::temporary_workspace(expression_tree const & expressions) const
@@ -80,7 +80,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
name[0] += suffix; name[0] += suffix;
name[1] += suffix; name[1] += suffix;
unsigned int ldls = p_.local_size_0; unsigned int ldls = p_.ls0;
std::string ls0ldstr = to_string(ldls); std::string ls0ldstr = to_string(ldls);
auto unroll_tmp = [&]() auto unroll_tmp = [&]()
@@ -113,7 +113,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
stream << "#include \"vector.h\"" << std::endl; stream << "#include \"vector.h\"" << std::endl;
break; break;
case driver::OPENCL: case driver::OPENCL:
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << "," << p_.ls1 << ",1)))" << std::endl;
break; break;
} }
stream << "$KERNEL void " << name[0] << "($SIZE_T M, $SIZE_T N, $GLOBAL char* tmp, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl; stream << "$KERNEL void " << name[0] << "($SIZE_T M, $SIZE_T N, $GLOBAL char* tmp, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl;
@@ -125,13 +125,13 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
stream << "$SIZE_T lidy = $LOCAL_IDX_1;" << std::endl; stream << "$SIZE_T lidy = $LOCAL_IDX_1;" << std::endl;
//Loop r //Loop r
std::ostringstream upper; std::ostringstream upper;
upper << "(M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1; upper << "(M +" << p_.ls1 - 1 << ")/" << p_.ls1 << "*" << p_.ls1;
element_wise_loop_1D(stream, p_.fetch_policy, (reduction_type_==REDUCE_ROWS)?p_.simd_width:1, "r", upper.str(), "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", device, [&](unsigned int cwidth) element_wise_loop_1D(stream, p_.fetch_policy, (reduction_type_==REDUCE_ROWS)?p_.vwidth:1, "r", upper.str(), "$GLOBAL_IDX_1", "$GLOBAL_SIZE_1", device, [&](unsigned int cwidth)
{ {
//Declare Buffers //Declare Buffers
for (symbolic::reduce_2d* rd : reductions) for (symbolic::reduce_2d* rd : reductions)
stream << rd->process("$LOCAL " + append_width("#scalartype", cwidth) + " #name_buf[" + to_string(p_.local_size_1*ldls) + "];") << std::endl; stream << rd->process("$LOCAL " + append_width("#scalartype", cwidth) + " #name_buf[" + to_string(p_.ls1*ldls) + "];") << std::endl;
//Accumulators //Accumulators
for (symbolic::reduce_2d* rd : reductions){ for (symbolic::reduce_2d* rd : reductions){
@@ -142,7 +142,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
stream << "if (r < M)" << std::endl; stream << "if (r < M)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
element_wise_loop_1D(stream, p_.fetch_policy, (reduction_type_==REDUCE_COLUMNS)?p_.simd_width:1, "c", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int rwidth) element_wise_loop_1D(stream, p_.fetch_policy, (reduction_type_==REDUCE_COLUMNS)?p_.vwidth:1, "c", "N", "$GLOBAL_IDX_0", "$GLOBAL_SIZE_0", device, [&](unsigned int rwidth)
{ {
std::string rdtype = append_width("#scalartype", rwidth); std::string rdtype = append_width("#scalartype", rwidth);
std::string cdtype = append_width("#scalartype", cwidth); std::string cdtype = append_width("#scalartype", cwidth);
@@ -173,7 +173,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
stream << rd->process("#name_buf[lidy*" + ls0ldstr + "+ lidx] = #name_acc;") << std::endl; stream << rd->process("#name_buf[lidy*" + ls0ldstr + "+ lidx] = #name_acc;") << std::endl;
//Reduce local memory //Reduce local memory
stream << "#pragma unroll" << std::endl; stream << "#pragma unroll" << std::endl;
stream << "for($SIZE_T stride = " << p_.local_size_0/2 << "; stride >0; stride /=2)" << std::endl; stream << "for($SIZE_T stride = " << p_.ls0/2 << "; stride >0; stride /=2)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
stream << "$LOCAL_BARRIER;" << std::endl; stream << "$LOCAL_BARRIER;" << std::endl;
@@ -220,14 +220,14 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
if(p_.num_groups_0>1) if(p_.num_groups_0>1)
{ {
if(backend==driver::OPENCL) if(backend==driver::OPENCL)
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl; stream << " __attribute__((reqd_work_group_size(" << p_.ls0 << "," << p_.ls1 << ",1)))" << std::endl;
stream << "$KERNEL void " << name[1] << "($SIZE_T M, $SIZE_T N , $GLOBAL char* tmp, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl; stream << "$KERNEL void " << name[1] << "($SIZE_T M, $SIZE_T N , $GLOBAL char* tmp, " << tools::join(kernel_arguments(device, symbols, tree), ", ") << ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
unroll_tmp(); unroll_tmp();
for (symbolic::reduce_2d* rd : reductions) for (symbolic::reduce_2d* rd : reductions)
stream << rd->process("$LOCAL #scalartype #name_buf[" + to_string(p_.local_size_1*ldls) + "];") << std::endl; stream << rd->process("$LOCAL #scalartype #name_buf[" + to_string(p_.ls1*ldls) + "];") << std::endl;
stream << "for($SIZE_T r = $GLOBAL_IDX_1; r < (M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << "; r += " << GlobalSize1(backend) << "){" << std::endl; stream << "for($SIZE_T r = $GLOBAL_IDX_1; r < (M +" << p_.ls1 - 1 << ")/" << p_.ls1 << "*" << p_.ls1 << "; r += " << GlobalSize1(backend) << "){" << std::endl;
stream.inc_tab(); stream.inc_tab();
stream << "$SIZE_T lidx = $LOCAL_IDX_0;" << std::endl; stream << "$SIZE_T lidx = $LOCAL_IDX_0;" << std::endl;
stream << "$SIZE_T lidy = $LOCAL_IDX_1;" << std::endl; stream << "$SIZE_T lidy = $LOCAL_IDX_1;" << std::endl;
@@ -247,7 +247,7 @@ std::string reduce_2d::generate_impl(std::string const & suffix, expression_tree
for (symbolic::reduce_2d* rd : reductions) for (symbolic::reduce_2d* rd : reductions)
stream << rd->process("#name_buf[lidy*" + ls0ldstr + "+ lidx] = #name_acc;") << std::endl; stream << rd->process("#name_buf[lidy*" + ls0ldstr + "+ lidx] = #name_acc;") << std::endl;
stream << "#pragma unroll" << std::endl; stream << "#pragma unroll" << std::endl;
stream << "for($SIZE_T stride = " << p_.local_size_0/2 << "; stride >0; stride /=2)" << std::endl; stream << "for($SIZE_T stride = " << p_.ls0/2 << "; stride >0; stride /=2)" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
stream << "$LOCAL_BARRIER;" << std::endl; stream << "$LOCAL_BARRIER;" << std::endl;
@@ -326,8 +326,8 @@ void reduce_2d::enqueue(driver::CommandQueue & queue, driver::Program const & pr
} }
//NDRange //NDRange
driver::NDRange global[2] = { driver::NDRange(p_.local_size_0*p_.num_groups_0, p_.local_size_1*p_.num_groups_1), driver::NDRange(p_.local_size_0, p_.local_size_1*p_.num_groups_1) }; driver::NDRange global[2] = { driver::NDRange(p_.ls0*p_.num_groups_0, p_.ls1*p_.num_groups_1), driver::NDRange(p_.ls0, p_.ls1*p_.num_groups_1) };
driver::NDRange local[2] = { driver::NDRange(p_.local_size_0, p_.local_size_1), driver::NDRange(p_.local_size_0, p_.local_size_1) }; driver::NDRange local[2] = { driver::NDRange(p_.ls0, p_.ls1), driver::NDRange(p_.ls0, p_.ls1) };
for(unsigned int i = 0 ; i < nk ; ++i) for(unsigned int i = 0 ; i < nk ; ++i)
control.execution_options().enqueue(program.context(), kernels[i], global[i], local[i]); control.execution_options().enqueue(program.context(), kernels[i], global[i], local[i]);
} }
@@ -335,12 +335,12 @@ void reduce_2d::enqueue(driver::CommandQueue & queue, driver::Program const & pr
reduce_2d_rows::reduce_2d_rows(reduce_2d_parameters const & parameters,fusion_policy_t fusion_policy): reduce_2d(parameters, REDUCE_ROWS, fusion_policy){} reduce_2d_rows::reduce_2d_rows(reduce_2d_parameters const & parameters,fusion_policy_t fusion_policy): reduce_2d(parameters, REDUCE_ROWS, fusion_policy){}
reduce_2d_rows::reduce_2d_rows(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, reduce_2d_rows::reduce_2d_rows(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, fusion_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind) {} fetch_type fetch, fusion_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_ROWS, bind) {}
reduce_2d_cols::reduce_2d_cols(reduce_2d::parameters_type const & parameters, fusion_policy_t fusion_policy): reduce_2d(parameters, REDUCE_COLUMNS, fusion_policy){} reduce_2d_cols::reduce_2d_cols(reduce_2d::parameters_type const & parameters, fusion_policy_t fusion_policy): reduce_2d(parameters, REDUCE_COLUMNS, fusion_policy){}
reduce_2d_cols::reduce_2d_cols(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2, reduce_2d_cols::reduce_2d_cols(unsigned int simd, unsigned int ls1, unsigned int ls2, unsigned int ng1, unsigned int ng2,
fetching_policy_type fetch, fusion_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind) {} fetch_type fetch, fusion_policy_t bind): reduce_2d(reduce_2d_parameters(simd, ls1, ls2, ng1, ng2, fetch), REDUCE_COLUMNS, bind) {}
} }

View File

@@ -29,7 +29,7 @@ namespace isaac
namespace templates namespace templates
{ {
inline void fetching_loop_info(fetching_policy_type policy, std::string const & bound, kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size, driver::Device const &) inline void fetching_loop_info(fetch_type policy, std::string const & bound, kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size, driver::Device const &)
{ {
if (policy==FETCH_FROM_GLOBAL_STRIDED) if (policy==FETCH_FROM_GLOBAL_STRIDED)
{ {
@@ -54,10 +54,10 @@ inline void fetching_loop_info(fetching_policy_type policy, std::string const &
template<class Fun> template<class Fun>
inline void element_wise_loop_1D(kernel_generation_stream & stream, fetching_policy_type fetch, unsigned int simd_width, inline void element_wise_loop_1D(kernel_generation_stream & stream, fetch_type fetch, unsigned int vwidth,
std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size, driver::Device const & device, Fun const & generate_body) std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size, driver::Device const & device, Fun const & generate_body)
{ {
std::string strwidth = tools::to_string(simd_width); std::string strwidth = tools::to_string(vwidth);
std::string init, upper_bound, inc; std::string init, upper_bound, inc;
fetching_loop_info(fetch, bound, stream, init, upper_bound, inc, domain_id, domain_size, device); fetching_loop_info(fetch, bound, stream, init, upper_bound, inc, domain_id, domain_size, device);
@@ -65,11 +65,11 @@ inline void element_wise_loop_1D(kernel_generation_stream & stream, fetching_pol
stream << "for(unsigned int " << i << " = " << init << "*" << strwidth << "; " << i << " < " << boundround << "; " << i << " += " << inc << "*" << strwidth << ")" << std::endl; stream << "for(unsigned int " << i << " = " << init << "*" << strwidth << "; " << i << " < " << boundround << "; " << i << " += " << inc << "*" << strwidth << ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;
stream.inc_tab(); stream.inc_tab();
generate_body(simd_width); generate_body(vwidth);
stream.dec_tab(); stream.dec_tab();
stream << "}" << std::endl; stream << "}" << std::endl;
if (simd_width>1) if (vwidth>1)
{ {
stream << "for(unsigned int " << i << " = " << boundround << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl; stream << "for(unsigned int " << i << " = " << boundround << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl;
stream << "{" << std::endl; stream << "{" << std::endl;

View File

@@ -43,9 +43,9 @@ inline std::string access_vector_type(std::string const & v, int i)
} }
} }
inline std::string access_vector_type(std::string const & v, int i, unsigned int simd_width) inline std::string access_vector_type(std::string const & v, int i, unsigned int vwidth)
{ {
if(simd_width==1) if(vwidth==1)
return v; return v;
else else
return access_vector_type(v, i); return access_vector_type(v, i);
@@ -59,40 +59,40 @@ inline std::string append_width(std::string const & str, unsigned int width)
} }
inline std::string vstore(unsigned int simd_width, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, std::string const & stride, driver::backend_type backend, bool aligned = true) inline std::string vstore(unsigned int vwidth, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, std::string const & stride, driver::backend_type backend, bool aligned = true)
{ {
std::string vdtype = append_width(dtype,simd_width); std::string vdtype = append_width(dtype,vwidth);
if (simd_width==1) if (vwidth==1)
return "(" + ptr + ")[" + offset + "] = " + value; return "(" + ptr + ")[" + offset + "] = " + value;
else else
{ {
if(backend == driver::CUDA && stride == "1" && aligned) if(backend == driver::CUDA && stride == "1" && aligned)
return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value; return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
else if(backend == driver::OPENCL && stride == "1") else if(backend == driver::OPENCL && stride == "1")
return append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")"; return append_width("vstore", vwidth) + "(" + value + ", " + offset + ", " + ptr + ")";
else else
{ {
std::string stridestr = (stride=="1")?"":("*" + stride); std::string stridestr = (stride=="1")?"":("*" + stride);
std::string res; std::string res;
for(unsigned int s = 0 ; s < simd_width ; ++s) for(unsigned int s = 0 ; s < vwidth ; ++s)
res += (s>0?";(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s); res += (s>0?";(":"(") + ptr + ")[" + offset + "*" + tools::to_string(vwidth) + " + " + tools::to_string(s) + stridestr + "] = " + access_vector_type(value, s);
return res; return res;
} }
} }
} }
inline std::string vload(unsigned int simd_width, std::string const & dtype, std::string const & offset, std::string const & ptr, std::string const & stride, driver::backend_type backend, bool aligned = true) inline std::string vload(unsigned int vwidth, std::string const & dtype, std::string const & offset, std::string const & ptr, std::string const & stride, driver::backend_type backend, bool aligned = true)
{ {
std::string vdtype = append_width(dtype,simd_width); std::string vdtype = append_width(dtype,vwidth);
if (simd_width==1) if (vwidth==1)
return "(" + ptr + ")[" + offset + "]"; return "(" + ptr + ")[" + offset + "]";
else else
{ {
if(backend == driver::CUDA && stride == "1" && aligned) if(backend == driver::CUDA && stride == "1" && aligned)
return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "]"; return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "]";
else if(backend == driver::OPENCL && stride == "1") else if(backend == driver::OPENCL && stride == "1")
return append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")"; return append_width("vload", vwidth) + "(" + offset + ", " + ptr + ")";
else else
{ {
std::string stridestr = (stride=="1")?"":("*" + stride); std::string stridestr = (stride=="1")?"":("*" + stride);
@@ -101,8 +101,8 @@ inline std::string vload(unsigned int simd_width, std::string const & dtype, std
res = "make_" + vdtype + "("; res = "make_" + vdtype + "(";
else else
res = "(" + vdtype + ")("; res = "(" + vdtype + ")(";
for(unsigned int s = 0 ; s < simd_width ; ++s) for(unsigned int s = 0 ; s < vwidth ; ++s)
res += ((s>0)?",(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + stridestr + "]"; res += ((s>0)?",(":"(") + ptr + ")[" + offset + "*" + tools::to_string(vwidth) + " + " + tools::to_string(s) + stridestr + "]";
res += ")"; res += ")";
return res; return res;
} }

View File

@@ -142,7 +142,7 @@ profiles::value_type::templates_container const & profiles::value_type::template
std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x) std::shared_ptr<templates::base> profiles::create(std::string const & template_name, std::vector<int> const & x)
{ {
templates::fetching_policy_type fetch[] = {templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_GLOBAL_STRIDED, templates::FETCH_FROM_GLOBAL_CONTIGUOUS}; templates::fetch_type fetch[] = {templates::FETCH_FROM_LOCAL, templates::FETCH_FROM_GLOBAL_STRIDED, templates::FETCH_FROM_GLOBAL_CONTIGUOUS};
if(template_name=="elementwise_1d") if(template_name=="elementwise_1d")
return std::shared_ptr<templates::base>(new templates::elementwise_1d(x[0], x[1], x[2], fetch[x[3]])); return std::shared_ptr<templates::base>(new templates::elementwise_1d(x[0], x[1], x[2], fetch[x[3]]));
else if(template_name=="reduce_1d") else if(template_name=="reduce_1d")