Initial commit

This commit is contained in:
Philippe Tillet
2014-08-30 18:02:17 -04:00
parent 5f4bab6e47
commit 609c7af6ce
25 changed files with 5515 additions and 0 deletions

18
CMakeLists.txt Normal file
View File

@@ -0,0 +1,18 @@
cmake_minimum_required(VERSION 2.8)
# Add visibility of headers
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.hpp *.h)
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
find_package(ViennaCL QUIET REQUIRED)
include_directories(${VIENNACL_INCLUDE_DIRS})
find_package(OpenCL QUIET REQUIRED)
include_directories(${OPENCL_INCLUDE_DIRS})
include_directories(${PROJECT_SOURCE_DIR})
INCLUDE(CTest)
add_subdirectory(tests)

31
atidlas/execute.hpp Normal file
View File

@@ -0,0 +1,31 @@
#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
#define VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
#include <cstring>
#include <vector>
#include <typeinfo>
#include "viennacl/tools/tools.hpp"
#include "viennacl/tools/timer.hpp"
#include "viennacl/scheduler/forwards.h"
#include "atidlas/forwards.h"
#include "atidlas/templates/template_base.hpp"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/execution_handler.hpp"
namespace atidlas
{
inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false)
{
//Generate program name
std::string program_name = tree_parsing::statements_representation(statements, BIND_TO_HANDLE);
execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation);
handler.add(program_name, T, statements);
handler.execute(program_name, statements);
}
}
#endif

View File

@@ -0,0 +1,77 @@
#ifndef ATIDLAS_EXECUTION_HANDLER_HPP
#define ATIDLAS_EXECUTION_HANDLER_HPP
#include <map>
#include "viennacl/tools/shared_ptr.hpp"
#include "atidlas/lazy_program_compiler.hpp"
#include "atidlas/templates/template_base.hpp"
namespace atidlas
{
class execution_handler
{
public:
typedef std::map< std::string, tools::shared_ptr<template_base> > container_type;
private:
std::string append_prefix(std::string const & str)
{
return "_" + str;
}
std::string define_extension(std::string const & ext)
{
// Note: On devices without double precision support, 'ext' is an empty string.
return (ext.length() > 1) ? std::string("#pragma OPENCL EXTENSION " + ext + " : enable\n") : std::string("\n");
}
void init_program_compiler(std::string const & name, bool force_recompilation)
{
lazy_programs_.push_back(lazy_program_compiler(&ctx_, name, force_recompilation));
lazy_programs_.back().add(define_extension(device_.double_support_extension()));
}
public:
execution_handler(std::string const & program_name_base, viennacl::ocl::context & ctx, viennacl::ocl::device const & device, bool force_recompilation = false) : ctx_(ctx), device_(device), program_names_(2), init_done_(false)
{
lazy_programs_.reserve(2);
init_program_compiler(program_name_base + "_0", force_recompilation);
init_program_compiler(program_name_base + "_1", force_recompilation);
}
void add(std::string const & key, template_base const & T, statements_container const & statements)
{
if (kernels_.insert(container_type::value_type(key, T.clone())).second)
{
std::vector<std::string> sources = kernels_.at(key)->generate(append_prefix(key), statements, device_);
assert(sources.size()<=2);
for (unsigned int i = 0; i < sources.size(); ++i)
lazy_programs_[i].add(sources[i]);
}
}
template_base * template_of(std::string const & key)
{
return kernels_.at(key).get();
}
void execute(container_type::key_type const & key, statements_container const & statements)
{
tools::shared_ptr<template_base> & template_pointer = kernels_.at(key);
template_pointer->enqueue(append_prefix(key), lazy_programs_, statements);
}
private:
viennacl::ocl::context & ctx_;
viennacl::ocl::device const & device_;
container_type kernels_;
std::vector<std::string> program_names_;
std::vector<lazy_program_compiler> lazy_programs_;
bool init_done_;
};
}
#endif

258
atidlas/forwards.h Normal file
View File

@@ -0,0 +1,258 @@
#ifndef ATIDLAS_FORWARDS_H
#define ATIDLAS_FORWARDS_H
#include <list>
#include <map>
#include <set>
#include <stdexcept>
#include "atidlas/tools/shared_ptr.hpp"
#include "viennacl/scheduler/io.hpp"
#include "viennacl/ocl/forwards.h"
#include "viennacl/scheduler/forwards.h"
#include "viennacl/backend/mem_handle.hpp"
namespace atidlas
{
typedef int atidlas_int_t;
//Error codes
static const int TEMPLATE_VALID = 0;
static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
static const int TEMPLATE_AlignmentV_MUST_BE_BLOCK_SIZE_MULTIPLE = -8;
static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE= -9;
static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
struct atidlas_int_tuple
{
atidlas_int_tuple(std::string const & _i, std::string const & _bound0) : i(_i), bound0(_bound0), j(""), bound1(""){ }
atidlas_int_tuple(std::string const & _i, std::string const & _bound0, std::string const & _j, std::string const & _bound1) : i(_i), bound0(_bound0), j(_j), bound1(_bound1){ }
std::string i;
std::string bound0;
std::string j;
std::string bound1;
};
inline bool is_scalar_reduction(viennacl::scheduler::statement_node const & node)
{
return node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE || node.op.type_family==viennacl::scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY;
}
inline bool is_vector_reduction(viennacl::scheduler::statement_node const & node)
{
return node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
|| node.op.type_family==viennacl::scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|| node.op.type_family==viennacl::scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
}
inline viennacl::scheduler::statement_node const & lhs_most(viennacl::scheduler::statement::container_type const & array, size_t root)
{
viennacl::scheduler::statement_node const * current = &array[root];
while (current->lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
current = &array[current->lhs.node_index];
return *current;
}
enum expression_type
{
SCALAR_AXPY_TYPE,
VECTOR_AXPY_TYPE,
MATRIX_AXPY_TYPE,
REDUCTION_TYPE,
ROW_WISE_REDUCTION_Nx_TYPE,
ROW_WISE_REDUCTION_Tx_TYPE,
MATRIX_PRODUCT_NN_TYPE,
MATRIX_PRODUCT_TN_TYPE,
MATRIX_PRODUCT_NT_TYPE,
MATRIX_PRODUCT_TT_TYPE,
INVALID_EXPRESSION_TYPE
};
inline const char * expression_type_to_string(expression_type type)
{
switch (type)
{
case SCALAR_AXPY_TYPE : return "Scalar AXPY";
case VECTOR_AXPY_TYPE : return "Vector AXPY";
case MATRIX_AXPY_TYPE : return "Matrix AXPY";
case REDUCTION_TYPE : return "Reduction";
case ROW_WISE_REDUCTION_Nx_TYPE : return "Row-wise reduction: Ax";
case ROW_WISE_REDUCTION_Tx_TYPE : return "Row-wise reduction : Tx";
case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product : AA";
case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product : TA";
case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product : AT";
case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product : TT";
default : return "INVALID EXPRESSION";
}
}
/** @brief generate the string for a pointer kernel argument */
static std::string generate_value_kernel_argument(std::string const & scalartype, std::string const & name)
{
return scalartype + ' ' + name + ",";
}
/** @brief generate the string for a pointer kernel argument */
static std::string generate_pointer_kernel_argument(std::string const & address_space, std::string const & scalartype, std::string const & name)
{
return address_space + " " + scalartype + "* " + name + ",";
}
/** @brief Emulation of C++11's .at() member for std::map<> */
template<typename KeyT, typename ValueT>
ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
{
typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
if (it != map.end())
return it->second;
throw std::out_of_range("Generator: Key not found in map");
}
/** @brief Exception for the case the generator is unable to deal with the operation */
class generator_not_supported_exception : public std::exception
{
public:
generator_not_supported_exception() : message_() {}
generator_not_supported_exception(std::string message) : message_("ViennaCL: Internal error: The generator cannot handle the statement provided: " + message) {}
virtual const char* what() const throw() { return message_.c_str(); }
virtual ~generator_not_supported_exception() throw() {}
private:
std::string message_;
};
namespace utils
{
class kernel_generation_stream;
}
enum leaf_t
{
LHS_NODE_TYPE,
PARENT_NODE_TYPE,
RHS_NODE_TYPE
};
class mapped_object;
class template_base;
typedef std::pair<atidlas_int_t, leaf_t> mapping_key;
typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
namespace tree_parsing
{
template<class Fun>
inline void traverse(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, Fun const & fun, bool inspect);
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
viennacl::scheduler::statement const & statement, size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed);
inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors, viennacl::scheduler::statement const & statement, atidlas_int_t root_idx,mapping_type const & mapping);
}
using viennacl::scheduler::INT_TYPE;
using viennacl::scheduler::UINT_TYPE;
using viennacl::scheduler::ULONG_TYPE;
using viennacl::scheduler::LONG_TYPE;
using viennacl::scheduler::FLOAT_TYPE;
using viennacl::scheduler::DOUBLE_TYPE;
typedef cl_uint vendor_id_type;
typedef cl_device_type device_type;
typedef std::string device_name_type;
class symbolic_binder
{
public:
virtual ~symbolic_binder(){ }
virtual bool bind(viennacl::backend::mem_handle const * ph) = 0;
virtual unsigned int get(viennacl::backend::mem_handle const * ph) = 0;
};
class bind_to_handle : public symbolic_binder
{
public:
bind_to_handle() : current_arg_(0){ }
bool bind(viennacl::backend::mem_handle const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; }
unsigned int get(viennacl::backend::mem_handle const * ph){ return bind(ph)?current_arg_++:memory.at((void*)ph); }
private:
unsigned int current_arg_;
std::map<void*,unsigned int> memory;
};
class bind_all_unique : public symbolic_binder
{
public:
bind_all_unique() : current_arg_(0){ }
bool bind(viennacl::backend::mem_handle const *) {return true; }
unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; }
private:
unsigned int current_arg_;
std::map<void*,unsigned int> memory;
};
enum binding_policy_t{
BIND_ALL_UNIQUE,
BIND_TO_HANDLE
};
inline tools::shared_ptr<symbolic_binder> make_binder(binding_policy_t policy)
{
if (policy==BIND_TO_HANDLE)
return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
else
return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
}
template<char C>
struct char_to_type{ };
class statements_container
{
public:
typedef std::list<viennacl::scheduler::statement> data_type;
enum order_type { SEQUENTIAL, INDEPENDENT };
statements_container(data_type const & data, order_type order) : data_(data), order_(order)
{ }
statements_container(viennacl::scheduler::statement const & s0) : order_(INDEPENDENT)
{
data_.push_back(s0);
}
statements_container(viennacl::scheduler::statement const & s0, viennacl::scheduler::statement const & s1, order_type order) : order_(order)
{
data_.push_back(s0);
data_.push_back(s1);
}
std::list<viennacl::scheduler::statement> const & data() const { return data_; }
order_type order() const { return order_; }
private:
std::list<viennacl::scheduler::statement> data_;
order_type order_;
};
}
#endif

View File

@@ -0,0 +1,41 @@
#ifndef ATIDLAS_LAZY_PROGRAM_COMPILER_HPP
#define ATIDLAS_LAZY_PROGRAM_COMPILER_HPP
#include <map>
#include "viennacl/ocl/context.hpp"
namespace atidlas
{
class lazy_program_compiler
{
public:
lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, std::string const & src, bool force_recompilation) : ctx_(ctx), name_(name), src_(src), force_recompilation_(force_recompilation){ }
lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, bool force_recompilation) : ctx_(ctx), name_(name), force_recompilation_(force_recompilation){ }
void add(std::string const & src) { src_+=src; }
std::string const & src() const { return src_; }
viennacl::ocl::program & program()
{
if (force_recompilation_ && ctx_->has_program(name_))
ctx_->delete_program(name_);
if (!ctx_->has_program(name_))
{
// std::cout << src_ << std::endl;
ctx_->add_program(src_, name_);
}
return ctx_->get_program(name_);
}
private:
viennacl::ocl::context * ctx_;
std::string name_;
std::string src_;
bool force_recompilation_;
};
}
#endif

429
atidlas/mapped_objects.hpp Normal file
View File

@@ -0,0 +1,429 @@
#ifndef ATIDLAS_MAPPED_TYPE_HPP
#define ATIDLAS_MAPPED_TYPE_HPP
#include <string>
#include "viennacl/scheduler/forwards.h"
#include "atidlas/forwards.h"
#include "atidlas/tools/find_and_replace.hpp"
#include "atidlas/utils.hpp"
namespace atidlas
{
/** @brief Mapped Object
*
* This object populates the symbolic mapping associated with a statement. (root_id, LHS|RHS|PARENT) => mapped_object
* The tree can then be reconstructed in its symbolic form
*/
class mapped_object
{
private:
virtual void postprocess(std::string &) const { }
protected:
struct MorphBase { virtual ~MorphBase(){} };
struct MorphBase1D : public MorphBase { public: virtual std::string operator()(std::string const & i) const = 0; };
struct MorphBase2D : public MorphBase { public: virtual std::string operator()(std::string const & i, std::string const & j) const = 0; };
static void replace_offset(std::string & str, MorphBase const & morph)
{
size_t pos = 0;
while ((pos=str.find("$OFFSET", pos))!=std::string::npos)
{
std::string postprocessed;
size_t pos_po = str.find('{', pos);
size_t pos_pe = str.find('}', pos_po);
if (MorphBase2D const * p = dynamic_cast<MorphBase2D const *>(&morph))
{
size_t pos_comma = str.find(',', pos_po);
std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1);
std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1);
postprocessed = (*p)(i, j);
}
else if (MorphBase1D const * p = dynamic_cast<MorphBase1D const *>(&morph))
{
std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1);
postprocessed = (*p)(i);
}
str.replace(pos, pos_pe + 1 - pos, postprocessed);
pos = pos_pe;
}
}
void register_attribute(std::string & attribute, std::string const & key, std::string const & value)
{
attribute = value;
keywords_[key] = attribute;
}
public:
struct node_info
{
node_info(mapping_type const * _mapping, viennacl::scheduler::statement const * _statement, atidlas_int_t _root_idx) :
mapping(_mapping), statement(_statement), root_idx(_root_idx) { }
mapping_type const * mapping;
viennacl::scheduler::statement const * statement;
atidlas_int_t root_idx;
};
public:
mapped_object(std::string const & scalartype, unsigned int id, std::string const & type_key) : type_key_(type_key)
{
register_attribute(scalartype_, "#scalartype", scalartype);
register_attribute(name_, "#name", "obj" + tools::to_string(id));
}
virtual ~mapped_object(){ }
std::string type_key() const { return type_key_; }
std::string const & name() const { return name_; }
std::string process(std::string const & in) const
{
std::string res(in);
for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it)
tools::find_and_replace(res, it->first, it->second);
postprocess(res);
return res;
}
std::string evaluate(std::map<std::string, std::string> const & accessors) const
{
if (accessors.find(type_key_)==accessors.end())
return name_;
return process(accessors.at(type_key_));
}
protected:
std::string name_;
std::string scalartype_;
std::string type_key_;
std::map<std::string, std::string> keywords_;
};
/** @brief Binary leaf interface
*
* Some subtrees have to be interpret at leaves when reconstructing the final expression. It is the case of trans(), diag(), prod(), etc...
* This interface stores basic infos about the sub-trees
*/
class binary_leaf
{
public:
binary_leaf(mapped_object::node_info info) : info_(info){ }
void process_recursive(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors)
{
std::set<std::string> already_fetched;
tree_parsing::process(stream, leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping, already_fetched);
}
std::string evaluate_recursive(leaf_t leaf, std::map<std::string, std::string> const & accessors)
{
return tree_parsing::evaluate(leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
protected:
mapped_object::node_info info_;
};
/** @brief Matrix product
*
* Maps prod(matrix_expression, matrix_expression)
*/
class mapped_matrix_product : public mapped_object, public binary_leaf
{
public:
mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { }
};
/** @brief Reduction
*
* Base class for mapping a reduction
*/
class mapped_reduction : public mapped_object, public binary_leaf
{
public:
mapped_reduction(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) : mapped_object(scalartype, id, type_key), binary_leaf(info){ }
atidlas_int_t root_idx() const { return info_.root_idx; }
viennacl::scheduler::statement const & statement() const { return *info_.statement; }
viennacl::scheduler::statement_node root_node() const { return statement().array()[root_idx()]; }
bool is_index_reduction() const { return utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); }
viennacl::scheduler::op_element root_op() const
{
viennacl::scheduler::op_element res = info_.statement->array()[info_.root_idx].op;
if (res.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
||res.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
res.type = viennacl::scheduler::OPERATION_BINARY_ADD_TYPE;
return res;
}
};
/** @brief Scalar reduction
*
* Maps a scalar reduction (max, min, argmax, inner_prod, etc..)
*/
class mapped_scalar_reduction : public mapped_reduction
{
public:
mapped_scalar_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ }
};
/** @brief Vector reduction
*
* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc..)
*/
class mapped_row_wise_reduction : public mapped_reduction
{
public:
mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") { }
};
/** @brief Host scalar
*
* Maps a host scalar (passed by value)
*/
class mapped_host_scalar : public mapped_object
{
public:
mapped_host_scalar(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "host_scalar"){ }
};
/** @brief Handle
*
* Maps an object passed by pointer
*/
class mapped_handle : public mapped_object
{
public:
mapped_handle(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_object(scalartype, id, type_key)
{
register_attribute(pointer_, "#pointer", name_ + "_pointer");
}
private:
std::string pointer_;
};
/** @brief Scalar
*
* Maps a scalar passed by pointer
*/
class mapped_scalar : public mapped_handle
{
public:
mapped_scalar(std::string const & scalartype, unsigned int id) : mapped_handle(scalartype, id, "scalar") { }
};
/** @brief Buffered
*
* Maps a buffered object (vector, matrix)
*/
class mapped_buffer : public mapped_handle
{
public:
mapped_buffer(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_handle(scalartype, id, type_key){ }
};
/** @brief Vector
*
* Maps a vector
*/
class mapped_vector : public mapped_buffer
{
public:
mapped_vector(std::string const & scalartype, unsigned int id) : mapped_buffer(scalartype, id, "vector")
{
register_attribute(start_, "#start", name_ + "_start");
register_attribute(stride_, "#stride", name_ + "_stride");
}
private:
std::string start_;
std::string stride_;
};
/** @brief Matrix
*
* Maps a matrix
*/
class mapped_matrix : public mapped_buffer
{
private:
void postprocess(std::string & str) const
{
struct Morph : public MorphBase2D
{
Morph(bool _is_row_major, std::string const & _ld) : is_row_major(_is_row_major), ld(_ld){ }
std::string operator()(std::string const & i, std::string const & j) const
{
if (is_row_major)
return "(" + i + ") * " + ld + " + (" + j + ")";
return "(" + i + ") + (" + j + ") * " + ld;
}
private:
bool is_row_major;
std::string const & ld;
};
replace_offset(str, Morph(row_major_, ld_));
}
public:
mapped_matrix(std::string const & scalartype, unsigned int id, bool row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major)
{
register_attribute(ld_, "#ld", name_ + "_ld");
register_attribute(start1_, "#start1", name_ + "_start1");
register_attribute(start2_, "#start2", name_ + "_start2");
register_attribute(stride1_, "#stride1", name_ + "_stride1");
register_attribute(stride2_, "#stride2", name_ + "_stride2");
if (row_major_)
keywords_["#nldstride"] = "#stride1";
else
keywords_["#nldstride"] = "#stride2";
}
bool row_major() const
{
return row_major_;
}
private:
std::string ld_;
std::string start1_;
std::string start2_;
std::string stride1_;
std::string stride2_;
bool row_major_;
};
/** @brief Vector diag
*
* Maps a diag(vector_expression) node into a diagonal matrix
*/
class mapped_vector_diag : public mapped_object, public binary_leaf
{
private:
void postprocess(std::string &res) const
{
std::map<std::string, std::string> accessors;
tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
accessors["vector"] = res;
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
public:
mapped_vector_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "vector_diag"), binary_leaf(info){ }
};
/** @brief Trans
*
* Maps trans(matrix_expression) into the transposed of matrix_expression
*/
class mapped_trans: public mapped_object, public binary_leaf
{
private:
void postprocess(std::string &res) const
{
std::map<std::string, std::string> accessors;
accessors["matrix"] = res;
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
public:
mapped_trans(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ }
};
/** @brief Matrix row
*
* Maps row(matrix_expression, scalar_expression) into the scalar_expression's row of matrix_expression
*/
class mapped_matrix_row : public mapped_object, binary_leaf
{
private:
void postprocess(std::string &res) const
{
std::map<std::string, std::string> accessors;
tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
accessors["matrix"] = res;
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
public:
mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info)
{ }
};
/** @brief Matrix column
*
* Maps column(matrix_expression, scalar_expression) into the scalar_expression's column of matrix_expression
*/
class mapped_matrix_column : public mapped_object, binary_leaf
{
private:
void postprocess(std::string &res) const
{
std::map<std::string, std::string> accessors;
tools::find_and_replace(res, "#column", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
accessors["matrix"] = res;
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
public:
mapped_matrix_column(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_column"), binary_leaf(info)
{ }
};
/** @brief Matrix diag
*
* Maps a diag(matrix_expression) node into the vector of its diagonal elements
*/
class mapped_matrix_diag : public mapped_object, binary_leaf
{
private:
void postprocess(std::string &res) const
{
std::map<std::string, std::string> accessors;
tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
accessors["matrix"] = res;
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
}
public:
mapped_matrix_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_diag"), binary_leaf(info)
{ }
};
/** @brief Implicit vector
*
* Maps an implicit vector
*/
class mapped_implicit_vector : public mapped_object
{
public:
mapped_implicit_vector(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_vector")
{ }
};
/** @brief Implicit matrix
*
* Maps an implicit matrix
*/
class mapped_implicit_matrix : public mapped_object
{
public:
mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_matrix")
{ }
};
}
#endif

View File

@@ -0,0 +1,137 @@
#ifndef ATIDLAS_TEMPLATES_MATRIX_AXPY_HPP
#define ATIDLAS_TEMPLATES_MATRIX_AXPY_HPP
#include <vector>
#include "atidlas/mapped_objects.hpp"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/utils.hpp"
#include "atidlas/templates/template_base.hpp"
#include "viennacl/scheduler/forwards.h"
#include "viennacl/tools/tools.hpp"
namespace atidlas
{
class matrix_axpy_parameters_type : public template_base::parameters_type
{
public:
matrix_axpy_parameters_type(unsigned int _simd_width,
unsigned int _local_size_0, unsigned int _local_size_1,
unsigned int _num_groups_0, unsigned int _num_groups_1,
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetching_policy(_fetching_policy){ }
unsigned int num_groups_0;
unsigned int num_groups_1;
fetching_policy_type fetching_policy;
};
class matrix_axpy_template : public template_base_impl<matrix_axpy_template, matrix_axpy_parameters_type>
{
private:
int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
{
if (p_.simd_width>1)
return TEMPLATE_INVALID_SIMD_WIDTH;
return TEMPLATE_VALID;
}
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
{
utils::kernel_generation_stream stream;
std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
stream << "__kernel void " << kernel_prefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
("matrix", "#pointer += $OFFSET{#start1, #start2};")
("vector", "#pointer += #start;"), statements, mappings);
fetching_loop_info(p_.fetching_policy, "M", stream, init0, upper_bound0, inc0, "get_global_id(0)", "get_global_size(0)");
stream << "for(unsigned int i = " << init0 << "; i < " << upper_bound0 << "; i += " << inc0 << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
fetching_loop_info(p_.fetching_policy, "N", stream, init1, upper_bound1, inc1, "get_global_id(1)", "get_global_size(1)");
stream << "for(unsigned int j = " << init1 << "; j < " << upper_bound1 << "; j += " << inc1 << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("matrix", utils::append_width("#scalartype",simd_width) + " #namereg = #pointer[$OFFSET{i*#stride1,j*#stride2}];")
("vector_diag", "#scalartype #namereg = ((i + ((#diag_offset<0)?#diag_offset:0))!=(j-((#diag_offset>0)?#diag_offset:0)))?0:#pointer[min(i*#stride, j*#stride)];")
, statements, mappings);
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, utils::create_evaluate_accessors("matrix", "#namereg")
("vector_diag", "#namereg")
("scalar", "#namereg")
, statements, mappings);
tree_parsing::process(stream, LHS_NODE_TYPE, utils::create_process_accessors("matrix", "#pointer[$OFFSET{i*#stride1,j*#stride2}] = #namereg;")
, statements, mappings);
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
return stream.str();
}
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
{
std::vector<std::string> res;
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1));
return res;
}
public:
matrix_axpy_template(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<matrix_axpy_template, matrix_axpy_parameters_type>(parameters, binding_policy), up_to_internal_size_(false){ }
void up_to_internal_size(bool v)
{
up_to_internal_size_ = v;
}
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
{
viennacl::ocl::kernel & kernel = programs[0].program().get_kernel(kernel_prefix);
kernel.local_work_size(0, p_.local_size_0);
kernel.local_work_size(1, p_.local_size_1);
kernel.global_work_size(0,p_.local_size_0*p_.num_groups_0);
kernel.global_work_size(1,p_.local_size_1*p_.num_groups_1);
scheduler::statement_node const & root = statements.data().front().array()[statements.data().front().root()];
unsigned int current_arg = 0;
if (up_to_internal_size_)
{
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size1_fun())));
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size2_fun())));
}
else
{
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size1_fun())));
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size2_fun())));
}
set_arguments(statements, kernel, current_arg);
viennacl::ocl::enqueue(kernel);
}
private:
bool up_to_internal_size_;
};
}
#endif

View File

@@ -0,0 +1,845 @@
#ifndef ATIDLAS_TEMPLATES_MATRIX_PRODUCT_HPP
#define ATIDLAS_TEMPLATES_MATRIX_PRODUCT_HPP
#include <vector>
#include "viennacl/scheduler/forwards.h"
#include "viennacl/matrix_def.hpp"
#include "viennacl/matrix_proxy.hpp"
#include "atidlas/templates/template_base.hpp"
#include "atidlas/mapped_objects.hpp"
#include "atidlas/utils.hpp"
#include "atidlas/tree_parsing.hpp"
#include "viennacl/forwards.h"
#include "viennacl/tools/tools.hpp"
namespace atidlas
{
struct matrix_product_parameters : public template_base::parameters_type
{
matrix_product_parameters(unsigned int simd_width
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1
, unsigned int ms, unsigned int ks, unsigned int ns
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy
, unsigned int local_fetch_0, unsigned int local_fetch_1): template_base::parameters_type(simd_width, local_size_0, local_size_1, 1),
kL(KL), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy), B_fetching_policy(B_fetching_policy),
local_fetch_0(local_fetch_0), local_fetch_1(local_fetch_1),
mL(ms*local_size_0), nL(ns*local_size_1){}
unsigned int kL;
unsigned int mS;
unsigned int kS;
unsigned int nS;
fetching_policy_type A_fetching_policy;
fetching_policy_type B_fetching_policy;
unsigned int local_fetch_0;
unsigned int local_fetch_1;
unsigned int mL;
unsigned int nL;
};
class matrix_product_template : public template_base_impl<matrix_product_template, matrix_product_parameters>
{
private:
unsigned int n_lmem_elements() const
{
unsigned int N = 0;
if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
N += p_.kL * (p_.mL+1);
if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
N += p_.nL * (p_.kL+1);
return N;
}
int check_invalid_impl(viennacl::ocl::device const & /*device*/) const
{
if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
if (viennacl::dense_padding_size % p_.mL > 0 || viennacl::dense_padding_size % p_.kL > 0 || viennacl::dense_padding_size % p_.nL > 0)
return TEMPLATE_AlignmentV_MUST_BE_BLOCK_SIZE_MULTIPLE;
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
if (p_.kS > p_.kL)
return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1)
return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL)
{
if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1))
return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
}
if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
{
unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
}
if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
{
unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
}
return TEMPLATE_VALID;
}
static void parse(scheduler::statement const & s,
atidlas_int_t & C_idx, leaf_t & C_leaf, atidlas_int_t & alpha_idx, leaf_t & alpha_leaf,
atidlas_int_t & A_idx, leaf_t & A_leaf, bool& A_trans, atidlas_int_t & B_idx, leaf_t & B_leaf, bool& B_trans,
atidlas_int_t & beta_idx, leaf_t & beta_leaf)
{
using namespace tree_parsing;
using namespace scheduler;
scheduler::statement::container_type const & array = s.array();
atidlas_int_t root_idx = s.root();
C_idx = root_idx;
C_leaf = LHS_NODE_TYPE;
atidlas_int_t node_add_idx = array[root_idx].rhs.node_index;
atidlas_int_t node_1_idx = array[node_add_idx].lhs.node_index;
alpha_idx = node_1_idx;
alpha_leaf = RHS_NODE_TYPE;
atidlas_int_t mat_prod_idx = array[node_1_idx].lhs.node_index;
if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY)
{
A_trans = false;
A_idx = mat_prod_idx;
}
else
{
A_trans = true;
A_idx = array[mat_prod_idx].lhs.node_index;
}
A_leaf = LHS_NODE_TYPE;
if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY)
{
B_trans = false;
B_idx = mat_prod_idx;
B_leaf = RHS_NODE_TYPE;
}
else
{
B_trans = true;
B_idx = array[mat_prod_idx].rhs.node_index;
B_leaf = LHS_NODE_TYPE;
}
atidlas_int_t node_2_idx = array[node_add_idx].rhs.node_index;
beta_idx = node_2_idx;
beta_leaf = RHS_NODE_TYPE;
}
void handle_bounds(bool fallback, utils::kernel_generation_stream & stream, std::string const & inbounds, std::string const & do_if, std::string do_else) const
{
if (fallback)
{
stream << "if (" << inbounds << ")" << std::endl;
stream.inc_tab();
stream << do_if << ";" << std::endl;
stream.dec_tab();
stream << "else" << std::endl;
stream.inc_tab();
stream << do_else << ";" << std::endl;
stream.dec_tab();
}
else
stream << do_if << ";" << std::endl;
}
std::string generate_impl(const std::string &kernel_prefix, const statements_container &statements, const std::vector<mapping_type> &mappings, bool fallback) const
{
using std::string;
using tools::to_string;
parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, p_.local_fetch_1);
parameters_type const & p = fallback?pfallback:p_;
#define MUL_STRIDE1 string(fallback?"*#stride1":"")
#define HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0"))
#define VLOAD(offset, ptr) vload(p.simd_width, offset, ptr)
#define VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr)
string widthstr = tools::to_string(p.simd_width);
//////////////////
/// INIT
/// //////////////
utils::kernel_generation_stream stream;
scheduler::statement const & st = statements.data().front();
mapping_type const & mapping = mappings.front();
bool A_trans = false, B_trans = false;
atidlas_int_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
mapped_matrix * C = (mapped_matrix*)mapping.at(mapping_key(C_idx, C_leaf)).get();
mapped_host_scalar * alpha = (mapped_host_scalar*)mapping.at(mapping_key(alpha_idx, alpha_leaf)).get();
mapped_matrix * A = (mapped_matrix*)mapping.at(mapping_key(A_idx, A_leaf)).get();
mapped_matrix * B = (mapped_matrix*)mapping.at(mapping_key(B_idx, B_leaf)).get();
mapped_host_scalar * beta = (mapped_host_scalar*)mapping.at(mapping_key(beta_idx, beta_leaf)).get();
//////////////////
/// DECLARATIONS
/// //////////////
stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl;
std::map<std::string, unsigned int> widths;
widths[A->name()] = p.simd_width;
widths[B->name()] = p.simd_width;
generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths);
stream << "{" << std::endl;
stream.inc_tab();
if(!fallback)
{
stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << std::endl;
stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
stream << B->process("#start1/= " + to_string(p.simd_width) + ";") << std::endl;
stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
}
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("matrix", "#pointer += $OFFSET{#start1, #start2};")
("matrix", "#ld *= #nldstride;"), statements, mappings);
///Result Values
stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl;
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + to_string(p.mS) + "];") << std::endl;
else
stream << A->process(utils::append_width("#scalartype",p.simd_width) + " rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << std::endl;
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + to_string(p.nS) + "];");
else
stream << B->process(utils::append_width("#scalartype",p.simd_width) + " rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << std::endl;
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
stream << A->process("__local #scalartype lA[" + to_string(p.kL*(p.mL+1)) + "];");
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
stream << B->process("__local #scalartype lB[" + to_string(p.kL*(p.nL+1)) + "];");
stream << std::endl;
stream << "uint gidx = get_group_id(0);" << std::endl;
stream << "uint gidy = get_group_id(1);" << std::endl;
stream << "uint idx = get_local_id(0);" << std::endl;
stream << "uint idy = get_local_id(1);" << std::endl;
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
{
stream << std::endl;
stream << "uint idt = " << p.local_size_0 << "*idy + idx;" << std::endl;
stream << "uint idxT = idt % " << p.local_fetch_0 << ";" << std::endl;
stream << "uint idyT = idt / " << p.local_fetch_0 << ";" << std::endl;
}
stream << std::endl;
if (fallback)
{
//Bounds checking for M (in A, C)
stream << "bool in_bounds_m[" << p.mS << "];" << std::endl;
stream << "for(unsigned int m = 0; m < " << p.mS << "; m++)" << std::endl;
stream.inc_tab();
switch (p.A_fetching_policy)
{
case FETCH_FROM_GLOBAL_CONTIGUOUS:
stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + m < M;" << std::endl;
break;
default:
stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << p.local_size_0 << " < M;" << std::endl;
break;
}
stream.dec_tab();
//Bounds checking for A if Local
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
{
unsigned int fetch_size = (A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << std::endl;
stream << "for(unsigned int m = 0; m < " << p.mL/fetch_size << "; m++)" << std::endl;
stream.inc_tab();
stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << (A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl;
stream.dec_tab();
}
//Bounds checking for N (in B, C)
stream << "bool in_bounds_n[" << p.nS << "];" << std::endl;
stream << "for(unsigned int n = 0; n < " << p.nS << "; n++)" << std::endl;
stream.inc_tab();
switch (p.B_fetching_policy)
{
case FETCH_FROM_GLOBAL_CONTIGUOUS:
stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + n < N;" << std::endl;
break;
default:
stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << p.local_size_1 << " < N;" << std::endl;
break;
}
stream.dec_tab();
//Bounds checking for B if Local
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
{
unsigned int fetch_size = (B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << std::endl;
stream << "for(unsigned int n = 0; n < " << p.nL/fetch_size << "; n++)" << std::endl;
stream.inc_tab();
stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << (B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl;
stream.dec_tab();
}
}
switch (p.A_fetching_policy)
{
case FETCH_FROM_LOCAL:
if (A_trans_=='N')
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + " + idxT)" + MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
else
stream << A->process("#pointer += idxT" + MUL_STRIDE1 + " + gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
break;
case FETCH_FROM_GLOBAL_CONTIGUOUS:
if (A_trans_=='N')
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + MUL_STRIDE1 + ";") << std::endl;
else
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")*#ld;") << std::endl;
break;
case FETCH_FROM_GLOBAL_STRIDED:
if (A_trans_=='N')
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx" + ")" + MUL_STRIDE1 + ";") << std::endl;
else
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl;
break;
default: break;
}
switch (p.B_fetching_policy)
{
case FETCH_FROM_LOCAL:
if (B_trans_=='T')
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + " + idxT" + ")" + MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
else
stream << B->process("#pointer += idxT" + MUL_STRIDE1 + " + gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
break;
case FETCH_FROM_GLOBAL_CONTIGUOUS:
if (B_trans_=='T')
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + MUL_STRIDE1 + ";") << std::endl;
else
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")*#ld;") << std::endl;
break;
case FETCH_FROM_GLOBAL_STRIDED:
if (B_trans_=='T')
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy" + ")" + MUL_STRIDE1 + ";") << std::endl;
else
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl;
break;
default: break;
}
stream << std::endl;
stream << "for(unsigned int block_k=0; block_k < K; block_k+=" << p.kL << "){" << std::endl;
stream.inc_tab();
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
{
if (A_trans_=='N')
stream << A->process("__local #scalartype* plA = lA + idyT*" + to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
else
stream << A->process("__local #scalartype* plA = lA + idxT*" + to_string(p.mL + 1) + " + idyT;") << std::endl;
}
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
{
if (B_trans_=='T')
stream << B->process("__local #scalartype* plB = lB + idyT*" + to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
else
stream << B->process("__local #scalartype* plB = lB + idxT*" + to_string(p.nL+1) + "+ idyT;") <<std::endl;
}
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
///Fetch LHS to Local Memory
if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N')
for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width)
{
string in_bounds = "in_bounds_m_local[" + to_string(m/(p.local_fetch_0*p.simd_width)) + "]";
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + MUL_STRIDE1 + "]";
stream << A->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl;
}
else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T')
for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1)
for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width)
{
string in_bounds = "in_bounds_m_local[" + to_string(k/p.local_fetch_1) + "]";
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + MUL_STRIDE1 + "]";
stream << A->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl;
}
if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T')
for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width)
{
string in_bounds = "in_bounds_n_local[" + to_string(n/(p.local_fetch_0*p.simd_width)) + "]";
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + MUL_STRIDE1 + "]";
stream << B->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl;
}
else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N')
for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1)
for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width)
{
string in_bounds = "in_bounds_n_local[" + to_string(k/p.local_fetch_1) + "]";
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + MUL_STRIDE1 + "]";
stream << B->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl;
}
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == FETCH_FROM_LOCAL)
{
stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
stream << "uint offA = " << p.simd_width << "*idx;" << std::endl;
stream << "uint offB = " << p.simd_width << "*idy;" << std::endl;
}
if (fallback)
stream << "for(unsigned int k = 0; k < " << p.kL << " && (block_k + k < K); k+=" << p.kS << "){" << std::endl;
else
stream << "for(unsigned int k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << std::endl;
stream.inc_tab();
///Fetch LHS to registers
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
switch (p.A_fetching_policy)
{
case FETCH_FROM_LOCAL:
for (unsigned int ss = 0; ss < p.simd_width; ++ss)
stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" << std::endl;
break;
case FETCH_FROM_GLOBAL_CONTIGUOUS:
{
if (A_trans_=='N')
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + MUL_STRIDE1 + "]")) << ";" << std::endl;
else
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
break;
}
case FETCH_FROM_GLOBAL_STRIDED:
{
if (A_trans_=='N')
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + to_string(p.local_size_0) + MUL_STRIDE1 + "]")) << ";" << std::endl;
else
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + to_string(p.local_size_0) + " + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
break;
}
default:
break;
}
stream.dec_tab();
stream << "}" << std::endl;
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
switch (p.B_fetching_policy)
{
case FETCH_FROM_LOCAL:
for (unsigned int ss = 0; ss < p.simd_width; ++ss)
stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + nn*" << p.local_size_1*p.simd_width << "+" << ss << "+ kk*" << (p.nL+1) << "];" << std::endl;
break;
case FETCH_FROM_GLOBAL_CONTIGUOUS:
{
if (B_trans_=='T')
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + MUL_STRIDE1 + "]")) << ";" << std::endl;
else
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
break;
}
case FETCH_FROM_GLOBAL_STRIDED:
{
if (B_trans_=='T')
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + to_string(p.local_size_1) + MUL_STRIDE1 + "]")) << ";" << std::endl;
else
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + to_string(p.local_size_1) + " + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
break;
}
default: break;
}
stream.dec_tab();
stream << "}" << std::endl;
///Increment pointers
switch (p.A_fetching_policy)
{
case FETCH_FROM_LOCAL:
stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl;
break;
default:
if (A_trans_=='N')
stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
else
stream << A->process("#pointer += " + to_string(p.kS) + "" + MUL_STRIDE1 + ";") << std::endl;
break;
}
switch (p.B_fetching_policy)
{
case FETCH_FROM_LOCAL:
stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl;
break;
default:
if (B_trans_=='T')
stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
else
stream << B->process("#pointer += " + to_string(p.kS) + "" + MUL_STRIDE1 + ";") << std::endl;
break;
}
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int kk = 0; kk <" << p.kS << "; ++kk)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (unsigned int nn=0; nn < p.nS; ++nn)
for (unsigned int mm=0; mm < p.mS; ++mm)
{
string res_str, lhs_str, rhs_str;
res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + "]";
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
lhs_str = "rA[kk][" + tools::to_string(mm) + "]";
else
lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + tools::to_string(mm%p.simd_width);
if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
rhs_str = "rB[kk]["+tools::to_string(nn)+"]";
else
rhs_str = "rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width);
stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl;
}
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
//Increment global pointer if local memory is used
//Else, it's incremented directly when fetching
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
{
if (A_trans_=='N')
stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
else
stream << A->process("#pointer += " + to_string(p.kL) + "" + MUL_STRIDE1 + ";") << std::endl;
}
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
{
if (B_trans_=='T')
stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
else
stream << B->process("#pointer += " + to_string(p.kL) + "" + MUL_STRIDE1 + ";") << std::endl;
}
stream.dec_tab();
stream << "}" << std::endl;
if (C->row_major())
{
unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << std::endl;
stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#ld;") << std::endl;
stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#stride2;") << std::endl;
stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#stride2;") << std::endl;
for (unsigned int n=0; n < p.nS; ++n)
{
for (unsigned int m=0; m < p.mS; ++m)
{
unsigned int ministride1 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0;
string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + m%p.simd_width);
if (fallback)
{
stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
stream.inc_tab();
}
stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
if (fallback)
stream.dec_tab();
}
if ((n+1)%p.simd_width>0 || p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
stream << C->process("#pointer += #stride2;") << std::endl;
else
stream << C->process("#pointer += " + to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << std::endl;
}
}
else
{
unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#stride1;") << std::endl;
stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#stride1;") << std::endl;
stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << std::endl;
stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#ld;") << std::endl;
for (unsigned int m=0; m < p.mS; ++m)
{
for (unsigned int n=0; n < p.nS; ++n)
{
unsigned int ministride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1;
string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + n%p.simd_width);
if (fallback)
{
stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
stream.inc_tab();
}
stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
if (fallback)
stream.dec_tab();
}
if ((m+1)%p.simd_width>0 || p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
stream << C->process("#pointer += #stride1;") << std::endl;
else
stream << C->process("#pointer += " + to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << std::endl;
}
}
stream.dec_tab();
stream << "}" << std::endl;
return stream.str();
#undef MUL_STRIDE1
#undef HANDLE_BOUNDS
#undef VLOAD
#undef VST0RE
}
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
{
std::vector<std::string> res;
res.push_back(generate_impl(kernel_prefix, statements, mappings, false));
res.push_back(generate_impl(kernel_prefix, statements, mappings, true));
return res;
}
template<class NumericT>
void enqueue_block(scheduler::statement & statement, atidlas_int_t M, atidlas_int_t N, atidlas_int_t K,
scheduler::lhs_rhs_element& eA, scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, scheduler::lhs_rhs_element& ebeta,
matrix_base<NumericT> const & A, matrix_base<NumericT> const & B, matrix_base<NumericT> const & C, NumericT beta,
std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix, int id)
{
if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || C.size1()==0 || C.size2()==0)
return;
viennacl::ocl::kernel& kernel = programs[id].program().get_kernel(kernel_prefix);
kernel.local_work_size(0, p_.local_size_0);
kernel.local_work_size(1, p_.local_size_1);
scheduler::statement::assign_element(eA, A);
scheduler::statement::assign_element(eB, B);
scheduler::statement::assign_element(eC, C);
scheduler::statement::assign_element(ebeta, beta);
if (id==1)
{
kernel.global_work_size(0, tools::align_to_multiple(tools::align_to_multiple((unsigned int)M,p_.mS)/p_.mS, p_.local_size_0));
kernel.global_work_size(1, tools::align_to_multiple(tools::align_to_multiple((unsigned int)N,p_.nS)/p_.nS, p_.local_size_1));
}
else
{
kernel.global_work_size(0, M/p_.mS);
kernel.global_work_size(1, N/p_.nS);
}
unsigned int current_arg = 0;
kernel.arg(current_arg++, cl_uint(M));
kernel.arg(current_arg++, cl_uint(N));
kernel.arg(current_arg++, cl_uint(K));
set_arguments(statement, kernel, current_arg);
viennacl::ocl::enqueue(kernel);
}
template<class NumericT>
matrix_slice< viennacl::matrix_base<NumericT> > create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, scheduler::lhs_rhs_element const & element,
atidlas_int_t s0_0, atidlas_int_t s0_1, atidlas_int_t s1_0, atidlas_int_t s1_1, bool swap)
{
matrix_base<NumericT> & M = *(element.*ptr);
atidlas_int_t start1 = M.start1();
atidlas_int_t start2 = M.start2();
atidlas_int_t stride1 = M.stride1();
atidlas_int_t stride2 = M.stride2();
if (swap ^ M.row_major())
{
std::swap(start1, start2);
std::swap(stride1, stride2);
}
slice s0(start1 + s0_0, stride1, s0_1 - s0_0);
slice s1(start2 + s1_0, stride2, s1_1 - s1_0);
if(swap)
std::swap(s0, s1);
return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1);
}
template<class NumericT>
void enqueue_impl(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr_matrix,
scheduler::statement & statement, scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta,
NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix)
{
using namespace device_specific::utils;
atidlas_int_t ldstrideA = call_on_matrix(A, leading_stride_fun());
atidlas_int_t ldstrideB = call_on_matrix(B, leading_stride_fun());
atidlas_int_t ldstrideC = call_on_matrix(C, leading_stride_fun());
atidlas_int_t ldstartA = call_on_matrix(A, leading_start_fun());
atidlas_int_t ldstartB = call_on_matrix(B, leading_start_fun());
bool swap_A = (A_trans_=='T');
bool swap_B = (B_trans_=='T');
atidlas_int_t M = call_on_matrix(C, size1_fun());
atidlas_int_t N = call_on_matrix(C, size2_fun());
atidlas_int_t K = call_on_matrix(A, size2_fun());
if (utils::call_on_matrix(A, row_major_fun()))
K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
else
K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 ||
(p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % p_.simd_width > 0)))
{
enqueue_block(statement, M, N, K, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A),
create_slice(ptr_matrix, B, 0, K, 0, N, swap_B),
create_slice(ptr_matrix, C, 0, M, 0, N, false), beta_value, programs, kernel_prefix, 1);
return;
}
scheduler::lhs_rhs_element Acopy = A;
scheduler::lhs_rhs_element Bcopy = B;
scheduler::lhs_rhs_element Ccopy = C;
atidlas_int_t lM = M / p_.mL * p_.mL;
atidlas_int_t lN = N / p_.nL * p_.nL;
atidlas_int_t lK = K / p_.kL * p_.kL;
enqueue_block(statement, lM, lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), beta_value, programs, kernel_prefix, 0);
enqueue_block(statement, lM, lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
enqueue_block(statement, lM, N - lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), beta_value, programs, kernel_prefix, 1);
enqueue_block(statement, lM, N - lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
enqueue_block(statement, M - lM, lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), beta_value, programs, kernel_prefix, 1);
enqueue_block(statement, M - lM, lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
enqueue_block(statement, M - lM, N - lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), beta_value, programs, kernel_prefix, 1);
enqueue_block(statement, M - lM, N - lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
}
public:
matrix_product_template(matrix_product_template::parameters_type const & parameters, char A_trans, char B_trans) : template_base_impl<matrix_product_template, matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), B_trans_(B_trans){ }
virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
{
using namespace device_specific::utils;
using namespace tree_parsing;
scheduler::statement const & st = statements.data().front();
bool A_trans, B_trans;
atidlas_int_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
scheduler::statement stcopy = st;
scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, A_leaf);
scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, B_leaf);
scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, C_leaf);
scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, beta_idx, beta_leaf);
if (C.numeric_type==scheduler::FLOAT_TYPE)
enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, A, B, C, beta, beta.host_float, programs, kernel_prefix);
else if (C.numeric_type==scheduler::DOUBLE_TYPE)
enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, A, B, C, beta, beta.host_double, programs, kernel_prefix);
else
throw generator_not_supported_exception("GEMM only supported for float/double");
}
private:
const char A_trans_;
const char B_trans_;
};
}
#endif

View File

@@ -0,0 +1,351 @@
#ifndef ATIDLAS_REDUCTION_TEMPLATE_HPP
#define ATIDLAS_REDUCTION_TEMPLATE_HPP
#include <vector>
#include "viennacl/backend/opencl.hpp"
#include "viennacl/scheduler/forwards.h"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/utils.hpp"
#include "atidlas/templates/template_base.hpp"
#include "atidlas/templates/utils.hpp"
#include "viennacl/tools/tools.hpp"
namespace atidlas
{
struct reduction_parameters : public template_base::parameters_type
{
reduction_parameters(unsigned int _simd_width,
unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _group_size, 1, 2), num_groups(_num_groups), fetching_policy(_fetching_policy){ }
unsigned int num_groups;
fetching_policy_type fetching_policy;
};
class reduction_template : public template_base_impl<reduction_template, reduction_parameters>
{
private:
unsigned int n_lmem_elements() const
{
return p_.local_size_0;
}
int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
{
if (p_.fetching_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
inline void reduce_1d_local_memory(utils::kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_reduction*> exprs,
std::string const & buf_str, std::string const & buf_value_str) const
{
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int stride = " << size/2 << "; stride >0; stride /=2)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
stream << "if (lid < stride)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (unsigned int k = 0; k < exprs.size(); k++)
if (exprs[k]->is_index_reduction())
compute_index_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]")
, exprs[k]->process(buf_value_str+"[lid]"), exprs[k]->process(buf_value_str+"[lid+stride]"),
exprs[k]->root_op());
else
compute_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]"), exprs[k]->root_op());
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
}
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
{
utils::kernel_generation_stream stream;
std::vector<mapped_scalar_reduction*> exprs;
for (std::vector<mapping_type>::const_iterator it = mappings.begin(); it != mappings.end(); ++it)
for (mapping_type::const_iterator iit = it->begin(); iit != it->end(); ++iit)
if (mapped_scalar_reduction * p = dynamic_cast<mapped_scalar_reduction*>(iit->second.get()))
exprs.push_back(p);
std::size_t N = exprs.size();
std::string arguments = generate_value_kernel_argument("unsigned int", "N");
for (unsigned int k = 0; k < N; ++k)
{
std::string numeric_type = utils::numeric_type_to_string(lhs_most(exprs[k]->statement().array(),
exprs[k]->statement().root()).lhs.numeric_type);
if (exprs[k]->is_index_reduction())
{
arguments += generate_pointer_kernel_argument("__global", "unsigned int", exprs[k]->process("#name_temp"));
arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp_value"));
}
else
arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp"));
}
/* ------------------------
* First Kernel
* -----------------------*/
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
stream << "__kernel void " << kernel_prefix << "_0" << "(" << arguments << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
stream << "unsigned int lid = get_local_id(0);" << std::endl;
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
("vector", "#pointer += #start;"), statements, mappings);
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_reduction())
{
stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
}
else
{
stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
}
}
class loop_body : public loop_body_base
{
public:
loop_body(std::vector<mapped_scalar_reduction*> const & _exprs) : exprs(_exprs){ }
void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const
{
std::string i = (simd_width==1)?"i*#stride":"i";
//Fetch vector entry
for (std::vector<mapped_scalar_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
(*it)->process_recursive(stream, PARENT_NODE_TYPE, utils::create_process_accessors("vector", utils::append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";")
("matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];")
("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];")
("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];"));
//Update accumulators
std::vector<std::string> str(simd_width);
if (simd_width==1)
str[0] = "#namereg";
else
for (unsigned int a = 0; a < simd_width; ++a)
str[a] = "#namereg.s" + tools::to_string(a);
for (unsigned int k = 0; k < exprs.size(); ++k)
{
for (unsigned int a = 0; a < simd_width; ++a)
{
std::map<std::string, std::string> accessors;
accessors["vector"] = str[a];
accessors["matrix_row"] = str[a];
accessors["matrix_column"] = str[a];
accessors["matrix_diag"] = str[a];
accessors["scalar"] = "#namereg";
std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
if (exprs[k]->is_index_reduction())
compute_index_reduction(stream, exprs[k]->process("#name_acc"), "i*"+tools::to_string(simd_width) + "+" + tools::to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
else
compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
}
}
}
private:
std::vector<mapped_scalar_reduction*> exprs;
};
element_wise_loop_1D(stream, loop_body(exprs), p_.fetching_policy, simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)");
//Fills local memory
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
}
//Reduce local memory
reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
//Write to temporary buffers
stream << "if (lid==0)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_temp_value[get_group_id(0)] = #name_buf_value[0];") << std::endl;
stream << exprs[k]->process("#name_temp[get_group_id(0)] = #name_buf[0];") << std::endl;
}
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
/* ------------------------
* Second kernel
* -----------------------*/
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
stream << "__kernel void " << kernel_prefix << "_1" << "(" << arguments << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
stream << "unsigned int lid = get_local_id(0);" << std::endl;
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_reduction())
{
stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];");
stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";");
}
else
{
stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";");
}
}
stream << "for(unsigned int i = lid; i < " << p_.num_groups << "; i += get_local_size(0))" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (unsigned int k = 0; k < N; ++k)
if (exprs[k]->is_index_reduction())
compute_index_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"),
exprs[k]->process("#name_acc_value"),exprs[k]->process("#name_temp_value[i]"),exprs[k]->root_op());
else
compute_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"), exprs[k]->root_op());
stream.dec_tab();
stream << "}" << std::endl;
for (unsigned int k = 0; k < N; ++k)
{
if (exprs[k]->is_index_reduction())
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
}
//Reduce and write final result
reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
stream << "if (lid==0)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
std::map<std::string, std::string> accessors;
accessors["scalar_reduction"] = "#name_buf[0]";
accessors["scalar"] = "*#pointer";
accessors["vector"] = "#pointer[#start]";
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
return stream.str();
}
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
{
std::vector<std::string> result;
result.push_back(generate_impl(kernel_prefix + "_strided", statements, mappings, 1));
result.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width));
return result;
}
public:
reduction_template(reduction_template::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<reduction_template, reduction_parameters>(parameters, binding_policy) { }
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
{
std::vector<scheduler::statement_node const *> reductions;
cl_uint size = 0;
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
{
std::vector<size_t> reductions_idx = tree_parsing::filter_nodes(&utils::is_reduction, *it, false);
size = static_cast<cl_uint>(vector_size(lhs_most(it->array(), reductions_idx[0]), false));
for (std::vector<size_t>::iterator itt = reductions_idx.begin(); itt != reductions_idx.end(); ++itt)
reductions.push_back(&it->array()[*itt]);
}
scheduler::statement const & statement = statements.data().front();
unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
viennacl::ocl::kernel * kernels[2];
if (has_strided_access(statements) && p_.simd_width > 1)
{
kernels[0] = &programs[0].program().get_kernel(kernel_prefix+"_strided_0");
kernels[1] = &programs[0].program().get_kernel(kernel_prefix+"_strided_1");
}
else
{
kernels[0] = &programs[1].program().get_kernel(kernel_prefix+"_0");
kernels[1] = &programs[1].program().get_kernel(kernel_prefix+"_1");
}
kernels[0]->local_work_size(0, p_.local_size_0);
kernels[0]->global_work_size(0,p_.local_size_0*p_.num_groups);
kernels[1]->local_work_size(0, p_.local_size_0);
kernels[1]->global_work_size(0,p_.local_size_0);
for (unsigned int k = 0; k < 2; k++)
{
unsigned int n_arg = 0;
kernels[k]->arg(n_arg++, size);
unsigned int i = 0;
unsigned int j = 0;
for (std::vector<scheduler::statement_node const *>::const_iterator it = reductions.begin(); it != reductions.end(); ++it)
{
if (utils::is_index_reduction((*it)->op))
{
if (tmpidx_.size() <= j)
tmpidx_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*4));
kernels[k]->arg(n_arg++, tmpidx_[j]);
j++;
}
if (tmp_.size() <= i)
tmp_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*scalartype_size));
kernels[k]->arg(n_arg++, tmp_[i]);
i++;
}
set_arguments(statements, *kernels[k], n_arg);
}
for (unsigned int k = 0; k < 2; k++)
viennacl::ocl::enqueue(*kernels[k]);
}
private:
std::vector< viennacl::ocl::handle<cl_mem> > tmp_;
std::vector< viennacl::ocl::handle<cl_mem> > tmpidx_;
};
}
#endif

View File

@@ -0,0 +1,281 @@
#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
#include <vector>
#include "viennacl/scheduler/forwards.h"
#include "atidlas/mapped_objects.hpp"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/utils.hpp"
#include "atidlas/templates/template_base.hpp"
#include "atidlas/templates/utils.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/scheduler/io.hpp"
namespace atidlas
{
struct row_wise_reduction_parameters : public template_base::parameters_type
{
row_wise_reduction_parameters(unsigned int _simd_width,
unsigned int _local_size_0, unsigned int _local_size_1,
unsigned int _num_groups_0, fetching_policy_type _fetch_policy): template_base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1),
num_groups_0(_num_groups_0), fetch_policy(_fetch_policy) { }
unsigned int num_groups_0;
fetching_policy_type fetch_policy;
};
class row_wise_reduction_template : public template_base_impl<row_wise_reduction_template, row_wise_reduction_parameters>
{
private:
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
{
if (p_.fetch_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
unsigned int n_lmem_elements() const
{
return p_.local_size_0*(p_.local_size_1+1);
}
static void parse(scheduler::statement const & statement, std::vector<size_t> & idx, bool & is_trans, scheduler::lhs_rhs_element & matrix)
{
idx = tree_parsing::filter_nodes(&utils::is_reduction, statement, false);
is_trans = is_node_trans(statement.array(), idx[0], LHS_NODE_TYPE);
matrix = lhs_most(statement.array(), idx[0]).lhs;
}
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width, bool is_trans, std::vector<mapped_row_wise_reduction*> const & exprs) const
{
using tools::to_string;
unsigned int lsize0 = p_.local_size_0;
unsigned int lsize1 = p_.local_size_1+1;
std::string lsize1str = to_string(lsize1);
utils::kernel_generation_stream stream;
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
stream << "__kernel void " << kernel_prefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
tree_parsing::process(stream, PARENT_NODE_TYPE,
utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
("matrix", "#pointer += #start1 + #start2*#ld;")
("matrix", "#ld *= #nldstride;")
("vector", "#pointer += #start;"), statements, mappings);
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
stream << (*it)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl;
stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
stream << "unsigned int upper_bound_0 = ( M +" << p_.local_size_0 - 1 << ")/" << p_.local_size_0 << "*" << p_.local_size_0 << ";" << std::endl;
stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl;
stream.inc_tab();
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
stream << (*it)->process("#scalartype #name_acc = " + neutral_element((*it)->root_op()) + ";") << std::endl;
stream << "if (r < M)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
class loop_body : public loop_body_base
{
public:
loop_body(std::vector<mapped_row_wise_reduction*> const & _exprs, bool _is_trans) : exprs(_exprs), is_trans(_is_trans){ }
void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const
{
std::string data_type = utils::append_width("#scalartype",simd_width);
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
{
std::multimap<std::string, std::string> accessors;
if (is_trans)
accessors.insert(std::make_pair("matrix_trans", data_type + " #namereg = " + vload(simd_width, "c*#stride1", "#pointer + r*#ld")+";"));
else
accessors.insert(std::make_pair("matrix","#scalartype #namereg = #pointer[r*#stride1 + c*#ld];"));
accessors.insert(std::make_pair("vector", data_type + " #namereg = " + vload(simd_width, "c*#stride", "#pointer")+";"));
(*it)->process_recursive(stream, PARENT_NODE_TYPE, accessors);
}
//Update accumulators
std::vector<std::string> str(simd_width);
if (simd_width==1)
str[0] = "#namereg";
else
for (unsigned int a = 0; a < simd_width; ++a)
str[a] = "#namereg.s" + to_string(a);
for (unsigned int k = 0; k < exprs.size(); ++k)
{
for (unsigned int a = 0; a < simd_width; ++a)
{
std::map<std::string, std::string> accessors;
if (is_trans)
accessors["matrix_trans"] = str[a];
else
accessors["matrix"] = str[a];
accessors["vector"] = str[a];
accessors["scalar"] = "#namereg";
std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
if (exprs[k]->is_index_reduction())
compute_index_reduction(stream, exprs[k]->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
else
compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
}
}
}
private:
std::vector<mapped_row_wise_reduction*> exprs;
bool is_trans;
};
element_wise_loop_1D(stream, loop_body(exprs, is_trans), p_.fetch_policy, simd_width, "c", "N", "get_local_id(1)", "get_local_size(1)");
stream.dec_tab();
stream << "}" << std::endl;
for (unsigned int k = 0; k < exprs.size(); ++k)
stream << exprs[k]->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl;
stream << "#pragma unroll" << std::endl;
stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
stream << "if (lid1 < stride)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
for (unsigned int k = 0; k < exprs.size(); k++)
if (exprs[k]->is_index_reduction())
compute_index_reduction(stream, exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]")
, exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"),
exprs[k]->root_op());
else
compute_reduction(stream,exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), exprs[k]->root_op());
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
stream << "if (lid1 == 0 && r < M)";
stream << "{" << std::endl;
stream.inc_tab();
std::map<std::string, std::string> accessors;
accessors["row_wise_reduction"] = "#name_buf[lid0*" + lsize1str + "]";
accessors["vector"] = "#pointer[r*#stride]";
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
return stream.str();
}
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
{
std::vector<mapped_row_wise_reduction*> exprs;
bool is_trans = false;
bool row_major = false;
statements_container::data_type::const_iterator sit;
std::vector<mapping_type>::const_iterator mit;
for (mit = mappings.begin(), sit = statements.data().begin(); mit != mappings.end(); ++mit, ++sit)
{
std::vector<size_t> idx;
scheduler::lhs_rhs_element A;
parse(*sit, idx, is_trans, A);
row_major = utils::call_on_matrix(A, utils::row_major_fun());
for (unsigned int j = 0; j < idx.size(); ++j)
exprs.push_back((mapped_row_wise_reduction*)(mit->at(mapping_key(idx[j], PARENT_NODE_TYPE)).get()));
}
is_trans = is_trans ^ row_major;
std::vector<std::string> res;
if (is_trans && p_.simd_width>1)
{
res.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width, is_trans, exprs));
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
}
else
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
return res;
}
public:
row_wise_reduction_template(row_wise_reduction_template::parameters_type const & parameters, char A_trans, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<row_wise_reduction_template, row_wise_reduction_parameters>(parameters, binding_policy), A_trans_(A_trans){ }
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
{
std::vector<size_t> idx;
scheduler::lhs_rhs_element A;
bool is_trans;
parse(statements.data().front(), idx, is_trans, A);
bool row_major = utils::call_on_matrix(A, utils::row_major_fun());
viennacl::ocl::kernel * kernel;
if((is_trans ^ row_major)&& p_.simd_width>1)
{
if (has_strided_access(statements))
kernel = &programs[1].program().get_kernel(kernel_prefix);
else
kernel = &programs[0].program().get_kernel(kernel_prefix);
}
else
kernel = &programs[0].program().get_kernel(kernel_prefix);
kernel->local_work_size(0,p_.local_size_0);
kernel->local_work_size(1,p_.local_size_1);
kernel->global_work_size(0,p_.local_size_0*p_.num_groups_0);
kernel->global_work_size(1,p_.local_size_1);
unsigned int current_arg = 0;
if (is_trans)
{
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
}
else
{
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
}
set_arguments(statements, *kernel, current_arg);
viennacl::ocl::enqueue(*kernel);
}
private:
const char A_trans_;
};
}
#endif

View File

@@ -0,0 +1,540 @@
#ifndef ATIDLAS_TEMPLATES_TEMPLATE_BASE_
#define ATIDLAS_TEMPLATES_TEMPLATE_BASE_
#include <list>
#include <set>
#include "viennacl/ocl/kernel.hpp"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/device_utils.hpp"
#include "viennacl/scheduler/forwards.h"
#include "viennacl/scheduler/io.hpp"
#include "atidlas/lazy_program_compiler.hpp"
#include "atidlas/mapped_objects.hpp"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/utils.hpp"
namespace atidlas
{
enum fetching_policy_type
{
FETCH_FROM_LOCAL,
FETCH_FROM_GLOBAL_STRIDED,
FETCH_FROM_GLOBAL_CONTIGUOUS
};
class template_base
{
public:
struct parameters_type
{
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels){ }
unsigned int simd_width;
unsigned int local_size_0;
unsigned int local_size_1;
unsigned int num_kernels;
};
private:
/** @brief Functor to map the statements to the types defined in mapped_objects.hpp */
class map_functor : public tree_parsing::traversal_functor
{
viennacl::scheduler::statement_node_numeric_type numeric_type(viennacl::scheduler::statement const * statement, atidlas_int_t root_idx) const
{
viennacl::scheduler::statement_node const * root_node = &statement->array()[root_idx];
while (root_node->lhs.numeric_type==viennacl::scheduler::INVALID_NUMERIC_TYPE)
root_node = &statement->array()[root_node->lhs.node_index];
return root_node->lhs.numeric_type;
}
public:
typedef tools::shared_ptr<mapped_object> result_type;
map_functor(symbolic_binder & binder, mapping_type & mapping) : binder_(binder), mapping_(mapping){ }
/** @brief Binary leaf */
template<class T>
result_type binary_leaf(viennacl::scheduler::statement const * statement, atidlas_int_t root_idx, mapping_type const * mapping) const
{
return result_type(new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
}
template<class NumericT>
result_type operator()(NumericT const & /*scalar*/) const
{
return result_type(new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
}
/** @brief Scalar mapping */
template<class NumericT>
result_type operator()(viennacl::scalar<NumericT> const & scal) const
{
return result_type(new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(scal))));
}
/** @brief Vector mapping */
template<class NumericT>
result_type operator()(viennacl::vector_base<NumericT> const & vec) const
{
return result_type(new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(vec))));
}
/** @brief Implicit vector mapping */
template<class NumericT>
result_type operator()(viennacl::implicit_vector_base<NumericT> const & /*vec*/) const
{
return result_type(new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
}
/** @brief Matrix mapping */
template<class NumericT>
result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
{
return result_type(new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(mat)),
viennacl::traits::row_major(mat)));
}
/** @brief Implicit matrix mapping */
template<class NumericT>
result_type operator()(viennacl::implicit_matrix_base<NumericT> const & /*mat*/) const
{
return result_type(new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
}
/** @brief Traversal functor */
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const {
mapping_type::key_type key(root_idx, leaf_t);
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *this)));
else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.rhs, *this)));
else if ( leaf_t== PARENT_NODE_TYPE)
{
if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
else if (is_scalar_reduction(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
else if (is_vector_reduction(root_node))
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
else if (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
else if (root_node.op.type == viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
}
}
private:
symbolic_binder & binder_;
mapping_type & mapping_;
};
/** @brief functor for setting the arguments of a kernel */
class set_arguments_functor : public tree_parsing::traversal_functor
{
public:
typedef void result_type;
set_arguments_functor(symbolic_binder & binder, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ }
template<class NumericT>
result_type operator()(NumericT const & scal) const
{
typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
kernel_.arg(current_arg_++, cl_scalartype(scal));
}
/** @brief Scalar mapping */
template<class NumericT>
result_type operator()(viennacl::scalar<NumericT> const & scal) const
{
if (binder_.bind(&viennacl::traits::handle(scal)))
kernel_.arg(current_arg_++, scal.handle().opencl_handle());
}
/** @brief Vector mapping */
template<class NumericT>
result_type operator()(viennacl::vector_base<NumericT> const & vec) const
{
if (binder_.bind(&viennacl::traits::handle(vec)))
{
kernel_.arg(current_arg_++, vec.handle().opencl_handle());
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
}
}
/** @brief Implicit vector mapping */
template<class NumericT>
result_type operator()(viennacl::implicit_vector_base<NumericT> const & vec) const
{
typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
if (vec.has_index())
kernel_.arg(current_arg_++, cl_uint(vec.index()));
}
/** @brief Matrix mapping */
template<class NumericT>
result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
{
if (binder_.bind(&viennacl::traits::handle(mat)))
{
kernel_.arg(current_arg_++, mat.handle().opencl_handle());
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat)));
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
}
}
/** @brief Implicit matrix mapping */
template<class NumericT>
result_type operator()(viennacl::implicit_matrix_base<NumericT> const & mat) const
{
kernel_.arg(current_arg_++, typename viennacl::result_of::cl_type<NumericT>::type(mat.value()));
}
/** @brief Traversal functor: */
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const
{
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
utils::call_on_element(root_node.lhs, *this);
else if (leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
utils::call_on_element(root_node.rhs, *this);
}
private:
symbolic_binder & binder_;
unsigned int & current_arg_;
viennacl::ocl::kernel & kernel_;
};
protected:
static std::string generate_arguments(std::string const & data_type, std::vector<mapping_type> const & mappings, statements_container const & statements)
{
utils::kernel_generation_stream stream;
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "__global #scalartype* #pointer,")
("host_scalar", "#scalartype #name,")
("matrix", "__global " + data_type + "* #pointer, uint #ld, uint #start1, uint #start2, uint #stride1, uint #stride2,")
("vector", "__global " + data_type + "* #pointer, uint #start, uint #stride,")
("implicit_vector", "#scalartype #name,")
("implicit_matrix", "#scalartype #name,")
,statements, mappings);
std::string res = stream.str();
res.erase(res.rfind(','));
return res;
}
void set_arguments(statements_container const & statements, viennacl::ocl::kernel & kernel, unsigned int & current_arg)
{
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
for (statements_container::data_type::const_iterator itt = statements.data().begin(); itt != statements.data().end(); ++itt)
tree_parsing::traverse(*itt, itt->root(), set_arguments_functor(*binder,current_arg,kernel), true);
}
class invalid_template_exception : public std::exception
{
public:
invalid_template_exception() : message_() {}
invalid_template_exception(std::string message) :
message_("ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message + "\n"
"If you are using a builtin template, please report on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible\n"
"If you are using your own template, please try using other parameters") {}
virtual const char* what() const throw() { return message_.c_str(); }
virtual ~invalid_template_exception() throw() {}
private:
std::string message_;
};
static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, utils::kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size)
{
if (policy==FETCH_FROM_GLOBAL_STRIDED)
{
init = domain_id;
upper_bound = bound;
inc = domain_size;
}
else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
{
std::string chunk_size = "chunk_size";
std::string chunk_start = "chunk_start";
std::string chunk_end = "chunk_end";
stream << "unsigned int " << chunk_size << " = (" << bound << "+" << domain_size << "-1)/" << domain_size << ";" << std::endl;
stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << chunk_size << ";" << std::endl;
stream << "unsigned int " << chunk_end << " = min(" << chunk_start << "+" << chunk_size << ", " << bound << ");" << std::endl;
init = chunk_start;
upper_bound = chunk_end;
inc = "1";
}
}
static bool is_node_trans(viennacl::scheduler::statement::container_type const & array, size_t root_idx, leaf_t leaf_type)
{
bool res = false;
viennacl::scheduler::lhs_rhs_element viennacl::scheduler::statement_node::*ptr;
if (leaf_type==LHS_NODE_TYPE)
ptr = &viennacl::scheduler::statement_node::lhs;
else
ptr = &viennacl::scheduler::statement_node::rhs;
viennacl::scheduler::statement_node const * node = &array[root_idx];
while ((node->*ptr).type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
{
if (array[(node->*ptr).node_index].op.type==viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
res = !res;
node = &array[(node->*ptr).node_index];
}
return res;
}
protected:
static bool is_offset_modifier(viennacl::scheduler::statement_node const & node)
{
return node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE;
}
static bool has_strided_access(statements_container const & statements)
{
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
{
//checks for vectors
std::vector<viennacl::scheduler::lhs_rhs_element> vectors = tree_parsing::filter_elements(viennacl::scheduler::DENSE_VECTOR_TYPE, *it);
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
if (utils::call_on_vector(*itt, utils::stride_fun())>1)
return true;
//checks for matrix
std::vector<viennacl::scheduler::lhs_rhs_element> matrices = tree_parsing::filter_elements(viennacl::scheduler::DENSE_MATRIX_TYPE, *it);
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>2)
return true;
if(tree_parsing::filter_nodes(&is_offset_modifier, *it, true).empty()==false)
return true;
}
return false;
}
static atidlas_int_t vector_size(viennacl::scheduler::statement_node const & node, bool up_to_internal_size)
{
using namespace viennacl::scheduler;
using namespace utils;
if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE)
{
atidlas_int_t size1 = up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
atidlas_int_t size2 = up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
return std::min<atidlas_int_t>(size1, size2);
}
else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE)
return up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE)
return up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
else
return up_to_internal_size?call_on_vector(node.lhs, internal_size_fun()):call_on_vector(node.lhs, size_fun());
}
//NB : templates are not used here because declaring a functor out of the generate() functions would be harder to read
struct loop_body_base
{
virtual void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const = 0;
};
static void element_wise_loop_1D(utils::kernel_generation_stream & stream, loop_body_base const & loop_body,
fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size)
{
std::string strwidth = tools::to_string(simd_width);
std::string boundround = bound + "/" + strwidth;
std::string init, upper_bound, inc;
fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size);
stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
loop_body(stream, simd_width);
stream.dec_tab();
stream << "}" << std::endl;
if (simd_width>1)
{
stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
loop_body(stream, 1);
stream.dec_tab();
stream << "}" << std::endl;
}
}
static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr)
{
if (simd_width==1)
return "(" + ptr + ")[" + offset + "] = " + value;
else
return utils::append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
}
static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr)
{
if (simd_width==1)
return "(" + ptr + ")[" + offset + "]";
else
return utils::append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")";
}
private:
/** @brief Generates the body of the associated kernel function */
virtual std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mapping) const = 0;
public:
template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
virtual ~template_base(){ }
std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
{
statements_container::data_type::const_iterator sit;
std::vector<mapping_type>::iterator mit;
if(int err = check_invalid(statements, device))
throw generator_not_supported_exception("The supplied parameters for this template are invalid : err " + tools::to_string(err));
//Create mapping
std::vector<mapping_type> mappings(statements.data().size());
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), true);
return generate_impl(kernel_prefix, statements, mappings);
}
/** @brief returns whether or not the profile has undefined behavior on particular device */
virtual int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const = 0;
virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) = 0;
virtual tools::shared_ptr<template_base> clone() const = 0;
private:
binding_policy_t binding_policy_;
};
template<class TemplateType, class ParametersType>
class template_base_impl : public template_base
{
private:
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const { return TEMPLATE_VALID; }
virtual unsigned int n_lmem_elements() const { return 0; }
protected:
bool has_misaligned_offset(statements_container const & statements)
{
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
{
//checks for vectors
std::vector<viennacl::scheduler::lhs_rhs_element> vectors = tree_parsing::filter_elements(viennacl::scheduler::DENSE_VECTOR_TYPE, *it);
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
if (utils::call_on_vector(*itt, utils::stride_fun())>1)
return true;
//checks for matrix
std::vector<viennacl::scheduler::lhs_rhs_element> matrices = tree_parsing::filter_elements(viennacl::scheduler::DENSE_MATRIX_TYPE, *it);
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>2)
return true;
}
return false;
}
public:
typedef ParametersType parameters_type;
/** @brief The constructor */
template_base_impl(parameters_type const & parameters, binding_policy_t binding_policy) : template_base(binding_policy), p_(parameters){ }
parameters_type const & parameters() const
{
return p_;
}
tools::shared_ptr<template_base> clone() const
{
return tools::shared_ptr<template_base>(new TemplateType(*dynamic_cast<TemplateType const *>(this)));
}
/** @brief returns whether or not the profile has undefined behavior on particular device */
int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const
{
using namespace viennacl::tools;
viennacl::scheduler::statement const & statement = statements.data().front();
unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
//Query device informations
size_t lmem_available = static_cast<size_t>(device.local_mem_size());
size_t lmem_usage = scalartype_size*n_lmem_elements();
if (lmem_usage>lmem_available)
return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
//Invalid work group size
size_t max_workgroup_size = device.max_work_group_size();
std::vector<size_t> max_work_item_sizes = device.max_work_item_sizes();
if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
if (p_.local_size_0 > max_work_item_sizes[0])
return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
if (p_.local_size_1 > max_work_item_sizes[1])
return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
//Advice from the Intel guide
unsigned int warp_size = 8;
if (device.type()==CL_DEVICE_TYPE_GPU)
{
//Advice from the nvidia guide
warp_size = 32;
//Advice from the AMD guide
if (device.vendor_id()==4098)
warp_size = 64;
}
if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
//Invalid SIMD Width
if (p_.simd_width!=1 && p_.simd_width!=2 &&
p_.simd_width!=4 && p_.simd_width!=8 &&
p_.simd_width!=16)
return TEMPLATE_INVALID_SIMD_WIDTH;
return check_invalid_impl(device);
}
protected:
parameters_type p_;
binding_policy_t binding_policy_;
};
}
#endif

View File

@@ -0,0 +1,79 @@
#ifndef ATIDLAS_TEMPLATES_REDUCTION_UTILS_HPP
#define ATIDLAS_TEMPLATES_REDUCTION_UTILS_HPP
#include <vector>
#include "viennacl/scheduler/forwards.h"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/utils.hpp"
namespace atidlas
{
inline void compute_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, viennacl::scheduler::op_element const & op)
{
if (utils::elementwise_function(op))
os << acc << "=" << tree_parsing::evaluate(op.type) << "(" << acc << "," << cur << ");" << std::endl;
else
os << acc << "= (" << acc << ")" << tree_parsing::evaluate(op.type) << "(" << cur << ");" << std::endl;
}
inline void compute_index_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, viennacl::scheduler::op_element const & op)
{
// os << acc << " = " << cur_value << ">" << acc_value << "?" << cur << ":" << acc << ";" << std::endl;
os << acc << "= select(" << acc << "," << cur << "," << cur_value << ">" << acc_value << ");" << std::endl;
os << acc_value << "=";
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE) os << "fmax";
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE) os << "max";
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE) os << "fmin";
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE) os << "min";
os << "(" << acc_value << "," << cur_value << ");"<< std::endl;
}
inline void process_all(std::string const & type_key, std::string const & str,
utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings)
{
for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
for (mapping_type::const_iterator mmit = mit->begin(); mmit != mit->end(); ++mmit)
if (mmit->second->type_key()==type_key)
stream << mmit->second->process(str) << std::endl;
}
inline void process_all_at(std::string const & type_key, std::string const & str,
utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings,
size_t root_idx, leaf_t leaf)
{
for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
{
mapped_object * obj = mit->at(mapping_key(root_idx, leaf)).get();
if (obj->type_key()==type_key)
stream << obj->process(str) << std::endl;
}
}
inline std::string neutral_element(viennacl::scheduler::op_element const & op)
{
switch (op.type)
{
case viennacl::scheduler::OPERATION_BINARY_ADD_TYPE : return "0";
case viennacl::scheduler::OPERATION_BINARY_MULT_TYPE : return "1";
case viennacl::scheduler::OPERATION_BINARY_DIV_TYPE : return "1";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "-INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "-INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_MAX_TYPE : return "-INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "-INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_MIN_TYPE : return "INFINITY";
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "INFINITY";
default: throw generator_not_supported_exception("Unsupported reduction operator : no neutral element known");
}
}
}
#endif

View File

@@ -0,0 +1,130 @@
#ifndef ATIDLAS_TEMPLATES_VECTOR_AXPY_HPP
#define ATIDLAS_TEMPLATES_VECTOR_AXPY_HPP
#include <vector>
#include <cmath>
#include "viennacl/scheduler/forwards.h"
#include "atidlas/mapped_objects.hpp"
#include "atidlas/tree_parsing.hpp"
#include "atidlas/forwards.h"
#include "atidlas/utils.hpp"
#include "atidlas/templates/template_base.hpp"
#include "atidlas/templates/utils.hpp"
#include "viennacl/tools/tools.hpp"
namespace atidlas
{
class vector_axpy_parameters : public template_base::parameters_type
{
public:
vector_axpy_parameters(unsigned int _simd_width,
unsigned int _group_size, unsigned int _num_groups,
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _group_size, 1, 1), num_groups(_num_groups), fetching_policy(_fetching_policy){ }
unsigned int num_groups;
fetching_policy_type fetching_policy;
};
class vector_axpy_template : public template_base_impl<vector_axpy_template, vector_axpy_parameters>
{
private:
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
{
if (p_.fetching_policy==FETCH_FROM_LOCAL)
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
return TEMPLATE_VALID;
}
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
{
std::vector<std::string> result;
for (unsigned int i = 0; i < 2; ++i)
{
utils::kernel_generation_stream stream;
unsigned int simd_width = (i==0)?1:p_.simd_width;
std::string str_simd_width = tools::to_string(simd_width);
std::string data_type = utils::append_width("#scalartype",simd_width);
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
stream << "__kernel void " << kernel_prefix << i << "(unsigned int N," << generate_arguments(data_type, mappings, statements) << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
tree_parsing::process(stream, PARENT_NODE_TYPE,
utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
("matrix", "#pointer += #start1 + #start2*#ld;")
("vector", "#pointer += #start;")
("vector", "#start/=" + str_simd_width + ";"), statements, mappings);
std::string init, upper_bound, inc;
fetching_loop_info(p_.fetching_policy, "N/"+str_simd_width, stream, init, upper_bound, inc, "get_global_id(0)", "get_global_size(0)");
stream << "for(unsigned int i = " << init << "; i < " << upper_bound << "; i += " << inc << ")" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
tree_parsing::process(stream, PARENT_NODE_TYPE,
utils::create_process_accessors("vector", data_type + " #namereg = #pointer[i*#stride];")
("matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];")
("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];")
("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];")
, statements, mappings);
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, utils::create_evaluate_accessors("vector", "#namereg")
("matrix_row", "#namereg")
("matrix_column", "#namereg")
("matrix_diag", "#namereg")
("scalar", "#namereg"), statements, mappings);
tree_parsing::process(stream, LHS_NODE_TYPE, utils::create_process_accessors("vector", "#pointer[i*#stride] = #namereg;")
("matrix_row", "#pointer[$OFFSET{#row, i}] = #namereg;")
("matrix_column", "#pointer[$OFFSET{i, #column}] = #namereg;")
("matrix_diag", "#pointer[#diag_offset<0?$OFFSET{i - #diag_offset, i}:$OFFSET{i, i + #diag_offset}] = #namereg;")
,statements, mappings);
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;
result.push_back(stream.str());
}
return result;
}
public:
vector_axpy_template(vector_axpy_template::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<vector_axpy_template, vector_axpy_parameters>(parameters, binding_policy), up_to_internal_size_(false){ }
void up_to_internal_size(bool v) { up_to_internal_size_ = v; }
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
{
viennacl::scheduler::statement const & statement = statements.data().front();
atidlas_int_t size = vector_size(lhs_most(statement.array(), statement.root()), up_to_internal_size_);
viennacl::ocl::kernel * kernel;
if(p_.simd_width > 1 && (has_strided_access(statements) || (size%p_.simd_width>0) || has_misaligned_offset(statements)))
kernel = &programs[0].program().get_kernel(kernel_prefix+"0");
else
kernel = &programs[1].program().get_kernel(kernel_prefix+"1");
kernel->local_work_size(0, p_.local_size_0);
kernel->global_work_size(0, p_.local_size_0*p_.num_groups);
unsigned int current_arg = 0;
kernel->arg(current_arg++, static_cast<cl_uint>(size));
set_arguments(statements, *kernel, current_arg);
viennacl::ocl::enqueue(*kernel);
}
private:
bool up_to_internal_size_;
};
}
#endif

View File

@@ -0,0 +1,27 @@
#ifndef ATIDLAS_TOOLS_FIND_AND_REPLACE_HPP
#define ATIDLAS_TOOLS_FIND_AND_REPLACE_HPP
#include <string>
namespace atidlas
{
namespace tools
{
int inline find_and_replace(std::string & source, std::string const & find, std::string const & replace)
{
int num=0;
size_t fLen = find.size();
size_t rLen = replace.size();
for (size_t pos=0; (pos=source.find(find, pos))!=std::string::npos; pos+=rLen)
{
num++;
source.replace(pos, fLen, replace);
}
return num;
}
}
}
#endif

View File

@@ -0,0 +1,178 @@
#ifndef ATIDLAS_TOOLS_SHARED_PTR_HPP
#define ATIDLAS_TOOLS_SHARED_PTR_HPP
/* =========================================================================
Copyright (c) 2010-2012, Institute for Microelectronics,
Institute for Analysis and Scientific Computing,
TU Wien.
Portions of this software are copyright by UChicago Argonne, LLC.
-----------------
ViennaCL - The Vienna Computing Library
-----------------
Project Head: Karl Rupp rupp@iue.tuwien.ac.at
(A list of authors and contributors can be found in the PDF manual)
License: MIT (X11), see file LICENSE in the base directory
============================================================================= */
/** @file tools/shared_ptr.hpp
@brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
Contributed by Philippe Tillet.
*/
#include <cstdlib>
#include <algorithm>
namespace atidlas
{
namespace tools
{
namespace detail
{
/** @brief Reference counting class for the shared_ptr implementation */
class count
{
public:
count(unsigned int val) : val_(val){ }
void dec(){ --val_; }
void inc(){ ++val_; }
bool is_null(){ return val_ == 0; }
unsigned int val(){ return val_; }
private:
unsigned int val_;
};
/** @brief Interface for the reference counter inside the shared_ptr */
struct aux
{
detail::count count;
aux() :count(1) {}
virtual void destroy()=0;
virtual ~aux() {}
};
/** @brief Implementation helper for the reference counting mechanism inside shared_ptr. */
template<class U, class Deleter>
struct auximpl: public detail::aux
{
U* p;
Deleter d;
auximpl(U* pu, Deleter x) :p(pu), d(x) {}
virtual void destroy() { d(p); }
};
/** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
template<class U>
struct default_deleter
{
void operator()(U* p) const { delete p; }
};
}
/** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available. */
template<class T>
class shared_ptr
{
template<class U>
friend class shared_ptr;
detail::aux* pa;
T* pt;
public:
shared_ptr() :pa(NULL), pt(NULL) {}
template<class U, class Deleter>
shared_ptr(U* pu, Deleter d) : pa(new detail::auximpl<U, Deleter>(pu, d)), pt(pu) {}
template<class U>
explicit shared_ptr(U* pu) : pa(new detail::auximpl<U, detail::default_deleter<U> >(pu, detail::default_deleter<U>())), pt(pu) {}
T* get() const { return pt; }
T* operator->() const { return pt; }
T& operator*() const { return *pt; }
shared_ptr(const shared_ptr& s) :pa(s.pa), pt(s.pt)
{
inc();
}
template<class U>
shared_ptr(const shared_ptr<U>& s) :pa(s.pa), pt(s.pt)
{
inc();
}
~shared_ptr()
{
dec();
}
void reset()
{
shared_ptr<T>().swap(*this);
}
void reset(T * ptr)
{
shared_ptr<T>(ptr).swap(*this);
}
void swap(shared_ptr<T> & other)
{
std::swap(pt,other.pt);
std::swap(pa, other.pa);
}
shared_ptr& operator=(const shared_ptr& s)
{
if (this!=&s)
{
dec();
pa = s.pa;
pt = s.pt;
inc();
}
return *this;
}
void inc()
{
if (pa) pa->count.inc();
}
void dec()
{
if (pa)
{
pa->count.dec();
if (pa->count.is_null())
{
pa->destroy();
delete pa;
pa = NULL;
}
}
}
};
}
}
#endif

View File

@@ -0,0 +1,22 @@
#ifndef ATIDLAS_TOOLS_TO_STRING_HPP
#define ATIDLAS_TOOLS_TO_STRING_HPP
#include <string>
namespace atidlas
{
namespace tools
{
template<class T>
inline std::string to_string ( T const t )
{
std::stringstream ss;
ss << t;
return ss.str();
}
}
}
#endif

500
atidlas/tree_parsing.hpp Normal file
View File

@@ -0,0 +1,500 @@
#ifndef ATIDLAS_TREE_PARSING_HPP
#define ATIDLAS_TREE_PARSING_HPP
#include <set>
#include "CL/cl.h"
#include "viennacl/forwards.h"
#include "viennacl/scheduler/forwards.h"
#include "atidlas/utils.hpp"
#include "atidlas/forwards.h"
namespace atidlas
{
namespace tree_parsing
{
/** @brief base functor class for traversing a statement */
class traversal_functor
{
public:
void call_before_expansion(viennacl::scheduler::statement const &, atidlas_int_t) const { }
void call_after_expansion(viennacl::scheduler::statement const &, atidlas_int_t) const { }
};
/** @brief Recursively execute a functor on a statement */
template<class Fun>
inline void traverse(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, Fun const & fun, bool inspect)
{
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
bool recurse = utils::node_leaf(root_node.op)?inspect:true;
fun.call_before_expansion(statement, root_idx);
//Lhs:
if (recurse)
{
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
traverse(statement, root_node.lhs.node_index, fun, inspect);
if (root_node.lhs.type_family != viennacl::scheduler::INVALID_TYPE_FAMILY)
fun(statement, root_idx, LHS_NODE_TYPE);
}
//Self:
fun(statement, root_idx, PARENT_NODE_TYPE);
//Rhs:
if (recurse && root_node.rhs.type_family!=viennacl::scheduler::INVALID_TYPE_FAMILY)
{
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
traverse(statement, root_node.rhs.node_index, fun, inspect);
if (root_node.rhs.type_family != viennacl::scheduler::INVALID_TYPE_FAMILY)
fun(statement, root_idx, RHS_NODE_TYPE);
}
fun.call_after_expansion(statement, root_idx);
}
class filter_fun : public traversal_functor
{
public:
typedef bool (*pred_t)(viennacl::scheduler::statement_node const & node);
filter_fun(pred_t pred, std::vector<size_t> & out) : pred_(pred), out_(out){ }
void operator()(viennacl::scheduler::statement const & statement, size_t root_idx, leaf_t) const
{
viennacl::scheduler::statement_node const * root_node = &statement.array()[root_idx];
if (pred_(*root_node))
out_.push_back(root_idx);
}
private:
pred_t pred_;
std::vector<size_t> & out_;
};
std::vector<size_t> filter_nodes(bool (*pred)(viennacl::scheduler::statement_node const & node), viennacl::scheduler::statement const & statement, bool inspect)
{
std::vector<size_t> res;
tree_parsing::traverse(statement, statement.root(), filter_fun(pred, res), inspect);
return res;
}
class filter_elements_fun : public traversal_functor
{
public:
filter_elements_fun(viennacl::scheduler::statement_node_subtype subtype, std::vector<viennacl::scheduler::lhs_rhs_element> & out) : subtype_(subtype), out_(out) { }
void operator()(viennacl::scheduler::statement const & statement, size_t root_idx, leaf_t) const
{
viennacl::scheduler::statement_node const * root_node = &statement.array()[root_idx];
if (root_node->lhs.subtype==subtype_)
out_.push_back(root_node->lhs);
if (root_node->rhs.subtype==subtype_)
out_.push_back(root_node->rhs);
}
private:
viennacl::scheduler::statement_node_subtype subtype_;
std::vector<viennacl::scheduler::lhs_rhs_element> & out_;
};
std::vector<viennacl::scheduler::lhs_rhs_element> filter_elements(viennacl::scheduler::statement_node_subtype subtype, viennacl::scheduler::statement const & statement)
{
std::vector<viennacl::scheduler::lhs_rhs_element> res;
tree_parsing::traverse(statement, statement.root(), filter_elements_fun(subtype, res), true);
return res;
}
/** @brief generate a string from an operation_node_type */
inline const char * evaluate(viennacl::scheduler::operation_node_type type)
{
using namespace viennacl::scheduler;
// unary expression
switch (type)
{
//Function
case OPERATION_UNARY_ABS_TYPE : return "abs";
case OPERATION_UNARY_ACOS_TYPE : return "acos";
case OPERATION_UNARY_ASIN_TYPE : return "asin";
case OPERATION_UNARY_ATAN_TYPE : return "atan";
case OPERATION_UNARY_CEIL_TYPE : return "ceil";
case OPERATION_UNARY_COS_TYPE : return "cos";
case OPERATION_UNARY_COSH_TYPE : return "cosh";
case OPERATION_UNARY_EXP_TYPE : return "exp";
case OPERATION_UNARY_FABS_TYPE : return "fabs";
case OPERATION_UNARY_FLOOR_TYPE : return "floor";
case OPERATION_UNARY_LOG_TYPE : return "log";
case OPERATION_UNARY_LOG10_TYPE : return "log10";
case OPERATION_UNARY_SIN_TYPE : return "sin";
case OPERATION_UNARY_SINH_TYPE : return "sinh";
case OPERATION_UNARY_SQRT_TYPE : return "sqrt";
case OPERATION_UNARY_TAN_TYPE : return "tan";
case OPERATION_UNARY_TANH_TYPE : return "tanh";
case OPERATION_UNARY_CAST_CHAR_TYPE : return "(char)";
case OPERATION_UNARY_CAST_UCHAR_TYPE : return "(uchar)";
case OPERATION_UNARY_CAST_SHORT_TYPE : return "(short)";
case OPERATION_UNARY_CAST_USHORT_TYPE : return "(ushort)";
case OPERATION_UNARY_CAST_INT_TYPE : return "(int)";
case OPERATION_UNARY_CAST_UINT_TYPE : return "(uint)";
case OPERATION_UNARY_CAST_LONG_TYPE : return "(long)";
case OPERATION_UNARY_CAST_ULONG_TYPE : return "(ulong)";
case OPERATION_UNARY_CAST_HALF_TYPE : return "(half)";
case OPERATION_UNARY_CAST_FLOAT_TYPE : return "(float)";
case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "(double)";
case OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "argfmax";
case OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "argmax";
case OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "argfmin";
case OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "argmin";
case OPERATION_BINARY_ELEMENT_POW_TYPE : return "pow";
//Arithmetic
case OPERATION_UNARY_MINUS_TYPE : return "-";
case OPERATION_BINARY_ASSIGN_TYPE : return "=";
case OPERATION_BINARY_INPLACE_ADD_TYPE : return "+=";
case OPERATION_BINARY_INPLACE_SUB_TYPE : return "-=";
case OPERATION_BINARY_ADD_TYPE : return "+";
case OPERATION_BINARY_SUB_TYPE : return "-";
case OPERATION_BINARY_MULT_TYPE : return "*";
case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "*";
case OPERATION_BINARY_DIV_TYPE : return "/";
case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "/";
case OPERATION_BINARY_ACCESS_TYPE : return "[]";
//Relational
case OPERATION_BINARY_ELEMENT_EQ_TYPE : return "isequal";
case OPERATION_BINARY_ELEMENT_NEQ_TYPE : return "isnotequal";
case OPERATION_BINARY_ELEMENT_GREATER_TYPE : return "isgreater";
case OPERATION_BINARY_ELEMENT_GEQ_TYPE : return "isgreaterequal";
case OPERATION_BINARY_ELEMENT_LESS_TYPE : return "isless";
case OPERATION_BINARY_ELEMENT_LEQ_TYPE : return "islessequal";
case OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "fmax";
case OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "fmin";
case OPERATION_BINARY_ELEMENT_MAX_TYPE : return "max";
case OPERATION_BINARY_ELEMENT_MIN_TYPE : return "min";
//Unary
case OPERATION_UNARY_TRANS_TYPE : return "trans";
//Binary
case OPERATION_BINARY_INNER_PROD_TYPE : return "iprod";
case OPERATION_BINARY_MAT_MAT_PROD_TYPE : return "mmprod";
case OPERATION_BINARY_MAT_VEC_PROD_TYPE : return "mvprod";
case OPERATION_BINARY_VECTOR_DIAG_TYPE : return "vdiag";
case OPERATION_BINARY_MATRIX_DIAG_TYPE : return "mdiag";
case OPERATION_BINARY_MATRIX_ROW_TYPE : return "row";
case OPERATION_BINARY_MATRIX_COLUMN_TYPE : return "col";
default : throw generator_not_supported_exception("Unsupported operator");
}
}
inline const char * operator_string(viennacl::scheduler::operation_node_type type)
{
using namespace viennacl::scheduler;
switch (type)
{
case OPERATION_UNARY_CAST_CHAR_TYPE : return "char";
case OPERATION_UNARY_CAST_UCHAR_TYPE : return "uchar";
case OPERATION_UNARY_CAST_SHORT_TYPE : return "short";
case OPERATION_UNARY_CAST_USHORT_TYPE : return "ushort";
case OPERATION_UNARY_CAST_INT_TYPE : return "int";
case OPERATION_UNARY_CAST_UINT_TYPE : return "uint";
case OPERATION_UNARY_CAST_LONG_TYPE : return "long";
case OPERATION_UNARY_CAST_ULONG_TYPE : return "ulong";
case OPERATION_UNARY_CAST_HALF_TYPE : return "half";
case OPERATION_UNARY_CAST_FLOAT_TYPE : return "float";
case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "double";
case OPERATION_UNARY_MINUS_TYPE : return "umin";
case OPERATION_BINARY_ASSIGN_TYPE : return "assign";
case OPERATION_BINARY_INPLACE_ADD_TYPE : return "ip_add";
case OPERATION_BINARY_INPLACE_SUB_TYPE : return "ip_sub";
case OPERATION_BINARY_ADD_TYPE : return "add";
case OPERATION_BINARY_SUB_TYPE : return "sub";
case OPERATION_BINARY_MULT_TYPE : return "mult";
case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "eprod";
case OPERATION_BINARY_DIV_TYPE : return "div";
case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "ediv";
case OPERATION_BINARY_ACCESS_TYPE : return "acc";
default : return evaluate(type);
}
}
/** @brief functor for generating the expression string from a statement */
class evaluate_expression_traversal: public tree_parsing::traversal_functor
{
private:
std::map<std::string, std::string> const & accessors_;
std::string & str_;
mapping_type const & mapping_;
public:
evaluate_expression_traversal(std::map<std::string, std::string> const & accessors, std::string & str, mapping_type const & mapping) : accessors_(accessors), str_(str), mapping_(mapping){ }
void call_before_expansion(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx) const
{
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if ((root_node.op.type_family==viennacl::scheduler::OPERATION_UNARY_TYPE_FAMILY || utils::elementwise_function(root_node.op))
&& !utils::node_leaf(root_node.op))
str_+=tree_parsing::evaluate(root_node.op.type);
str_+="(";
}
void call_after_expansion(viennacl::scheduler::statement const & /*statement*/, atidlas_int_t /*root_idx*/) const
{
str_+=")";
}
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf) const
{
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
mapping_type::key_type key = std::make_pair(root_idx, leaf);
if (leaf==PARENT_NODE_TYPE)
{
if (utils::node_leaf(root_node.op))
str_ += mapping_.at(key)->evaluate(accessors_);
else if (utils::elementwise_operator(root_node.op))
str_ += tree_parsing::evaluate(root_node.op.type);
else if (root_node.op.type_family!=viennacl::scheduler::OPERATION_UNARY_TYPE_FAMILY && utils::elementwise_function(root_node.op))
str_ += ",";
}
else
{
if (leaf==LHS_NODE_TYPE)
{
if (root_node.lhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
str_ += mapping_.at(key)->evaluate(accessors_);
}
if (leaf==RHS_NODE_TYPE)
{
if (root_node.rhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
str_ += mapping_.at(key)->evaluate(accessors_);
}
}
}
};
inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors,
viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, mapping_type const & mapping)
{
std::string res;
evaluate_expression_traversal traversal_functor(accessors, res, mapping);
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if (leaf==RHS_NODE_TYPE)
{
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, false);
else
traversal_functor(statement, root_idx, leaf);
}
else if (leaf==LHS_NODE_TYPE)
{
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, false);
else
traversal_functor(statement, root_idx, leaf);
}
else
tree_parsing::traverse(statement, root_idx, traversal_functor, false);
return res;
}
inline void evaluate(utils::kernel_generation_stream & stream, leaf_t leaf, std::map<std::string, std::string> const & accessors,
statements_container const & statements, std::vector<mapping_type> const & mappings)
{
statements_container::data_type::const_iterator sit;
std::vector<mapping_type>::const_iterator mit;
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
stream << evaluate(leaf, accessors, *sit, sit->root(), *mit) << ";" << std::endl;
}
/** @brief functor for fetching or writing-back the elements in a statement */
class process_traversal : public tree_parsing::traversal_functor
{
public:
process_traversal(std::multimap<std::string, std::string> const & accessors, utils::kernel_generation_stream & stream,
mapping_type const & mapping, std::set<std::string> & already_processed) : accessors_(accessors), stream_(stream), mapping_(mapping), already_processed_(already_processed){ }
void operator()(viennacl::scheduler::statement const & /*statement*/, atidlas_int_t root_idx, leaf_t leaf) const
{
mapping_type::const_iterator it = mapping_.find(std::make_pair(root_idx, leaf));
if (it!=mapping_.end())
{
mapped_object * obj = it->second.get();
std::string key = obj->type_key();
if(already_processed_.insert(obj->process("#name")).second && accessors_.find(obj->type_key())!=accessors_.end())
for(std::multimap<std::string, std::string>::const_iterator it = accessors_.lower_bound(key) ; it != accessors_.upper_bound(key) ; ++it)
stream_ << obj->process(it->second) << std::endl;
}
}
private:
std::multimap<std::string, std::string> accessors_;
utils::kernel_generation_stream & stream_;
mapping_type const & mapping_;
std::set<std::string> & already_processed_;
};
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
viennacl::scheduler::statement const & statement, size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed)
{
process_traversal traversal_functor(accessors, stream, mapping, already_processed);
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if (leaf==RHS_NODE_TYPE)
{
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, true);
else
traversal_functor(statement, root_idx, leaf);
}
else if (leaf==LHS_NODE_TYPE)
{
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, true);
else
traversal_functor(statement, root_idx, leaf);
}
else
{
tree_parsing::traverse(statement, root_idx, traversal_functor, true);
}
}
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
statements_container const & statements, std::vector<mapping_type> const & mappings)
{
statements_container::data_type::const_iterator sit;
std::vector<mapping_type>::const_iterator mit;
std::set<std::string> already_processed;
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
process(stream, leaf, accessors, *sit, sit->root(), *mit, already_processed);
}
class statement_representation_functor : public traversal_functor{
private:
static void append_id(char * & ptr, unsigned int val)
{
if (val==0)
*ptr++='0';
else
while (val>0)
{
*ptr++= (char)('0' + (val % 10));
val /= 10;
}
}
public:
typedef void result_type;
statement_representation_functor(symbolic_binder & binder, char *& ptr) : binder_(binder), ptr_(ptr){ }
template<class NumericT>
inline result_type operator()(NumericT const & /*scal*/) const
{
*ptr_++='h'; //host
*ptr_++='s'; //scalar
*ptr_++=utils::first_letter_of_type<NumericT>::value();
}
/** @brief Scalar mapping */
template<class NumericT>
inline result_type operator()(viennacl::scalar<NumericT> const & scal) const
{
*ptr_++='s'; //scalar
*ptr_++=utils::first_letter_of_type<NumericT>::value();
append_id(ptr_, binder_.get(&viennacl::traits::handle(scal)));
}
/** @brief Vector mapping */
template<class NumericT>
inline result_type operator()(viennacl::vector_base<NumericT> const & vec) const
{
*ptr_++='v'; //vector
*ptr_++=utils::first_letter_of_type<NumericT>::value();
append_id(ptr_, binder_.get(&viennacl::traits::handle(vec)));
}
/** @brief Implicit vector mapping */
template<class NumericT>
inline result_type operator()(viennacl::implicit_vector_base<NumericT> const & /*vec*/) const
{
*ptr_++='i'; //implicit
*ptr_++='v'; //vector
*ptr_++=utils::first_letter_of_type<NumericT>::value();
}
/** @brief Matrix mapping */
template<class NumericT>
inline result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
{
*ptr_++='m'; //Matrix
*ptr_++=mat.row_major()?'r':'c';
*ptr_++=utils::first_letter_of_type<NumericT>::value();
append_id(ptr_, binder_.get(&viennacl::traits::handle(mat)));
}
/** @brief Implicit matrix mapping */
template<class NumericT>
inline result_type operator()(viennacl::implicit_matrix_base<NumericT> const & /*mat*/) const
{
*ptr_++='i'; //implicit
*ptr_++='m'; //matrix
*ptr_++=utils::first_letter_of_type<NumericT>::value();
}
static inline void append(char*& p, const char * str)
{
std::size_t n = std::strlen(str);
std::memcpy(p, str, n);
p+=n;
}
inline void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const
{
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
utils::call_on_element(root_node.lhs, *this);
else if (root_node.op.type_family==viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY && leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
utils::call_on_element(root_node.rhs, *this);
else if (leaf_t==PARENT_NODE_TYPE)
append_id(ptr_,root_node.op.type);
}
private:
symbolic_binder & binder_;
char *& ptr_;
};
inline std::string statements_representation(statements_container const & statements, binding_policy_t binding_policy)
{
std::vector<char> program_name_vector(256);
char* program_name = program_name_vector.data();
if (statements.order()==statements_container::INDEPENDENT)
*program_name++='i';
else
*program_name++='s';
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy);
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
tree_parsing::traverse(*it, it->root(), tree_parsing::statement_representation_functor(*binder, program_name),true);
*program_name='\0';
return std::string(program_name_vector.data());
}
}
}
#endif

572
atidlas/utils.hpp Normal file
View File

@@ -0,0 +1,572 @@
#ifndef ATIDLAS_UTILS_HPP
#define ATIDLAS_UTILS_HPP
#include <sstream>
#include "viennacl/matrix_def.hpp"
#include "viennacl/vector_def.hpp"
#include "viennacl/ocl/forwards.h"
#include "viennacl/scheduler/forwards.h"
#include "viennacl/traits/size.hpp"
#include "viennacl/traits/handle.hpp"
#include "viennacl/traits/row_major.hpp"
#include "atidlas/tools/to_string.hpp"
#include "atidlas/forwards.h"
namespace atidlas
{
namespace utils
{
//CUDA Conversion
inline std::string opencl_source_to_cuda_source(std::string const & opencl_src)
{
std::string res = opencl_src;
viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
//Pointer
viennacl::tools::find_and_replace(res, "__global float*", "float*");
viennacl::tools::find_and_replace(res, "__local float*", "float*");
viennacl::tools::find_and_replace(res, "__global double*", "double*");
viennacl::tools::find_and_replace(res, "__local double*", "double*");
//Qualifiers
viennacl::tools::find_and_replace(res,"__global","__device__");
viennacl::tools::find_and_replace(res,"__kernel","__global__");
viennacl::tools::find_and_replace(res,"__constant","__constant__");
viennacl::tools::find_and_replace(res,"__local","__shared__");
//Indexing
viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
//Synchronization
viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
return res;
}
static std::string numeric_type_to_string(viennacl::scheduler::statement_node_numeric_type const & type){
switch (type)
{
//case viennacl::scheduler::CHAR_TYPE: return "char";
//case viennacl::scheduler::UCHAR_TYPE: return "unsigned char";
//case viennacl::scheduler::SHORT_TYPE: return "short";
//case viennacl::scheduler::USHORT_TYPE: return "unsigned short";
case viennacl::scheduler::INT_TYPE: return "int";
case viennacl::scheduler::UINT_TYPE: return "unsigned int";
case viennacl::scheduler::LONG_TYPE: return "long";
case viennacl::scheduler::ULONG_TYPE: return "unsigned long";
case viennacl::scheduler::FLOAT_TYPE : return "float";
case viennacl::scheduler::DOUBLE_TYPE : return "double";
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_host_scalar(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.type_family == viennacl::scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a host scalar"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(element.host_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(element.host_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(element.host_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(element.host_ushort);
case viennacl::scheduler::INT_TYPE: return fun(element.host_int);
case viennacl::scheduler::UINT_TYPE: return fun(element.host_uint);
case viennacl::scheduler::LONG_TYPE: return fun(element.host_long);
case viennacl::scheduler::ULONG_TYPE: return fun(element.host_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(element.host_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(element.host_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_scalar(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.type_family == viennacl::scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a scalar"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.scalar_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.scalar_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.scalar_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.scalar_ushort);
case viennacl::scheduler::INT_TYPE: return fun(*element.scalar_int);
case viennacl::scheduler::UINT_TYPE: return fun(*element.scalar_uint);
case viennacl::scheduler::LONG_TYPE: return fun(*element.scalar_long);
case viennacl::scheduler::ULONG_TYPE: return fun(*element.scalar_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.scalar_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.scalar_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_vector(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.type_family == viennacl::scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a vector"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.vector_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.vector_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.vector_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.vector_ushort);
case viennacl::scheduler::INT_TYPE: return fun(*element.vector_int);
case viennacl::scheduler::UINT_TYPE: return fun(*element.vector_uint);
case viennacl::scheduler::LONG_TYPE: return fun(*element.vector_long);
case viennacl::scheduler::ULONG_TYPE: return fun(*element.vector_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.vector_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.vector_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_implicit_vector(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.type_family == viennacl::scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a implicit_vector"));
assert(element.subtype == viennacl::scheduler::IMPLICIT_VECTOR_TYPE && bool("Must be called on a implicit_vector"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.implicit_vector_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.implicit_vector_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.implicit_vector_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.implicit_vector_ushort);
case viennacl::scheduler::INT_TYPE: return fun(*element.implicit_vector_int);
case viennacl::scheduler::UINT_TYPE: return fun(*element.implicit_vector_uint);
case viennacl::scheduler::LONG_TYPE: return fun(*element.implicit_vector_long);
case viennacl::scheduler::ULONG_TYPE: return fun(*element.implicit_vector_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.implicit_vector_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.implicit_vector_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_matrix(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.type_family == viennacl::scheduler::MATRIX_TYPE_FAMILY && bool("Must be called on a matrix"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.matrix_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.matrix_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.matrix_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.matrix_ushort);
case viennacl::scheduler::INT_TYPE: return fun(*element.matrix_int);
case viennacl::scheduler::UINT_TYPE: return fun(*element.matrix_uint);
case viennacl::scheduler::LONG_TYPE: return fun(*element.matrix_long);
case viennacl::scheduler::ULONG_TYPE: return fun(*element.matrix_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.matrix_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.matrix_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_implicit_matrix(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
assert(element.subtype == viennacl::scheduler::IMPLICIT_MATRIX_TYPE && bool("Must be called on a implicit matrix"));
switch (element.numeric_type)
{
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.implicit_matrix_char);
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.implicit_matrix_uchar);
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.implicit_matrix_short);
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.implicit_matrix_ushort);
case viennacl::scheduler::INT_TYPE: return fun(*element.implicit_matrix_int);
case viennacl::scheduler::UINT_TYPE: return fun(*element.implicit_matrix_uint);
case viennacl::scheduler::LONG_TYPE: return fun(*element.implicit_matrix_long);
case viennacl::scheduler::ULONG_TYPE: return fun(*element.implicit_matrix_ulong);
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.implicit_matrix_float);
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.implicit_matrix_double);
default : throw generator_not_supported_exception("Unsupported Scalartype");
}
}
template<class Fun>
static typename Fun::result_type call_on_element(viennacl::scheduler::lhs_rhs_element const & element, Fun const & fun){
switch (element.type_family)
{
case viennacl::scheduler::SCALAR_TYPE_FAMILY:
if (element.subtype == viennacl::scheduler::HOST_SCALAR_TYPE)
return call_on_host_scalar(element, fun);
else
return call_on_scalar(element, fun);
case viennacl::scheduler::VECTOR_TYPE_FAMILY :
if (element.subtype == viennacl::scheduler::IMPLICIT_VECTOR_TYPE)
return call_on_implicit_vector(element, fun);
else
return call_on_vector(element, fun);
case viennacl::scheduler::MATRIX_TYPE_FAMILY:
if (element.subtype == viennacl::scheduler::IMPLICIT_MATRIX_TYPE)
return call_on_implicit_matrix(element, fun);
else
return call_on_matrix(element,fun);
default:
throw generator_not_supported_exception("Unsupported datastructure type : Not among {Scalar, Vector, Matrix}");
}
}
struct scalartype_size_fun
{
typedef atidlas_int_t result_type;
result_type operator()(float const &) const { return sizeof(float); }
result_type operator()(double const &) const { return sizeof(double); }
template<class T> result_type operator()(T const &) const { return sizeof(typename viennacl::result_of::cpu_value_type<T>::type); }
};
struct internal_size_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::internal_size(t); }
};
struct size_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::size(t); }
};
struct start_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start(t); }
};
struct stride_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride(t); }
};
struct start1_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start1(t); }
};
struct start2_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start2(t); }
};
struct leading_stride_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::stride2(t):viennacl::traits::stride1(t); }
};
struct leading_start_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::start2(t):viennacl::traits::start1(t); }
};
struct stride1_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride1(t); }
};
struct stride2_fun
{
typedef atidlas_int_t result_type;
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride2(t); }
};
struct handle_fun
{
typedef cl_mem result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::opencl_handle(t); }
};
struct internal_size1_fun
{
typedef atidlas_int_t result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::internal_size1(t); }
};
struct row_major_fun
{
typedef bool result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::row_major(t); }
};
struct internal_size2_fun
{
typedef atidlas_int_t result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::internal_size2(t); }
};
struct size1_fun
{
typedef atidlas_int_t result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::size1(t); }
};
struct size2_fun
{
typedef atidlas_int_t result_type;
template<class T>
result_type operator()(T const &t) const { return viennacl::traits::size2(t); }
};
template<class T, class U>
struct is_same_type { enum { value = 0 }; };
template<class T>
struct is_same_type<T,T> { enum { value = 1 }; };
inline bool is_reduction(viennacl::scheduler::statement_node const & node)
{
return node.op.type_family==viennacl::scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
|| node.op.type_family==viennacl::scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY
|| node.op.type_family==viennacl::scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE;
}
inline bool is_index_reduction(viennacl::scheduler::op_element const & op)
{
return op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE;
}
template<class T>
struct type_to_string;
template<> struct type_to_string<unsigned char> { static const char * value() { return "unsigned char"; } };
template<> struct type_to_string<char> { static const char * value() { return "char"; } };
template<> struct type_to_string<unsigned short> { static const char * value() { return "unsigned short"; } };
template<> struct type_to_string<short> { static const char * value() { return "short"; } };
template<> struct type_to_string<unsigned int> { static const char * value() { return "unsigned int"; } };
template<> struct type_to_string<int> { static const char * value() { return "int"; } };
template<> struct type_to_string<unsigned long> { static const char * value() { return "unsigned long"; } };
template<> struct type_to_string<long> { static const char * value() { return "long"; } };
template<> struct type_to_string<float> { static const char * value() { return "float"; } };
template<> struct type_to_string<double> { static const char * value() { return "double"; } };
template<class T>
struct first_letter_of_type;
template<> struct first_letter_of_type<char> { static char value() { return 'c'; } };
template<> struct first_letter_of_type<unsigned char> { static char value() { return 'd'; } };
template<> struct first_letter_of_type<short> { static char value() { return 's'; } };
template<> struct first_letter_of_type<unsigned short> { static char value() { return 't'; } };
template<> struct first_letter_of_type<int> { static char value() { return 'i'; } };
template<> struct first_letter_of_type<unsigned int> { static char value() { return 'j'; } };
template<> struct first_letter_of_type<long> { static char value() { return 'l'; } };
template<> struct first_letter_of_type<unsigned long> { static char value() { return 'm'; } };
template<> struct first_letter_of_type<float> { static char value() { return 'f'; } };
template<> struct first_letter_of_type<double> { static char value() { return 'd'; } };
class kernel_generation_stream : public std::ostream
{
class kgenstream : public std::stringbuf
{
public:
kgenstream(std::ostringstream& oss,unsigned int const & tab_count) : oss_(oss), tab_count_(tab_count){ }
int sync() {
for (unsigned int i=0; i<tab_count_;++i)
oss_ << " ";
oss_ << str();
str("");
return !oss_;
}
~kgenstream() { pubsync(); }
private:
std::ostream& oss_;
unsigned int const & tab_count_;
};
public:
kernel_generation_stream() : std::ostream(new kgenstream(oss,tab_count_)), tab_count_(0){ }
~kernel_generation_stream(){ delete rdbuf(); }
std::string str(){ return oss.str(); }
void inc_tab(){ ++tab_count_; }
void dec_tab(){ --tab_count_; }
private:
unsigned int tab_count_;
std::ostringstream oss;
};
inline bool node_leaf(viennacl::scheduler::op_element const & op)
{
using namespace viennacl::scheduler;
return op.type==OPERATION_UNARY_NORM_1_TYPE
|| op.type==OPERATION_UNARY_NORM_2_TYPE
|| op.type==OPERATION_UNARY_NORM_INF_TYPE
|| op.type==OPERATION_UNARY_TRANS_TYPE
|| op.type==OPERATION_BINARY_MAT_VEC_PROD_TYPE
|| op.type==OPERATION_BINARY_MAT_MAT_PROD_TYPE
|| op.type==OPERATION_BINARY_INNER_PROD_TYPE
|| op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE
|| op.type==OPERATION_BINARY_VECTOR_DIAG_TYPE
|| op.type==OPERATION_BINARY_MATRIX_ROW_TYPE
|| op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE
|| op.type_family==OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
|| op.type_family==OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|| op.type_family==OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
}
inline bool elementwise_operator(viennacl::scheduler::op_element const & op)
{
using namespace viennacl::scheduler;
return op.type== OPERATION_BINARY_ASSIGN_TYPE
|| op.type== OPERATION_BINARY_INPLACE_ADD_TYPE
|| op.type== OPERATION_BINARY_INPLACE_SUB_TYPE
|| op.type== OPERATION_BINARY_ADD_TYPE
|| op.type== OPERATION_BINARY_SUB_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_PROD_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_DIV_TYPE
|| op.type== OPERATION_BINARY_MULT_TYPE
|| op.type== OPERATION_BINARY_DIV_TYPE;
}
inline bool elementwise_function(viennacl::scheduler::op_element const & op)
{
using namespace viennacl::scheduler;
return
op.type == OPERATION_UNARY_CAST_CHAR_TYPE
|| op.type == OPERATION_UNARY_CAST_UCHAR_TYPE
|| op.type == OPERATION_UNARY_CAST_SHORT_TYPE
|| op.type == OPERATION_UNARY_CAST_USHORT_TYPE
|| op.type == OPERATION_UNARY_CAST_INT_TYPE
|| op.type == OPERATION_UNARY_CAST_UINT_TYPE
|| op.type == OPERATION_UNARY_CAST_LONG_TYPE
|| op.type == OPERATION_UNARY_CAST_ULONG_TYPE
|| op.type == OPERATION_UNARY_CAST_HALF_TYPE
|| op.type == OPERATION_UNARY_CAST_FLOAT_TYPE
|| op.type == OPERATION_UNARY_CAST_DOUBLE_TYPE
|| op.type== OPERATION_UNARY_ABS_TYPE
|| op.type== OPERATION_UNARY_ACOS_TYPE
|| op.type== OPERATION_UNARY_ASIN_TYPE
|| op.type== OPERATION_UNARY_ATAN_TYPE
|| op.type== OPERATION_UNARY_CEIL_TYPE
|| op.type== OPERATION_UNARY_COS_TYPE
|| op.type== OPERATION_UNARY_COSH_TYPE
|| op.type== OPERATION_UNARY_EXP_TYPE
|| op.type== OPERATION_UNARY_FABS_TYPE
|| op.type== OPERATION_UNARY_FLOOR_TYPE
|| op.type== OPERATION_UNARY_LOG_TYPE
|| op.type== OPERATION_UNARY_LOG10_TYPE
|| op.type== OPERATION_UNARY_SIN_TYPE
|| op.type== OPERATION_UNARY_SINH_TYPE
|| op.type== OPERATION_UNARY_SQRT_TYPE
|| op.type== OPERATION_UNARY_TAN_TYPE
|| op.type== OPERATION_UNARY_TANH_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_POW_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_EQ_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_NEQ_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_GREATER_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_LESS_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_GEQ_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_LEQ_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_FMAX_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_FMIN_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_MAX_TYPE
|| op.type== OPERATION_BINARY_ELEMENT_MIN_TYPE;
}
inline viennacl::scheduler::lhs_rhs_element & lhs_rhs_element(viennacl::scheduler::statement const & st, atidlas_int_t idx, leaf_t leaf)
{
using namespace tree_parsing;
assert(leaf==LHS_NODE_TYPE || leaf==RHS_NODE_TYPE);
if (leaf==LHS_NODE_TYPE)
return const_cast<viennacl::scheduler::lhs_rhs_element &>(st.array()[idx].lhs);
return const_cast<viennacl::scheduler::lhs_rhs_element &>(st.array()[idx].rhs);
}
inline unsigned int size_of(viennacl::scheduler::statement_node_numeric_type type)
{
using namespace viennacl::scheduler;
switch (type)
{
case UCHAR_TYPE:
case CHAR_TYPE: return 1;
case USHORT_TYPE:
case SHORT_TYPE:
case HALF_TYPE: return 2;
case UINT_TYPE:
case INT_TYPE:
case FLOAT_TYPE: return 4;
case ULONG_TYPE:
case LONG_TYPE:
case DOUBLE_TYPE: return 8;
default: throw generator_not_supported_exception("Unsupported scalartype");
}
}
inline std::string append_width(std::string const & str, unsigned int width)
{
if (width==1)
return str;
return str + tools::to_string(width);
}
template<typename MapT>
class create_map
{
typedef typename MapT::key_type T;
typedef typename MapT::mapped_type U;
public:
create_map(const T& key, const U& val)
{
map_.insert(std::make_pair(key,val));
}
create_map<MapT>& operator()(const T& key, const U& val)
{
map_.insert(std::make_pair(key,val));
return *this;
}
operator MapT()
{
return map_;
}
private:
MapT map_;
};
typedef create_map<std::multimap<std::string, std::string> > create_process_accessors;
typedef create_map<std::map<std::string, std::string> > create_evaluate_accessors;
}
}
#endif

View File

@@ -0,0 +1,88 @@
# - Find the OpenCL headers and library
#
# Defines the following if found:
# OPENCL_FOUND : TRUE if found, FALSE otherwise
# OPENCL_INCLUDE_DIRS : Include directories for OpenCL
# OPENCL_LIBRARIES : The libraries to link against
#
# The user can set the OPENCLROOT environment variable to help finding OpenCL
# if it is installed in a non-standard place.
set(ENV_ATISTREAMSDKROOT $ENV{ATISTREAMSDKROOT})
if(ENV_ATISTREAMSDKROOT)
set(ENV_OPENCLROOT $ENV{ATISTREAMSDKROOT})
endif(ENV_ATISTREAMSDKROOT)
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
if(ENV_AMDAPPSDKROOT)
set(ENV_OPENCLROOT $ENV{AMDAPPSDKROOT})
endif(ENV_AMDAPPSDKROOT)
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
if(ENV_INTELOCLSDKROOT)
set(ENV_OPENCLROOT $ENV{INTELOCLSDKROOT})
endif(ENV_INTELOCLSDKROOT)
set(ENV_OPENCLROOT2 $ENV{OPENCLROOT})
if(ENV_OPENCLROOT2)
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
endif(ENV_OPENCLROOT2)
if(ENV_OPENCLROOT)
find_path(
OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h
PATHS ${ENV_OPENCLROOT}/include
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
)
if (("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH
${OPENCL_LIB_SEARCH_PATH}
${ENV_OPENCLROOT}/lib/x86)
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
set(OPENCL_LIB_SEARCH_PATH
${OPENCL_LIB_SEARCH_PATH}
${ENV_OPENCLROOT}/lib/x86_64)
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
endif(("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
find_library(
OPENCL_LIBRARY
NAMES OpenCL
PATHS ${OPENCL_LIB_SEARCH_PATH}
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
)
else(ENV_OPENCLROOT)
find_path(
OPENCL_INCLUDE_DIR
NAMES CL/cl.h OpenCL/cl.h
PATHS ${PROJECT_SOURCE_DIR} #use the CL/ include folder provided with ViennaCL
)
find_library(
OPENCL_LIBRARY
NAMES OpenCL
)
endif(ENV_OPENCLROOT)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(
OPENCL
DEFAULT_MSG
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
)
if(OPENCL_FOUND)
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
else(OPENCL_FOUND)
set(OPENCL_INCLUDE_DIRS)
set(OPENCL_LIBRARIES)
endif(OPENCL_FOUND)
mark_as_advanced(
OPENCL_INCLUDE_DIR
OPENCL_LIBRARY
)

View File

@@ -0,0 +1,7 @@
find_path(
VIENNACL_INCLUDE_DIR
NAMES viennacl/vector.hpp
)
set(VIENNACL_INCLUDE_DIRS ${VIENNACL_INCLUDE_DIR})
mark_as_advanced(VIENNACL_INCLUDE_DIRS)

6
tests/CMakeLists.txt Normal file
View File

@@ -0,0 +1,6 @@
foreach(PROG blas1 blas3)
add_executable(${PROG}-test ${PROG}.cpp)
add_test(${PROG} ${PROG}-test)
target_link_libraries(${PROG}-test ${OPENCL_LIBRARIES})
set_target_properties(${PROG}-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
endforeach(PROG)

147
tests/blas1.cpp Normal file
View File

@@ -0,0 +1,147 @@
#include "common.hpp"
#include "viennacl/vector.hpp"
#include "atidlas/templates/vector_axpy_template.hpp"
#include "atidlas/execute.hpp"
template<typename NumericT, class XType, class YType, class ZType>
int test_vectors(NumericT epsilon, atidlas::vector_axpy_parameters const & vector_axpy_parameters,
XType & cx, YType & cy, ZType & cz)
{
int failure_count = 0;
ZType buffer = cz;
NumericT a = 3.12, b = 3.5;
viennacl::scalar<NumericT> da(a), db(b);
viennacl::vector<NumericT> xtmp(cx.internal_size());
viennacl::vector<NumericT> ytmp(cy.internal_size());
viennacl::vector<NumericT> ztmp(cz.internal_size());
typename vector_maker<XType>::result_type x = vector_maker<XType>::make(xtmp, cx);
typename vector_maker<YType>::result_type y = vector_maker<YType>::make(ytmp, cy);
typename vector_maker<ZType>::result_type z = vector_maker<ZType>::make(ztmp, cz);
#define RUN_TEST_VECTOR_AXPY(NAME, CPU_LOOP, GPU_STATEMENT) \
std::cout << NAME "..." << std::flush;\
for(int_t i = 0 ; i < cz.size() ; ++i)\
CPU_LOOP;\
atidlas::execute(atidlas::vector_axpy_template(vector_axpy_parameters),\
GPU_STATEMENT,\
viennacl::ocl::current_context(), true);\
viennacl::copy(z, buffer);\
if(failure_vector(cz, buffer, epsilon))\
{\
failure_count++;\
std::cout << " [Failure!]" << std::endl;\
}\
else\
std::cout << std::endl;
RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x))
RUN_TEST_VECTOR_AXPY("z = x + y", cz[i] = cx[i] + cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x + y))
RUN_TEST_VECTOR_AXPY("z = x - y", cz[i] = cx[i] - cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x - y))
RUN_TEST_VECTOR_AXPY("z = x + y + z", cz[i] = cx[i] + cy[i] + cz[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x + y + z))
RUN_TEST_VECTOR_AXPY("z = a*x", cz[i] = a*cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x))
RUN_TEST_VECTOR_AXPY("z = da*x", cz[i] = a*cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x))
RUN_TEST_VECTOR_AXPY("z = a*x + b*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x + b*y))
RUN_TEST_VECTOR_AXPY("z = da*x + b*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x + b*y))
RUN_TEST_VECTOR_AXPY("z = a*x + db*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x + db*y))
RUN_TEST_VECTOR_AXPY("z = da*x + db*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x + db*y))
#undef RUN_TEST_VECTOR_AXPY
return failure_count;
}
template<typename NumericT>
int test_impl(NumericT epsilon)
{
int_t N = 24378;
int x_start = 4, y_start = 7, z_start = 15;
int x_stride = 5, y_stride = 8, z_stride = 12;
viennacl::range xr(x_start, N + x_start), yr(y_start, N + y_start), zr(z_start, N + z_start);
viennacl::slice xs(x_start, x_stride, N), ys(y_start, y_stride, N), zs(z_start, z_stride, N);
simple_vector<NumericT> x_vector(N), y_vector(N), z_vector(N);
init_rand(x_vector);
init_rand(y_vector);
init_rand(z_vector);
simple_vector<NumericT> x_range_holder(N + x_start);
simple_vector<NumericT> x_slice_holder(x_start + N*x_stride);
init_rand(x_range_holder);
init_rand(x_slice_holder);
simple_vector_range< simple_vector<NumericT> > x_range(x_range_holder, xr);
simple_vector_slice< simple_vector<NumericT> > x_slice(x_slice_holder, xs);
simple_vector<NumericT> y_range_holder(N + y_start);
simple_vector<NumericT> y_slice_holder(y_start + N*y_stride);
init_rand(y_range_holder);
init_rand(y_slice_holder);
simple_vector_range< simple_vector<NumericT> > y_range(y_range_holder, yr);
simple_vector_slice< simple_vector<NumericT> > y_slice(y_slice_holder, ys);
simple_vector<NumericT> z_range_holder(N + z_start);
simple_vector<NumericT> z_slice_holder(z_start + N*z_stride);
init_rand(z_range_holder);
init_rand(z_slice_holder);
simple_vector_range< simple_vector<NumericT> > z_range(z_range_holder, zr);
simple_vector_slice< simple_vector<NumericT> > z_slice(z_slice_holder, zs);
int_t failure_count = 0;
atidlas::vector_axpy_parameters vector_axpy_parameters(4, 32, 128, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS);
#define TEST_OPERATIONS(XTYPE, YTYPE, ZTYPE)\
std::cout << "> x : " #XTYPE " | y : " #YTYPE " | z : " #ZTYPE << std::endl;\
failure_count += test_vectors(epsilon, vector_axpy_parameters, x_ ## XTYPE, y_ ## YTYPE, z_ ## ZTYPE);\
TEST_OPERATIONS(vector, vector, vector)
TEST_OPERATIONS(vector, vector, range)
TEST_OPERATIONS(vector, vector, slice)
TEST_OPERATIONS(vector, range, vector)
TEST_OPERATIONS(vector, range, range)
TEST_OPERATIONS(vector, range, slice)
TEST_OPERATIONS(vector, slice, vector)
TEST_OPERATIONS(vector, slice, range)
TEST_OPERATIONS(vector, slice, slice)
TEST_OPERATIONS(range, vector, vector)
TEST_OPERATIONS(range, vector, range)
TEST_OPERATIONS(range, vector, slice)
TEST_OPERATIONS(range, range, vector)
TEST_OPERATIONS(range, range, range)
TEST_OPERATIONS(range, range, slice)
TEST_OPERATIONS(range, slice, vector)
TEST_OPERATIONS(range, slice, range)
TEST_OPERATIONS(range, slice, slice)
TEST_OPERATIONS(slice, vector, vector)
TEST_OPERATIONS(slice, vector, range)
TEST_OPERATIONS(slice, vector, slice)
TEST_OPERATIONS(slice, range, vector)
TEST_OPERATIONS(slice, range, range)
TEST_OPERATIONS(slice, range, slice)
TEST_OPERATIONS(slice, slice, vector)
TEST_OPERATIONS(slice, slice, range)
TEST_OPERATIONS(slice, slice, slice)
return failure_count;
}
int main()
{
int n_failures = 0;
std::cout << ">> float" << std::endl;
n_failures += test_impl<float>(1e-5);
std::cout << ">> double" << std::endl;
n_failures += test_impl<double>(1e-9);
if(n_failures>0)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

223
tests/blas2.cpp Normal file
View File

@@ -0,0 +1,223 @@
#include <cstddef>
#include "common.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/matrix_proxy.hpp"
#include "viennacl/linalg/prod.hpp"
template<typename T, typename CType, typename AType, typename BType>
int test_layout(CType & C, AType const & A, AType const & AT, BType const & B, BType const & BT,
simple_matrix<T> const & ground, T epsilon)
{
using viennacl::linalg::prod;
using viennacl::trans;
int failures_count = 0;
simple_matrix<T> tmp(C.size1(), C.size2());
#define TEST_OP(NAME, OPERATION)\
std::cout << NAME " ..." << std::flush;\
OPERATION;\
viennacl::copy(C, tmp);\
if (failure(ground, tmp, epsilon))\
{\
std::cout << " [Failure!]" << std::endl;\
failures_count++;\
}\
else\
std::cout << std::endl;
TEST_OP("C = A.B", C = prod(A, B));
TEST_OP("C = A'.B", C = prod(trans(AT), B));
TEST_OP("C = A.B'", C = prod(A, trans(BT)));
TEST_OP("C = A'.B'", C = prod(trans(AT), trans(BT)));
return failures_count;
}
template<typename T, typename RefAType, typename RefBType, typename RefCType>
int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & cA, RefAType & cAT, int BK, int BN, RefBType & cB, RefBType & cBT, T epsilon)
{
viennacl::matrix<T, viennacl::row_major> ArowTmp(AM, AK);
viennacl::matrix<T, viennacl::row_major> ATrowTmp(AK, AM);
viennacl::matrix<T, viennacl::row_major> BrowTmp(BK, BN);
viennacl::matrix<T, viennacl::row_major> BTrowTmp(BN, BK);
viennacl::matrix<T, viennacl::row_major> CrowTmp(CM, CN);
viennacl::matrix<T, viennacl::column_major> AcolTmp(AM, AK);
viennacl::matrix<T, viennacl::column_major> ATcolTmp(AK, AM);
viennacl::matrix<T, viennacl::column_major> BcolTmp(BK, BN);
viennacl::matrix<T, viennacl::column_major> BTcolTmp(BN, BK);
viennacl::matrix<T, viennacl::column_major> CcolTmp(CM, CN);
typename matrix_maker<RefCType, viennacl::row_major>::result_type Crow = matrix_maker<RefCType, viennacl::row_major>::make(CrowTmp, cC);
typename matrix_maker<RefAType, viennacl::row_major>::result_type Arow = matrix_maker<RefAType, viennacl::row_major>::make(ArowTmp, cA);
typename matrix_maker<RefAType, viennacl::row_major>::result_type ATrow = matrix_maker<RefAType, viennacl::row_major>::make(ATrowTmp, cAT);
typename matrix_maker<RefBType, viennacl::row_major>::result_type Brow = matrix_maker<RefBType, viennacl::row_major>::make(BrowTmp, cB);
typename matrix_maker<RefBType, viennacl::row_major>::result_type BTrow = matrix_maker<RefBType, viennacl::row_major>::make(BTrowTmp, cBT);
typename matrix_maker<RefCType, viennacl::column_major>::result_type Ccol = matrix_maker<RefCType, viennacl::column_major>::make(CcolTmp, cC);
typename matrix_maker<RefAType, viennacl::column_major>::result_type Acol = matrix_maker<RefAType, viennacl::column_major>::make(AcolTmp, cA);
typename matrix_maker<RefAType, viennacl::column_major>::result_type ATcol = matrix_maker<RefAType, viennacl::column_major>::make(ATcolTmp, cAT);
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
simple_matrix<T> ground = simple_prod<T>(cA, cB);
int failures_count = 0;
#define TEST_LAYOUT(Clayout, Alayout, Blayout) \
std::cout << "> " #Clayout " = " #Alayout "." #Blayout << std::endl; \
failures_count += test_layout(C ## Clayout, A ## Alayout, AT ## Alayout, B ## Blayout, BT ## Blayout, ground, epsilon);
TEST_LAYOUT(row, row, row);
TEST_LAYOUT(row, row, col);
TEST_LAYOUT(row, col, row);
TEST_LAYOUT(row, col, col);
TEST_LAYOUT(col, row, row);
TEST_LAYOUT(col, row, col);
TEST_LAYOUT(col, col, row);
TEST_LAYOUT(col, col, col);
#undef TEST_LAYOUT
return failures_count;
}
template<class MatrixType>
void init_rand(MatrixType & A)
{
typedef typename MatrixType::value_type T;
for (unsigned int i = 0; i < A.size1(); ++i)
for (unsigned int j = 0; j < A.size2(); ++j)
A(i, j) = static_cast<T>(0.1) * rand()/RAND_MAX;
}
template<typename T>
int run_test(T epsilon)
{
typedef viennacl::range range_type;
typedef viennacl::slice slice_type;
typedef simple_matrix<T> matrix_type;
typedef simple_matrix_range<matrix_type> matrix_range_type;
typedef simple_matrix_slice<matrix_type> matrix_slice_type;
int matrix_holder_M = 143;
int matrix_holder_N = 124;
int matrix_holder_K = 184;
int start_M = 14;
int start_N = 20;
int start_K = 73;
int range_holder_M = start_M + matrix_holder_M;
int range_holder_N = start_N + matrix_holder_N;
int range_holder_K = start_K + matrix_holder_K;
range_type range_M(start_M, range_holder_M);
range_type range_N(start_N, range_holder_N);
range_type range_K(start_K, range_holder_K);
int stride_M = 9;
int stride_N = 13;
int stride_K = 4;
int slice_holder_M = start_M + stride_M*matrix_holder_M;
int slice_holder_N = start_N + stride_N*matrix_holder_N;
int slice_holder_K = start_K + stride_K*matrix_holder_K;
slice_type slice_M(start_M, stride_M, matrix_holder_M);
slice_type slice_N(start_N, stride_N, matrix_holder_N);
slice_type slice_K(start_K, stride_K, matrix_holder_K);
int failures_count = 0;
#define DECLARE(NAME, size1, size2) \
matrix_type NAME ## _matrix(matrix_holder_ ## size1, matrix_holder_ ## size2);\
init_rand(NAME ## _matrix);\
matrix_type NAME ## T_matrix = simple_trans(NAME ## _matrix);\
\
matrix_type NAME ## _range_holder(range_holder_ ## size1, range_holder_ ## size2);\
init_rand(NAME ## _range_holder);\
matrix_range_type NAME ## _range(NAME ## _range_holder, range_ ## size1, range_ ## size2);\
matrix_type NAME ## T_range_holder = simple_trans(NAME ## _range_holder);\
matrix_range_type NAME ## T_range(NAME ## T_range_holder, range_ ## size2, range_ ## size1);\
\
matrix_type NAME ## _slice_holder(slice_holder_ ## size1, slice_holder_ ## size2);\
init_rand(NAME ## _slice_holder);\
matrix_slice_type NAME ## _slice(NAME ## _slice_holder, slice_ ## size1, slice_ ## size2);\
matrix_type NAME ## T_slice_holder = simple_trans(NAME ## _slice_holder);\
matrix_slice_type NAME ## T_slice(NAME ## T_slice_holder, slice_ ## size2, slice_ ## size1);\
DECLARE(A, M, K);
DECLARE(B, K, N);
DECLARE(C, M, N);
#undef DECLARE
#define TEST_ALL_LAYOUTS(C_TYPE, A_TYPE, B_TYPE)\
std::cout << ">> " #C_TYPE " = " #A_TYPE "." #B_TYPE << std::endl;\
failures_count += test_all_layouts<T>(C_TYPE ## _holder_M, C_TYPE ## _holder_N, C_ ## C_TYPE,\
A_TYPE ## _holder_M, A_TYPE ## _holder_K, A_ ## A_TYPE, AT_ ## A_TYPE,\
B_TYPE ## _holder_K, B_TYPE ## _holder_N, B_ ## B_TYPE, BT_ ## B_TYPE, epsilon);
\
// //C=matrix
TEST_ALL_LAYOUTS(matrix, matrix, matrix)
TEST_ALL_LAYOUTS(matrix, matrix, range)
TEST_ALL_LAYOUTS(matrix, matrix, slice)
TEST_ALL_LAYOUTS(matrix, range, matrix)
TEST_ALL_LAYOUTS(matrix, range, range)
TEST_ALL_LAYOUTS(matrix, range, slice)
TEST_ALL_LAYOUTS(matrix, slice, matrix)
TEST_ALL_LAYOUTS(matrix, slice, range)
TEST_ALL_LAYOUTS(matrix, slice, slice)
// C = range
TEST_ALL_LAYOUTS(range, matrix, matrix)
TEST_ALL_LAYOUTS(range, matrix, range)
TEST_ALL_LAYOUTS(range, matrix, slice)
TEST_ALL_LAYOUTS(range, range, matrix)
TEST_ALL_LAYOUTS(range, range, range)
TEST_ALL_LAYOUTS(range, range, slice)
TEST_ALL_LAYOUTS(range, slice, matrix)
TEST_ALL_LAYOUTS(range, slice, range)
TEST_ALL_LAYOUTS(range, slice, slice)
// C = slice
TEST_ALL_LAYOUTS(slice, matrix, matrix)
TEST_ALL_LAYOUTS(slice, matrix, range)
TEST_ALL_LAYOUTS(slice, matrix, slice)
TEST_ALL_LAYOUTS(slice, range, matrix)
TEST_ALL_LAYOUTS(slice, range, range)
TEST_ALL_LAYOUTS(slice, range, slice)
TEST_ALL_LAYOUTS(slice, slice, matrix)
TEST_ALL_LAYOUTS(slice, slice, range)
TEST_ALL_LAYOUTS(slice, slice, slice)
#undef TEST_ALL_LAYOUTS
return failures_count;
}
int main()
{
int n_failures = 0;
std::cout << ">>> float" << std::endl;
n_failures += run_test<float>(1e-5);
std::cout << ">>> double" << std::endl;
n_failures += run_test<double>(1e-9);
if(n_failures>0)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

214
tests/blas3.cpp Normal file
View File

@@ -0,0 +1,214 @@
#include <cstddef>
#include "common.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/matrix_proxy.hpp"
#include "viennacl/linalg/prod.hpp"
template<typename T, typename CType, typename AType, typename BType>
int test_layout(CType & C, AType const & A, AType const & AT, BType const & B, BType const & BT,
simple_matrix<T> const & ground, T epsilon)
{
using viennacl::linalg::prod;
using viennacl::trans;
int failures_count = 0;
simple_matrix<T> tmp(C.size1(), C.size2());
#define TEST_OP(NAME, OPERATION)\
std::cout << NAME " ..." << std::flush;\
OPERATION;\
viennacl::copy(C, tmp);\
if (failure(ground, tmp, epsilon))\
{\
std::cout << " [Failure!]" << std::endl;\
failures_count++;\
}\
else\
std::cout << std::endl;
TEST_OP("C = A.B", C = prod(A, B));
TEST_OP("C = A'.B", C = prod(trans(AT), B));
TEST_OP("C = A.B'", C = prod(A, trans(BT)));
TEST_OP("C = A'.B'", C = prod(trans(AT), trans(BT)));
return failures_count;
}
template<typename T, typename RefAType, typename RefBType, typename RefCType>
int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & cA, RefAType & cAT, int BK, int BN, RefBType & cB, RefBType & cBT, T epsilon)
{
viennacl::matrix<T, viennacl::row_major> ArowTmp(AM, AK);
viennacl::matrix<T, viennacl::row_major> ATrowTmp(AK, AM);
viennacl::matrix<T, viennacl::row_major> BrowTmp(BK, BN);
viennacl::matrix<T, viennacl::row_major> BTrowTmp(BN, BK);
viennacl::matrix<T, viennacl::row_major> CrowTmp(CM, CN);
viennacl::matrix<T, viennacl::column_major> AcolTmp(AM, AK);
viennacl::matrix<T, viennacl::column_major> ATcolTmp(AK, AM);
viennacl::matrix<T, viennacl::column_major> BcolTmp(BK, BN);
viennacl::matrix<T, viennacl::column_major> BTcolTmp(BN, BK);
viennacl::matrix<T, viennacl::column_major> CcolTmp(CM, CN);
typename matrix_maker<RefCType, viennacl::row_major>::result_type Crow = matrix_maker<RefCType, viennacl::row_major>::make(CrowTmp, cC);
typename matrix_maker<RefAType, viennacl::row_major>::result_type Arow = matrix_maker<RefAType, viennacl::row_major>::make(ArowTmp, cA);
typename matrix_maker<RefAType, viennacl::row_major>::result_type ATrow = matrix_maker<RefAType, viennacl::row_major>::make(ATrowTmp, cAT);
typename matrix_maker<RefBType, viennacl::row_major>::result_type Brow = matrix_maker<RefBType, viennacl::row_major>::make(BrowTmp, cB);
typename matrix_maker<RefBType, viennacl::row_major>::result_type BTrow = matrix_maker<RefBType, viennacl::row_major>::make(BTrowTmp, cBT);
typename matrix_maker<RefCType, viennacl::column_major>::result_type Ccol = matrix_maker<RefCType, viennacl::column_major>::make(CcolTmp, cC);
typename matrix_maker<RefAType, viennacl::column_major>::result_type Acol = matrix_maker<RefAType, viennacl::column_major>::make(AcolTmp, cA);
typename matrix_maker<RefAType, viennacl::column_major>::result_type ATcol = matrix_maker<RefAType, viennacl::column_major>::make(ATcolTmp, cAT);
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
simple_matrix<T> ground = simple_prod<T>(cA, cB);
int failures_count = 0;
#define TEST_LAYOUT(Clayout, Alayout, Blayout) \
std::cout << "> " #Clayout " = " #Alayout "." #Blayout << std::endl; \
failures_count += test_layout(C ## Clayout, A ## Alayout, AT ## Alayout, B ## Blayout, BT ## Blayout, ground, epsilon);
TEST_LAYOUT(row, row, row);
TEST_LAYOUT(row, row, col);
TEST_LAYOUT(row, col, row);
TEST_LAYOUT(row, col, col);
TEST_LAYOUT(col, row, row);
TEST_LAYOUT(col, row, col);
TEST_LAYOUT(col, col, row);
TEST_LAYOUT(col, col, col);
#undef TEST_LAYOUT
return failures_count;
}
template<typename T>
int run_test(T epsilon)
{
typedef viennacl::range range_type;
typedef viennacl::slice slice_type;
typedef simple_matrix<T> matrix_type;
typedef simple_matrix_range<matrix_type> matrix_range_type;
typedef simple_matrix_slice<matrix_type> matrix_slice_type;
int matrix_holder_M = 143;
int matrix_holder_N = 124;
int matrix_holder_K = 184;
int start_M = 14;
int start_N = 20;
int start_K = 73;
int range_holder_M = start_M + matrix_holder_M;
int range_holder_N = start_N + matrix_holder_N;
int range_holder_K = start_K + matrix_holder_K;
range_type range_M(start_M, range_holder_M);
range_type range_N(start_N, range_holder_N);
range_type range_K(start_K, range_holder_K);
int stride_M = 9;
int stride_N = 13;
int stride_K = 4;
int slice_holder_M = start_M + stride_M*matrix_holder_M;
int slice_holder_N = start_N + stride_N*matrix_holder_N;
int slice_holder_K = start_K + stride_K*matrix_holder_K;
slice_type slice_M(start_M, stride_M, matrix_holder_M);
slice_type slice_N(start_N, stride_N, matrix_holder_N);
slice_type slice_K(start_K, stride_K, matrix_holder_K);
int failures_count = 0;
#define DECLARE(NAME, size1, size2) \
matrix_type NAME ## _matrix(matrix_holder_ ## size1, matrix_holder_ ## size2);\
init_rand(NAME ## _matrix);\
matrix_type NAME ## T_matrix = simple_trans(NAME ## _matrix);\
\
matrix_type NAME ## _range_holder(range_holder_ ## size1, range_holder_ ## size2);\
init_rand(NAME ## _range_holder);\
matrix_range_type NAME ## _range(NAME ## _range_holder, range_ ## size1, range_ ## size2);\
matrix_type NAME ## T_range_holder = simple_trans(NAME ## _range_holder);\
matrix_range_type NAME ## T_range(NAME ## T_range_holder, range_ ## size2, range_ ## size1);\
\
matrix_type NAME ## _slice_holder(slice_holder_ ## size1, slice_holder_ ## size2);\
init_rand(NAME ## _slice_holder);\
matrix_slice_type NAME ## _slice(NAME ## _slice_holder, slice_ ## size1, slice_ ## size2);\
matrix_type NAME ## T_slice_holder = simple_trans(NAME ## _slice_holder);\
matrix_slice_type NAME ## T_slice(NAME ## T_slice_holder, slice_ ## size2, slice_ ## size1);\
DECLARE(A, M, K);
DECLARE(B, K, N);
DECLARE(C, M, N);
#undef DECLARE
#define TEST_ALL_LAYOUTS(C_TYPE, A_TYPE, B_TYPE)\
std::cout << ">> " #C_TYPE " = " #A_TYPE "." #B_TYPE << std::endl;\
failures_count += test_all_layouts<T>(C_TYPE ## _holder_M, C_TYPE ## _holder_N, C_ ## C_TYPE,\
A_TYPE ## _holder_M, A_TYPE ## _holder_K, A_ ## A_TYPE, AT_ ## A_TYPE,\
B_TYPE ## _holder_K, B_TYPE ## _holder_N, B_ ## B_TYPE, BT_ ## B_TYPE, epsilon);
\
// //C=matrix
TEST_ALL_LAYOUTS(matrix, matrix, matrix)
TEST_ALL_LAYOUTS(matrix, matrix, range)
TEST_ALL_LAYOUTS(matrix, matrix, slice)
TEST_ALL_LAYOUTS(matrix, range, matrix)
TEST_ALL_LAYOUTS(matrix, range, range)
TEST_ALL_LAYOUTS(matrix, range, slice)
TEST_ALL_LAYOUTS(matrix, slice, matrix)
TEST_ALL_LAYOUTS(matrix, slice, range)
TEST_ALL_LAYOUTS(matrix, slice, slice)
// C = range
TEST_ALL_LAYOUTS(range, matrix, matrix)
TEST_ALL_LAYOUTS(range, matrix, range)
TEST_ALL_LAYOUTS(range, matrix, slice)
TEST_ALL_LAYOUTS(range, range, matrix)
TEST_ALL_LAYOUTS(range, range, range)
TEST_ALL_LAYOUTS(range, range, slice)
TEST_ALL_LAYOUTS(range, slice, matrix)
TEST_ALL_LAYOUTS(range, slice, range)
TEST_ALL_LAYOUTS(range, slice, slice)
// C = slice
TEST_ALL_LAYOUTS(slice, matrix, matrix)
TEST_ALL_LAYOUTS(slice, matrix, range)
TEST_ALL_LAYOUTS(slice, matrix, slice)
TEST_ALL_LAYOUTS(slice, range, matrix)
TEST_ALL_LAYOUTS(slice, range, range)
TEST_ALL_LAYOUTS(slice, range, slice)
TEST_ALL_LAYOUTS(slice, slice, matrix)
TEST_ALL_LAYOUTS(slice, slice, range)
TEST_ALL_LAYOUTS(slice, slice, slice)
#undef TEST_ALL_LAYOUTS
return failures_count;
}
int main()
{
int n_failures = 0;
std::cout << ">>> float" << std::endl;
n_failures += run_test<float>(1e-5);
std::cout << ">>> double" << std::endl;
n_failures += run_test<double>(1e-9);
if(n_failures>0)
return EXIT_FAILURE;
return EXIT_SUCCESS;
}

314
tests/common.hpp Normal file
View File

@@ -0,0 +1,314 @@
#ifndef TEST_COMMON_HPP_
#define TEST_COMMON_HPP_
#include "vector"
#include "viennacl/matrix.hpp"
#include "viennacl/matrix_proxy.hpp"
#include "atidlas/forwards.h"
typedef atidlas::atidlas_int_t int_t;
/*---------
* Vector
* -------*/
template<class NumericT>
class simple_vector
{
public:
typedef NumericT value_type;
typedef size_t size_type;
simple_vector(size_t N) : N_(N), data_(N){ }
size_t size() const { return N_; }
value_type & operator[](size_t i) { return data_[i]; }
value_type operator[](size_t i) const { return data_[i]; }
size_t internal_size() const { return data_.size(); }
typename std::vector<value_type>::iterator begin() { return data_.begin(); }
typename std::vector<value_type>::iterator end() { return data_.begin() + size(); }
typename std::vector<value_type>::const_iterator begin() const { return data_.begin(); }
typename std::vector<value_type>::const_iterator end() const { return data_.begin() + size(); }
private:
size_t N_;
std::vector<value_type> data_;
};
template<class T>
class simple_vector_range
{
public:
typedef typename T::value_type value_type;
typedef typename T::size_type size_type;
simple_vector_range(simple_vector<value_type> & data, viennacl::range const & r) : data_(data), r_(r) { }
size_t size() const { return r_.size(); }
viennacl::range const & range() const { return r_; }
value_type & operator[](size_t i) { return data_[i]; }
value_type operator[](size_t i) const { return data_[i]; }
size_t internal_size() const { return data_.size(); }
typename std::vector<value_type>::iterator begin() { return data_.begin(); }
typename std::vector<value_type>::iterator end() { return data_.begin() + size(); }
typename std::vector<value_type>::const_iterator begin() const { return data_.begin(); }
typename std::vector<value_type>::const_iterator end() const { return data_.begin() + size(); }
private:
simple_vector<value_type> & data_;
viennacl::range r_;
};
template<class T>
class simple_vector_slice
{
public:
typedef typename T::value_type value_type;
typedef typename T::size_type size_type;
simple_vector_slice(simple_vector<value_type> & data, viennacl::slice const & s) : data_(data), s_(s) { }
size_type size() const { return s_.size(); }
size_t internal_size() const { return data_.size(); }
viennacl::slice const & slice() const { return s_; }
value_type & operator[](size_t i) { return data_[i]; }
value_type operator[](size_t i) const { return data_[i]; }
private:
simple_vector<value_type> & data_;
viennacl::slice s_;
};
//Helper to initialize a viennacl vector from a simple type
template<class SimpleType>
struct vector_maker;
template<class NumericT>
struct vector_maker< simple_vector<NumericT> >
{
typedef viennacl::vector<NumericT> result_type;
static result_type make(viennacl::vector<NumericT> const &, simple_vector<NumericT> & base)
{
viennacl::vector<NumericT> result(base.size());
viennacl::copy(base, result);
return result;
}
};
template<class NumericT>
struct vector_maker< simple_vector_range< simple_vector<NumericT> > >
{
typedef viennacl::vector_range< viennacl::vector<NumericT> > result_type;
static result_type make(viennacl::vector<NumericT> & x, simple_vector_range< simple_vector<NumericT> > & base)
{
result_type result(x, base.range());
viennacl::copy(base, result);
return result;
}
};
template<class NumericT>
struct vector_maker< simple_vector_slice<simple_vector<NumericT> > >
{
typedef viennacl::vector_slice< viennacl::vector<NumericT> > result_type;
static result_type make(viennacl::vector<NumericT> & M, simple_vector_slice< simple_vector<NumericT> > & base)
{
result_type result(M, base.slice());
viennacl::copy(base, result);
return result;
}
};
/*---------
* Matrix
* -------*/
template<class NumericT>
class simple_matrix
{
public:
typedef NumericT value_type;
typedef size_t size_type;
simple_matrix(size_t M, size_t N) : data_(M*N), M_(M), N_(N){ }
size_t size1() const { return M_; }
size_t size2() const { return N_; }
value_type & operator()(size_t i, size_t j) { return data_[i + M_*j]; }
value_type operator()(size_t i, size_t j) const { return data_[i + M_*j]; }
private:
std::vector<value_type> data_;
size_t M_;
size_t N_;
};
template<class T>
class simple_matrix_range
{
public:
typedef typename T::value_type value_type;
simple_matrix_range(T & A, viennacl::range const & r1, viennacl::range const & r2) : A_(A), r1_(r1), r2_(r2){ }
size_t size1() const { return r1_.size(); }
size_t size2() const { return r2_.size(); }
viennacl::range const & r1() const { return r1_; }
viennacl::range const & r2() const { return r2_; }
value_type & operator()(size_t i, size_t j) { return A_(i+r1_.start(), j+r2_.start()); }
value_type operator()(size_t i, size_t j) const { return A_(i+r1_.start(), j+r2_.start()); }
private:
T & A_;
viennacl::range r1_;
viennacl::range r2_;
};
template<class T>
class simple_matrix_slice
{
public:
typedef typename T::value_type value_type;
simple_matrix_slice(T & A, viennacl::slice const & s1, viennacl::slice const & s2) : A_(A), s1_(s1), s2_(s2){ }
viennacl::slice::size_type size1() const { return s1_.size(); }
viennacl::slice::size_type size2() const { return s2_.size(); }
viennacl::slice const & s1() const { return s1_; }
viennacl::slice const & s2() const { return s2_; }
value_type & operator()(size_t i, size_t j) { return A_(i*s1_.stride() + s1_.start(), j*s2_.stride() + s2_.start()); }
value_type operator()(size_t i, size_t j) const { return A_(i+s1_.start(), j+s2_.start()); }
private:
T & A_;
viennacl::slice s1_;
viennacl::slice s2_;
};
/*-------
* Helpers
*-------*/
template<typename T>
void init_rand(simple_matrix<T> & A)
{
for (unsigned int i = 0; i < A.size1(); ++i)
for (unsigned int j = 0; j < A.size2(); ++j)
A(i, j) = T(0.1) * rand()/RAND_MAX;
}
template<typename T>
void init_rand(simple_vector<T> & x)
{
for (unsigned int i = 0; i < x.size(); ++i)
x[i] = T(0.1) * rand()/RAND_MAX;
}
template<class T>
simple_matrix<T> simple_trans(simple_matrix<T> const & A)
{
int M = A.size1();
int N = A.size2();
simple_matrix<T> result(N, M);
for(int i = 0; i < N; ++i)
for(int j = 0; j < M; ++j)
result(i,j) = A(j,i);
return result;
}
template<class T, class U, class V>
simple_matrix<T> simple_prod(U const & A, V const & B)
{
int M = A.size1();
int N = B.size2();
int K = A.size2();
simple_matrix<T> result(M, N);
for(int i = 0 ; i < M ; ++i)
for(int j = 0 ; j < N ; ++j)
{
T val = 0;
for(int k = 0 ; k < K ; ++k)
val+= A(i, k)*B(k,j);
result(i, j) = val;
}
return result;
}
template<class SimpleType, class F>
struct matrix_maker;
template<class T, class F>
struct matrix_maker< simple_matrix<T>, F>
{
typedef viennacl::matrix<T, F> result_type;
static result_type make(viennacl::matrix<T, F> const &, simple_matrix<T> & base)
{
viennacl::matrix<T, F> result(base.size1(), base.size2());
viennacl::copy(base, result);
return result;
}
};
template<class MatrixT, class F>
struct matrix_maker< simple_matrix_range<MatrixT>, F>
{
typedef typename MatrixT::value_type T;
typedef viennacl::matrix_range< viennacl::matrix<T, F> > result_type;
static result_type make(viennacl::matrix<T, F> & M, simple_matrix_range<MatrixT> & base)
{
result_type result(M, base.r1(), base.r2());
viennacl::copy(base, result);
return result;
}
};
template<class MatrixT, class F>
struct matrix_maker< simple_matrix_slice<MatrixT>, F>
{
typedef typename MatrixT::value_type T;
typedef viennacl::matrix_slice< viennacl::matrix<T, F> > result_type;
static result_type make(viennacl::matrix<T, F> & M, simple_matrix_slice<MatrixT> & base)
{
result_type result(M, base.s1(), base.s2());
viennacl::copy(base, result);
return result;
}
};
template<class VectorType>
bool failure_vector(VectorType const & x, VectorType const & y, typename VectorType::value_type epsilon)
{
typedef typename VectorType::value_type value_type;
for(int_t i = 0 ; i < x.size() ; ++i)
{
value_type delta = std::abs(x[i] - y[i]);
if(delta > epsilon)
return true;
}
return false;
}
template<class NumericT>
bool failure(simple_matrix<NumericT> const & A, simple_matrix<NumericT> const & B, NumericT epsilon)
{
int M = A.size1();
int N = A.size2();
for(int i = 0 ; i < M ; ++i)
for(int j = 0 ; j < N ; ++j)
{
NumericT delta = std::abs(A(i,j) - B(i,j));
if(delta > epsilon)
return true;
}
return false;
}
namespace viennacl
{
namespace traits
{
template<class T> int size1(simple_matrix<T> const & M) { return M.size1(); }
template<class T> int size2(simple_matrix<T> const & M) { return M.size2(); }
}
}
#endif