Initial commit
This commit is contained in:
18
CMakeLists.txt
Normal file
18
CMakeLists.txt
Normal file
@@ -0,0 +1,18 @@
|
||||
cmake_minimum_required(VERSION 2.8)
|
||||
|
||||
# Add visibility of headers
|
||||
file( GLOB_RECURSE MAKE_HEADERS_VISIBLE_SRC *.hpp *.h)
|
||||
add_custom_target( MAKE_HEADERS_VISIBLE SOURCES ${MAKE_HEADERS_VISIBLE_SRC} )
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake_modules")
|
||||
|
||||
find_package(ViennaCL QUIET REQUIRED)
|
||||
include_directories(${VIENNACL_INCLUDE_DIRS})
|
||||
|
||||
find_package(OpenCL QUIET REQUIRED)
|
||||
include_directories(${OPENCL_INCLUDE_DIRS})
|
||||
|
||||
include_directories(${PROJECT_SOURCE_DIR})
|
||||
|
||||
INCLUDE(CTest)
|
||||
add_subdirectory(tests)
|
31
atidlas/execute.hpp
Normal file
31
atidlas/execute.hpp
Normal file
@@ -0,0 +1,31 @@
|
||||
#ifndef VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
|
||||
#define VIENNACL_DEVICE_SPECIFIC_EXECUTE_HPP
|
||||
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
#include <typeinfo>
|
||||
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
#include "viennacl/tools/timer.hpp"
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/forwards.h"
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/execution_handler.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
inline void execute(template_base const & T, statements_container const & statements, viennacl::ocl::context & ctx = viennacl::ocl::current_context(), bool force_compilation = false)
|
||||
{
|
||||
//Generate program name
|
||||
std::string program_name = tree_parsing::statements_representation(statements, BIND_TO_HANDLE);
|
||||
execution_handler handler(program_name, ctx, ctx.current_device(), force_compilation);
|
||||
handler.add(program_name, T, statements);
|
||||
handler.execute(program_name, statements);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
77
atidlas/execution_handler.hpp
Normal file
77
atidlas/execution_handler.hpp
Normal file
@@ -0,0 +1,77 @@
|
||||
#ifndef ATIDLAS_EXECUTION_HANDLER_HPP
|
||||
#define ATIDLAS_EXECUTION_HANDLER_HPP
|
||||
|
||||
#include <map>
|
||||
|
||||
#include "viennacl/tools/shared_ptr.hpp"
|
||||
|
||||
#include "atidlas/lazy_program_compiler.hpp"
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
class execution_handler
|
||||
{
|
||||
public:
|
||||
typedef std::map< std::string, tools::shared_ptr<template_base> > container_type;
|
||||
|
||||
private:
|
||||
std::string append_prefix(std::string const & str)
|
||||
{
|
||||
return "_" + str;
|
||||
}
|
||||
|
||||
std::string define_extension(std::string const & ext)
|
||||
{
|
||||
// Note: On devices without double precision support, 'ext' is an empty string.
|
||||
return (ext.length() > 1) ? std::string("#pragma OPENCL EXTENSION " + ext + " : enable\n") : std::string("\n");
|
||||
}
|
||||
|
||||
void init_program_compiler(std::string const & name, bool force_recompilation)
|
||||
{
|
||||
lazy_programs_.push_back(lazy_program_compiler(&ctx_, name, force_recompilation));
|
||||
lazy_programs_.back().add(define_extension(device_.double_support_extension()));
|
||||
}
|
||||
|
||||
public:
|
||||
execution_handler(std::string const & program_name_base, viennacl::ocl::context & ctx, viennacl::ocl::device const & device, bool force_recompilation = false) : ctx_(ctx), device_(device), program_names_(2), init_done_(false)
|
||||
{
|
||||
lazy_programs_.reserve(2);
|
||||
init_program_compiler(program_name_base + "_0", force_recompilation);
|
||||
init_program_compiler(program_name_base + "_1", force_recompilation);
|
||||
}
|
||||
|
||||
void add(std::string const & key, template_base const & T, statements_container const & statements)
|
||||
{
|
||||
if (kernels_.insert(container_type::value_type(key, T.clone())).second)
|
||||
{
|
||||
std::vector<std::string> sources = kernels_.at(key)->generate(append_prefix(key), statements, device_);
|
||||
assert(sources.size()<=2);
|
||||
for (unsigned int i = 0; i < sources.size(); ++i)
|
||||
lazy_programs_[i].add(sources[i]);
|
||||
}
|
||||
}
|
||||
|
||||
template_base * template_of(std::string const & key)
|
||||
{
|
||||
return kernels_.at(key).get();
|
||||
}
|
||||
|
||||
void execute(container_type::key_type const & key, statements_container const & statements)
|
||||
{
|
||||
tools::shared_ptr<template_base> & template_pointer = kernels_.at(key);
|
||||
template_pointer->enqueue(append_prefix(key), lazy_programs_, statements);
|
||||
}
|
||||
|
||||
private:
|
||||
viennacl::ocl::context & ctx_;
|
||||
viennacl::ocl::device const & device_;
|
||||
container_type kernels_;
|
||||
std::vector<std::string> program_names_;
|
||||
std::vector<lazy_program_compiler> lazy_programs_;
|
||||
bool init_done_;
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
258
atidlas/forwards.h
Normal file
258
atidlas/forwards.h
Normal file
@@ -0,0 +1,258 @@
|
||||
#ifndef ATIDLAS_FORWARDS_H
|
||||
#define ATIDLAS_FORWARDS_H
|
||||
|
||||
#include <list>
|
||||
#include <map>
|
||||
#include <set>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "atidlas/tools/shared_ptr.hpp"
|
||||
|
||||
#include "viennacl/scheduler/io.hpp"
|
||||
#include "viennacl/ocl/forwards.h"
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
#include "viennacl/backend/mem_handle.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
typedef int atidlas_int_t;
|
||||
|
||||
//Error codes
|
||||
static const int TEMPLATE_VALID = 0;
|
||||
static const int TEMPLATE_LOCAL_MEMORY_OVERFLOW = -1;
|
||||
static const int TEMPLATE_WORK_GROUP_SIZE_OVERFLOW = -2;
|
||||
static const int TEMPLATE_LOCAL_SIZE_0_OVERFLOW = -3;
|
||||
static const int TEMPLATE_LOCAL_SIZE_1_OVERFLOW = -4;
|
||||
static const int TEMPLATE_LOCAL_SIZE_2_OVERFLOW = -5;
|
||||
static const int TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE = -6;
|
||||
static const int TEMPLATE_INVALID_SIMD_WIDTH = -7;
|
||||
static const int TEMPLATE_AlignmentV_MUST_BE_BLOCK_SIZE_MULTIPLE = -8;
|
||||
static const int TEMPLATE_INVALID_FETCHING_POLICY_TYPE= -9;
|
||||
|
||||
static const int TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH = -10;
|
||||
static const int TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE = -11;
|
||||
static const int TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL = -12;
|
||||
static const int TEMPLATE_SIMD_WIDTH_MUST_BE_ONE = -13;
|
||||
static const int TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT = -14;
|
||||
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE = -15;
|
||||
static const int TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE = -16;
|
||||
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE = -17;
|
||||
static const int TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE = -18;
|
||||
|
||||
struct atidlas_int_tuple
|
||||
{
|
||||
atidlas_int_tuple(std::string const & _i, std::string const & _bound0) : i(_i), bound0(_bound0), j(""), bound1(""){ }
|
||||
atidlas_int_tuple(std::string const & _i, std::string const & _bound0, std::string const & _j, std::string const & _bound1) : i(_i), bound0(_bound0), j(_j), bound1(_bound1){ }
|
||||
std::string i;
|
||||
std::string bound0;
|
||||
std::string j;
|
||||
std::string bound1;
|
||||
};
|
||||
|
||||
inline bool is_scalar_reduction(viennacl::scheduler::statement_node const & node)
|
||||
{
|
||||
return node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE || node.op.type_family==viennacl::scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY;
|
||||
}
|
||||
|
||||
inline bool is_vector_reduction(viennacl::scheduler::statement_node const & node)
|
||||
{
|
||||
return node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
|
||||
|| node.op.type_family==viennacl::scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|
||||
|| node.op.type_family==viennacl::scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
|
||||
}
|
||||
|
||||
inline viennacl::scheduler::statement_node const & lhs_most(viennacl::scheduler::statement::container_type const & array, size_t root)
|
||||
{
|
||||
viennacl::scheduler::statement_node const * current = &array[root];
|
||||
while (current->lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
current = &array[current->lhs.node_index];
|
||||
return *current;
|
||||
}
|
||||
|
||||
enum expression_type
|
||||
{
|
||||
SCALAR_AXPY_TYPE,
|
||||
VECTOR_AXPY_TYPE,
|
||||
MATRIX_AXPY_TYPE,
|
||||
REDUCTION_TYPE,
|
||||
ROW_WISE_REDUCTION_Nx_TYPE,
|
||||
ROW_WISE_REDUCTION_Tx_TYPE,
|
||||
MATRIX_PRODUCT_NN_TYPE,
|
||||
MATRIX_PRODUCT_TN_TYPE,
|
||||
MATRIX_PRODUCT_NT_TYPE,
|
||||
MATRIX_PRODUCT_TT_TYPE,
|
||||
INVALID_EXPRESSION_TYPE
|
||||
};
|
||||
|
||||
inline const char * expression_type_to_string(expression_type type)
|
||||
{
|
||||
switch (type)
|
||||
{
|
||||
case SCALAR_AXPY_TYPE : return "Scalar AXPY";
|
||||
case VECTOR_AXPY_TYPE : return "Vector AXPY";
|
||||
case MATRIX_AXPY_TYPE : return "Matrix AXPY";
|
||||
case REDUCTION_TYPE : return "Reduction";
|
||||
case ROW_WISE_REDUCTION_Nx_TYPE : return "Row-wise reduction: Ax";
|
||||
case ROW_WISE_REDUCTION_Tx_TYPE : return "Row-wise reduction : Tx";
|
||||
case MATRIX_PRODUCT_NN_TYPE : return "Matrix-Matrix Product : AA";
|
||||
case MATRIX_PRODUCT_TN_TYPE : return "Matrix-Matrix Product : TA";
|
||||
case MATRIX_PRODUCT_NT_TYPE : return "Matrix-Matrix Product : AT";
|
||||
case MATRIX_PRODUCT_TT_TYPE : return "Matrix-Matrix Product : TT";
|
||||
default : return "INVALID EXPRESSION";
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief generate the string for a pointer kernel argument */
|
||||
static std::string generate_value_kernel_argument(std::string const & scalartype, std::string const & name)
|
||||
{
|
||||
return scalartype + ' ' + name + ",";
|
||||
}
|
||||
|
||||
/** @brief generate the string for a pointer kernel argument */
|
||||
static std::string generate_pointer_kernel_argument(std::string const & address_space, std::string const & scalartype, std::string const & name)
|
||||
{
|
||||
return address_space + " " + scalartype + "* " + name + ",";
|
||||
}
|
||||
|
||||
/** @brief Emulation of C++11's .at() member for std::map<> */
|
||||
template<typename KeyT, typename ValueT>
|
||||
ValueT const & at(std::map<KeyT, ValueT> const & map, KeyT const & key)
|
||||
{
|
||||
typename std::map<KeyT, ValueT>::const_iterator it = map.find(key);
|
||||
if (it != map.end())
|
||||
return it->second;
|
||||
|
||||
throw std::out_of_range("Generator: Key not found in map");
|
||||
}
|
||||
|
||||
/** @brief Exception for the case the generator is unable to deal with the operation */
|
||||
class generator_not_supported_exception : public std::exception
|
||||
{
|
||||
public:
|
||||
generator_not_supported_exception() : message_() {}
|
||||
generator_not_supported_exception(std::string message) : message_("ViennaCL: Internal error: The generator cannot handle the statement provided: " + message) {}
|
||||
virtual const char* what() const throw() { return message_.c_str(); }
|
||||
virtual ~generator_not_supported_exception() throw() {}
|
||||
private:
|
||||
std::string message_;
|
||||
};
|
||||
|
||||
namespace utils
|
||||
{
|
||||
class kernel_generation_stream;
|
||||
}
|
||||
|
||||
|
||||
enum leaf_t
|
||||
{
|
||||
LHS_NODE_TYPE,
|
||||
PARENT_NODE_TYPE,
|
||||
RHS_NODE_TYPE
|
||||
};
|
||||
|
||||
class mapped_object;
|
||||
class template_base;
|
||||
|
||||
typedef std::pair<atidlas_int_t, leaf_t> mapping_key;
|
||||
typedef std::map<mapping_key, tools::shared_ptr<mapped_object> > mapping_type;
|
||||
|
||||
|
||||
namespace tree_parsing
|
||||
{
|
||||
|
||||
template<class Fun>
|
||||
inline void traverse(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, Fun const & fun, bool inspect);
|
||||
|
||||
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
|
||||
viennacl::scheduler::statement const & statement, size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed);
|
||||
inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors, viennacl::scheduler::statement const & statement, atidlas_int_t root_idx,mapping_type const & mapping);
|
||||
}
|
||||
|
||||
using viennacl::scheduler::INT_TYPE;
|
||||
using viennacl::scheduler::UINT_TYPE;
|
||||
using viennacl::scheduler::ULONG_TYPE;
|
||||
using viennacl::scheduler::LONG_TYPE;
|
||||
using viennacl::scheduler::FLOAT_TYPE;
|
||||
using viennacl::scheduler::DOUBLE_TYPE;
|
||||
|
||||
typedef cl_uint vendor_id_type;
|
||||
typedef cl_device_type device_type;
|
||||
typedef std::string device_name_type;
|
||||
|
||||
class symbolic_binder
|
||||
{
|
||||
public:
|
||||
virtual ~symbolic_binder(){ }
|
||||
virtual bool bind(viennacl::backend::mem_handle const * ph) = 0;
|
||||
virtual unsigned int get(viennacl::backend::mem_handle const * ph) = 0;
|
||||
};
|
||||
|
||||
class bind_to_handle : public symbolic_binder
|
||||
{
|
||||
public:
|
||||
bind_to_handle() : current_arg_(0){ }
|
||||
bool bind(viennacl::backend::mem_handle const * ph) {return (ph==NULL)?true:memory.insert(std::make_pair((void*)ph, current_arg_)).second; }
|
||||
unsigned int get(viennacl::backend::mem_handle const * ph){ return bind(ph)?current_arg_++:memory.at((void*)ph); }
|
||||
private:
|
||||
unsigned int current_arg_;
|
||||
std::map<void*,unsigned int> memory;
|
||||
};
|
||||
|
||||
class bind_all_unique : public symbolic_binder
|
||||
{
|
||||
public:
|
||||
bind_all_unique() : current_arg_(0){ }
|
||||
bool bind(viennacl::backend::mem_handle const *) {return true; }
|
||||
unsigned int get(viennacl::backend::mem_handle const *){ return current_arg_++; }
|
||||
private:
|
||||
unsigned int current_arg_;
|
||||
std::map<void*,unsigned int> memory;
|
||||
};
|
||||
|
||||
enum binding_policy_t{
|
||||
BIND_ALL_UNIQUE,
|
||||
BIND_TO_HANDLE
|
||||
};
|
||||
|
||||
inline tools::shared_ptr<symbolic_binder> make_binder(binding_policy_t policy)
|
||||
{
|
||||
if (policy==BIND_TO_HANDLE)
|
||||
return tools::shared_ptr<symbolic_binder>(new bind_to_handle());
|
||||
else
|
||||
return tools::shared_ptr<symbolic_binder>(new bind_all_unique());
|
||||
}
|
||||
|
||||
template<char C>
|
||||
struct char_to_type{ };
|
||||
|
||||
class statements_container
|
||||
{
|
||||
public:
|
||||
typedef std::list<viennacl::scheduler::statement> data_type;
|
||||
enum order_type { SEQUENTIAL, INDEPENDENT };
|
||||
|
||||
statements_container(data_type const & data, order_type order) : data_(data), order_(order)
|
||||
{ }
|
||||
|
||||
statements_container(viennacl::scheduler::statement const & s0) : order_(INDEPENDENT)
|
||||
{
|
||||
data_.push_back(s0);
|
||||
}
|
||||
|
||||
statements_container(viennacl::scheduler::statement const & s0, viennacl::scheduler::statement const & s1, order_type order) : order_(order)
|
||||
{
|
||||
data_.push_back(s0);
|
||||
data_.push_back(s1);
|
||||
}
|
||||
|
||||
std::list<viennacl::scheduler::statement> const & data() const { return data_; }
|
||||
|
||||
order_type order() const { return order_; }
|
||||
|
||||
private:
|
||||
std::list<viennacl::scheduler::statement> data_;
|
||||
order_type order_;
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
41
atidlas/lazy_program_compiler.hpp
Normal file
41
atidlas/lazy_program_compiler.hpp
Normal file
@@ -0,0 +1,41 @@
|
||||
#ifndef ATIDLAS_LAZY_PROGRAM_COMPILER_HPP
|
||||
#define ATIDLAS_LAZY_PROGRAM_COMPILER_HPP
|
||||
|
||||
#include <map>
|
||||
#include "viennacl/ocl/context.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
class lazy_program_compiler
|
||||
{
|
||||
public:
|
||||
|
||||
lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, std::string const & src, bool force_recompilation) : ctx_(ctx), name_(name), src_(src), force_recompilation_(force_recompilation){ }
|
||||
lazy_program_compiler(viennacl::ocl::context * ctx, std::string const & name, bool force_recompilation) : ctx_(ctx), name_(name), force_recompilation_(force_recompilation){ }
|
||||
|
||||
void add(std::string const & src) { src_+=src; }
|
||||
|
||||
std::string const & src() const { return src_; }
|
||||
|
||||
viennacl::ocl::program & program()
|
||||
{
|
||||
if (force_recompilation_ && ctx_->has_program(name_))
|
||||
ctx_->delete_program(name_);
|
||||
if (!ctx_->has_program(name_))
|
||||
{
|
||||
// std::cout << src_ << std::endl;
|
||||
ctx_->add_program(src_, name_);
|
||||
}
|
||||
return ctx_->get_program(name_);
|
||||
}
|
||||
|
||||
private:
|
||||
viennacl::ocl::context * ctx_;
|
||||
std::string name_;
|
||||
std::string src_;
|
||||
bool force_recompilation_;
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
429
atidlas/mapped_objects.hpp
Normal file
429
atidlas/mapped_objects.hpp
Normal file
@@ -0,0 +1,429 @@
|
||||
#ifndef ATIDLAS_MAPPED_TYPE_HPP
|
||||
#define ATIDLAS_MAPPED_TYPE_HPP
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/forwards.h"
|
||||
#include "atidlas/tools/find_and_replace.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
/** @brief Mapped Object
|
||||
*
|
||||
* This object populates the symbolic mapping associated with a statement. (root_id, LHS|RHS|PARENT) => mapped_object
|
||||
* The tree can then be reconstructed in its symbolic form
|
||||
*/
|
||||
class mapped_object
|
||||
{
|
||||
private:
|
||||
virtual void postprocess(std::string &) const { }
|
||||
|
||||
protected:
|
||||
struct MorphBase { virtual ~MorphBase(){} };
|
||||
struct MorphBase1D : public MorphBase { public: virtual std::string operator()(std::string const & i) const = 0; };
|
||||
struct MorphBase2D : public MorphBase { public: virtual std::string operator()(std::string const & i, std::string const & j) const = 0; };
|
||||
|
||||
static void replace_offset(std::string & str, MorphBase const & morph)
|
||||
{
|
||||
size_t pos = 0;
|
||||
while ((pos=str.find("$OFFSET", pos))!=std::string::npos)
|
||||
{
|
||||
std::string postprocessed;
|
||||
size_t pos_po = str.find('{', pos);
|
||||
size_t pos_pe = str.find('}', pos_po);
|
||||
|
||||
if (MorphBase2D const * p = dynamic_cast<MorphBase2D const *>(&morph))
|
||||
{
|
||||
size_t pos_comma = str.find(',', pos_po);
|
||||
std::string i = str.substr(pos_po + 1, pos_comma - pos_po - 1);
|
||||
std::string j = str.substr(pos_comma + 1, pos_pe - pos_comma - 1);
|
||||
postprocessed = (*p)(i, j);
|
||||
}
|
||||
else if (MorphBase1D const * p = dynamic_cast<MorphBase1D const *>(&morph))
|
||||
{
|
||||
std::string i = str.substr(pos_po + 1, pos_pe - pos_po - 1);
|
||||
postprocessed = (*p)(i);
|
||||
}
|
||||
|
||||
str.replace(pos, pos_pe + 1 - pos, postprocessed);
|
||||
pos = pos_pe;
|
||||
}
|
||||
}
|
||||
|
||||
void register_attribute(std::string & attribute, std::string const & key, std::string const & value)
|
||||
{
|
||||
attribute = value;
|
||||
keywords_[key] = attribute;
|
||||
}
|
||||
|
||||
public:
|
||||
struct node_info
|
||||
{
|
||||
node_info(mapping_type const * _mapping, viennacl::scheduler::statement const * _statement, atidlas_int_t _root_idx) :
|
||||
mapping(_mapping), statement(_statement), root_idx(_root_idx) { }
|
||||
mapping_type const * mapping;
|
||||
viennacl::scheduler::statement const * statement;
|
||||
atidlas_int_t root_idx;
|
||||
};
|
||||
|
||||
public:
|
||||
mapped_object(std::string const & scalartype, unsigned int id, std::string const & type_key) : type_key_(type_key)
|
||||
{
|
||||
register_attribute(scalartype_, "#scalartype", scalartype);
|
||||
register_attribute(name_, "#name", "obj" + tools::to_string(id));
|
||||
}
|
||||
|
||||
virtual ~mapped_object(){ }
|
||||
|
||||
std::string type_key() const { return type_key_; }
|
||||
|
||||
std::string const & name() const { return name_; }
|
||||
|
||||
std::string process(std::string const & in) const
|
||||
{
|
||||
std::string res(in);
|
||||
for (std::map<std::string,std::string>::const_iterator it = keywords_.begin(); it != keywords_.end(); ++it)
|
||||
tools::find_and_replace(res, it->first, it->second);
|
||||
postprocess(res);
|
||||
return res;
|
||||
}
|
||||
|
||||
std::string evaluate(std::map<std::string, std::string> const & accessors) const
|
||||
{
|
||||
if (accessors.find(type_key_)==accessors.end())
|
||||
return name_;
|
||||
return process(accessors.at(type_key_));
|
||||
}
|
||||
|
||||
|
||||
protected:
|
||||
std::string name_;
|
||||
std::string scalartype_;
|
||||
std::string type_key_;
|
||||
std::map<std::string, std::string> keywords_;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Binary leaf interface
|
||||
*
|
||||
* Some subtrees have to be interpret at leaves when reconstructing the final expression. It is the case of trans(), diag(), prod(), etc...
|
||||
* This interface stores basic infos about the sub-trees
|
||||
*/
|
||||
class binary_leaf
|
||||
{
|
||||
public:
|
||||
binary_leaf(mapped_object::node_info info) : info_(info){ }
|
||||
|
||||
void process_recursive(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors)
|
||||
{
|
||||
std::set<std::string> already_fetched;
|
||||
tree_parsing::process(stream, leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping, already_fetched);
|
||||
}
|
||||
|
||||
std::string evaluate_recursive(leaf_t leaf, std::map<std::string, std::string> const & accessors)
|
||||
{
|
||||
return tree_parsing::evaluate(leaf, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
protected:
|
||||
mapped_object::node_info info_;
|
||||
};
|
||||
|
||||
/** @brief Matrix product
|
||||
*
|
||||
* Maps prod(matrix_expression, matrix_expression)
|
||||
*/
|
||||
class mapped_matrix_product : public mapped_object, public binary_leaf
|
||||
{
|
||||
public:
|
||||
mapped_matrix_product(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_product"), binary_leaf(info) { }
|
||||
};
|
||||
|
||||
/** @brief Reduction
|
||||
*
|
||||
* Base class for mapping a reduction
|
||||
*/
|
||||
class mapped_reduction : public mapped_object, public binary_leaf
|
||||
{
|
||||
public:
|
||||
mapped_reduction(std::string const & scalartype, unsigned int id, node_info info, std::string const & type_key) : mapped_object(scalartype, id, type_key), binary_leaf(info){ }
|
||||
|
||||
atidlas_int_t root_idx() const { return info_.root_idx; }
|
||||
viennacl::scheduler::statement const & statement() const { return *info_.statement; }
|
||||
viennacl::scheduler::statement_node root_node() const { return statement().array()[root_idx()]; }
|
||||
bool is_index_reduction() const { return utils::is_index_reduction(info_.statement->array()[info_.root_idx].op); }
|
||||
|
||||
viennacl::scheduler::op_element root_op() const
|
||||
{
|
||||
viennacl::scheduler::op_element res = info_.statement->array()[info_.root_idx].op;
|
||||
if (res.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE
|
||||
||res.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
|
||||
res.type = viennacl::scheduler::OPERATION_BINARY_ADD_TYPE;
|
||||
return res;
|
||||
}
|
||||
};
|
||||
|
||||
/** @brief Scalar reduction
|
||||
*
|
||||
* Maps a scalar reduction (max, min, argmax, inner_prod, etc..)
|
||||
*/
|
||||
class mapped_scalar_reduction : public mapped_reduction
|
||||
{
|
||||
public:
|
||||
mapped_scalar_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "scalar_reduction"){ }
|
||||
};
|
||||
|
||||
/** @brief Vector reduction
|
||||
*
|
||||
* Maps a row-wise reduction (max, min, argmax, matrix-vector product, etc..)
|
||||
*/
|
||||
class mapped_row_wise_reduction : public mapped_reduction
|
||||
{
|
||||
public:
|
||||
mapped_row_wise_reduction(std::string const & scalartype, unsigned int id, node_info info) : mapped_reduction(scalartype, id, info, "row_wise_reduction") { }
|
||||
};
|
||||
|
||||
/** @brief Host scalar
|
||||
*
|
||||
* Maps a host scalar (passed by value)
|
||||
*/
|
||||
class mapped_host_scalar : public mapped_object
|
||||
{
|
||||
public:
|
||||
mapped_host_scalar(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "host_scalar"){ }
|
||||
};
|
||||
|
||||
/** @brief Handle
|
||||
*
|
||||
* Maps an object passed by pointer
|
||||
*/
|
||||
class mapped_handle : public mapped_object
|
||||
{
|
||||
public:
|
||||
mapped_handle(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_object(scalartype, id, type_key)
|
||||
{
|
||||
register_attribute(pointer_, "#pointer", name_ + "_pointer");
|
||||
}
|
||||
private:
|
||||
std::string pointer_;
|
||||
};
|
||||
|
||||
|
||||
/** @brief Scalar
|
||||
*
|
||||
* Maps a scalar passed by pointer
|
||||
*/
|
||||
class mapped_scalar : public mapped_handle
|
||||
{
|
||||
public:
|
||||
mapped_scalar(std::string const & scalartype, unsigned int id) : mapped_handle(scalartype, id, "scalar") { }
|
||||
};
|
||||
|
||||
/** @brief Buffered
|
||||
*
|
||||
* Maps a buffered object (vector, matrix)
|
||||
*/
|
||||
class mapped_buffer : public mapped_handle
|
||||
{
|
||||
public:
|
||||
mapped_buffer(std::string const & scalartype, unsigned int id, std::string const & type_key) : mapped_handle(scalartype, id, type_key){ }
|
||||
};
|
||||
|
||||
/** @brief Vector
|
||||
*
|
||||
* Maps a vector
|
||||
*/
|
||||
class mapped_vector : public mapped_buffer
|
||||
{
|
||||
public:
|
||||
mapped_vector(std::string const & scalartype, unsigned int id) : mapped_buffer(scalartype, id, "vector")
|
||||
{
|
||||
register_attribute(start_, "#start", name_ + "_start");
|
||||
register_attribute(stride_, "#stride", name_ + "_stride");
|
||||
}
|
||||
|
||||
private:
|
||||
std::string start_;
|
||||
std::string stride_;
|
||||
};
|
||||
|
||||
/** @brief Matrix
|
||||
*
|
||||
* Maps a matrix
|
||||
*/
|
||||
class mapped_matrix : public mapped_buffer
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string & str) const
|
||||
{
|
||||
struct Morph : public MorphBase2D
|
||||
{
|
||||
Morph(bool _is_row_major, std::string const & _ld) : is_row_major(_is_row_major), ld(_ld){ }
|
||||
std::string operator()(std::string const & i, std::string const & j) const
|
||||
{
|
||||
if (is_row_major)
|
||||
return "(" + i + ") * " + ld + " + (" + j + ")";
|
||||
return "(" + i + ") + (" + j + ") * " + ld;
|
||||
}
|
||||
private:
|
||||
bool is_row_major;
|
||||
std::string const & ld;
|
||||
};
|
||||
replace_offset(str, Morph(row_major_, ld_));
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_matrix(std::string const & scalartype, unsigned int id, bool row_major) : mapped_buffer(scalartype, id, "matrix"), row_major_(row_major)
|
||||
{
|
||||
register_attribute(ld_, "#ld", name_ + "_ld");
|
||||
register_attribute(start1_, "#start1", name_ + "_start1");
|
||||
register_attribute(start2_, "#start2", name_ + "_start2");
|
||||
register_attribute(stride1_, "#stride1", name_ + "_stride1");
|
||||
register_attribute(stride2_, "#stride2", name_ + "_stride2");
|
||||
if (row_major_)
|
||||
keywords_["#nldstride"] = "#stride1";
|
||||
else
|
||||
keywords_["#nldstride"] = "#stride2";
|
||||
}
|
||||
|
||||
bool row_major() const
|
||||
{
|
||||
return row_major_;
|
||||
}
|
||||
|
||||
private:
|
||||
std::string ld_;
|
||||
std::string start1_;
|
||||
std::string start2_;
|
||||
std::string stride1_;
|
||||
std::string stride2_;
|
||||
bool row_major_;
|
||||
};
|
||||
|
||||
/** @brief Vector diag
|
||||
*
|
||||
* Maps a diag(vector_expression) node into a diagonal matrix
|
||||
*/
|
||||
class mapped_vector_diag : public mapped_object, public binary_leaf
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string &res) const
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
|
||||
accessors["vector"] = res;
|
||||
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_vector_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "vector_diag"), binary_leaf(info){ }
|
||||
};
|
||||
|
||||
|
||||
/** @brief Trans
|
||||
*
|
||||
* Maps trans(matrix_expression) into the transposed of matrix_expression
|
||||
*/
|
||||
class mapped_trans: public mapped_object, public binary_leaf
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string &res) const
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["matrix"] = res;
|
||||
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_trans(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_trans"), binary_leaf(info){ }
|
||||
};
|
||||
|
||||
/** @brief Matrix row
|
||||
*
|
||||
* Maps row(matrix_expression, scalar_expression) into the scalar_expression's row of matrix_expression
|
||||
*/
|
||||
class mapped_matrix_row : public mapped_object, binary_leaf
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string &res) const
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
tools::find_and_replace(res, "#row", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
|
||||
accessors["matrix"] = res;
|
||||
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_matrix_row(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_row"), binary_leaf(info)
|
||||
{ }
|
||||
};
|
||||
|
||||
|
||||
/** @brief Matrix column
|
||||
*
|
||||
* Maps column(matrix_expression, scalar_expression) into the scalar_expression's column of matrix_expression
|
||||
*/
|
||||
class mapped_matrix_column : public mapped_object, binary_leaf
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string &res) const
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
tools::find_and_replace(res, "#column", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
|
||||
accessors["matrix"] = res;
|
||||
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_matrix_column(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_column"), binary_leaf(info)
|
||||
{ }
|
||||
};
|
||||
|
||||
/** @brief Matrix diag
|
||||
*
|
||||
* Maps a diag(matrix_expression) node into the vector of its diagonal elements
|
||||
*/
|
||||
class mapped_matrix_diag : public mapped_object, binary_leaf
|
||||
{
|
||||
private:
|
||||
void postprocess(std::string &res) const
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
tools::find_and_replace(res, "#diag_offset", tree_parsing::evaluate(RHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping));
|
||||
accessors["matrix"] = res;
|
||||
res = tree_parsing::evaluate(LHS_NODE_TYPE, accessors, *info_.statement, info_.root_idx, *info_.mapping);
|
||||
}
|
||||
|
||||
public:
|
||||
mapped_matrix_diag(std::string const & scalartype, unsigned int id, node_info info) : mapped_object(scalartype, id, "matrix_diag"), binary_leaf(info)
|
||||
{ }
|
||||
};
|
||||
|
||||
/** @brief Implicit vector
|
||||
*
|
||||
* Maps an implicit vector
|
||||
*/
|
||||
class mapped_implicit_vector : public mapped_object
|
||||
{
|
||||
public:
|
||||
mapped_implicit_vector(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_vector")
|
||||
{ }
|
||||
};
|
||||
|
||||
/** @brief Implicit matrix
|
||||
*
|
||||
* Maps an implicit matrix
|
||||
*/
|
||||
class mapped_implicit_matrix : public mapped_object
|
||||
{
|
||||
public:
|
||||
mapped_implicit_matrix(std::string const & scalartype, unsigned int id) : mapped_object(scalartype, id, "implicit_matrix")
|
||||
{ }
|
||||
};
|
||||
|
||||
}
|
||||
#endif
|
137
atidlas/templates/matrix_axpy_template.hpp
Normal file
137
atidlas/templates/matrix_axpy_template.hpp
Normal file
@@ -0,0 +1,137 @@
|
||||
#ifndef ATIDLAS_TEMPLATES_MATRIX_AXPY_HPP
|
||||
#define ATIDLAS_TEMPLATES_MATRIX_AXPY_HPP
|
||||
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "atidlas/mapped_objects.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
class matrix_axpy_parameters_type : public template_base::parameters_type
|
||||
{
|
||||
public:
|
||||
matrix_axpy_parameters_type(unsigned int _simd_width,
|
||||
unsigned int _local_size_0, unsigned int _local_size_1,
|
||||
unsigned int _num_groups_0, unsigned int _num_groups_1,
|
||||
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1), num_groups_0(_num_groups_0), num_groups_1(_num_groups_1), fetching_policy(_fetching_policy){ }
|
||||
|
||||
unsigned int num_groups_0;
|
||||
unsigned int num_groups_1;
|
||||
fetching_policy_type fetching_policy;
|
||||
};
|
||||
|
||||
class matrix_axpy_template : public template_base_impl<matrix_axpy_template, matrix_axpy_parameters_type>
|
||||
{
|
||||
private:
|
||||
int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
|
||||
{
|
||||
if (p_.simd_width>1)
|
||||
return TEMPLATE_INVALID_SIMD_WIDTH;
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
|
||||
{
|
||||
utils::kernel_generation_stream stream;
|
||||
|
||||
std::string init0, upper_bound0, inc0, init1, upper_bound1, inc1;
|
||||
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
|
||||
stream << "__kernel void " << kernel_prefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
|
||||
("matrix", "#pointer += $OFFSET{#start1, #start2};")
|
||||
("vector", "#pointer += #start;"), statements, mappings);
|
||||
|
||||
fetching_loop_info(p_.fetching_policy, "M", stream, init0, upper_bound0, inc0, "get_global_id(0)", "get_global_size(0)");
|
||||
stream << "for(unsigned int i = " << init0 << "; i < " << upper_bound0 << "; i += " << inc0 << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
fetching_loop_info(p_.fetching_policy, "N", stream, init1, upper_bound1, inc1, "get_global_id(1)", "get_global_size(1)");
|
||||
stream << "for(unsigned int j = " << init1 << "; j < " << upper_bound1 << "; j += " << inc1 << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("matrix", utils::append_width("#scalartype",simd_width) + " #namereg = #pointer[$OFFSET{i*#stride1,j*#stride2}];")
|
||||
("vector_diag", "#scalartype #namereg = ((i + ((#diag_offset<0)?#diag_offset:0))!=(j-((#diag_offset>0)?#diag_offset:0)))?0:#pointer[min(i*#stride, j*#stride)];")
|
||||
, statements, mappings);
|
||||
|
||||
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, utils::create_evaluate_accessors("matrix", "#namereg")
|
||||
("vector_diag", "#namereg")
|
||||
("scalar", "#namereg")
|
||||
, statements, mappings);
|
||||
|
||||
tree_parsing::process(stream, LHS_NODE_TYPE, utils::create_process_accessors("matrix", "#pointer[$OFFSET{i*#stride1,j*#stride2}] = #namereg;")
|
||||
, statements, mappings);
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1));
|
||||
return res;
|
||||
}
|
||||
|
||||
public:
|
||||
matrix_axpy_template(parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<matrix_axpy_template, matrix_axpy_parameters_type>(parameters, binding_policy), up_to_internal_size_(false){ }
|
||||
|
||||
void up_to_internal_size(bool v)
|
||||
{
|
||||
up_to_internal_size_ = v;
|
||||
}
|
||||
|
||||
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
|
||||
{
|
||||
viennacl::ocl::kernel & kernel = programs[0].program().get_kernel(kernel_prefix);
|
||||
|
||||
kernel.local_work_size(0, p_.local_size_0);
|
||||
kernel.local_work_size(1, p_.local_size_1);
|
||||
kernel.global_work_size(0,p_.local_size_0*p_.num_groups_0);
|
||||
kernel.global_work_size(1,p_.local_size_1*p_.num_groups_1);
|
||||
|
||||
scheduler::statement_node const & root = statements.data().front().array()[statements.data().front().root()];
|
||||
unsigned int current_arg = 0;
|
||||
if (up_to_internal_size_)
|
||||
{
|
||||
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size1_fun())));
|
||||
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::internal_size2_fun())));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size1_fun())));
|
||||
kernel.arg(current_arg++, cl_uint(utils::call_on_matrix(root.lhs, utils::size2_fun())));
|
||||
}
|
||||
|
||||
set_arguments(statements, kernel, current_arg);
|
||||
|
||||
viennacl::ocl::enqueue(kernel);
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
bool up_to_internal_size_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
845
atidlas/templates/matrix_product_template.hpp
Normal file
845
atidlas/templates/matrix_product_template.hpp
Normal file
@@ -0,0 +1,845 @@
|
||||
#ifndef ATIDLAS_TEMPLATES_MATRIX_PRODUCT_HPP
|
||||
#define ATIDLAS_TEMPLATES_MATRIX_PRODUCT_HPP
|
||||
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "viennacl/matrix_def.hpp"
|
||||
#include "viennacl/matrix_proxy.hpp"
|
||||
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
#include "atidlas/mapped_objects.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "viennacl/forwards.h"
|
||||
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
struct matrix_product_parameters : public template_base::parameters_type
|
||||
{
|
||||
matrix_product_parameters(unsigned int simd_width
|
||||
, unsigned int local_size_0, unsigned int KL, unsigned int local_size_1
|
||||
, unsigned int ms, unsigned int ks, unsigned int ns
|
||||
, fetching_policy_type A_fetching_policy, fetching_policy_type B_fetching_policy
|
||||
, unsigned int local_fetch_0, unsigned int local_fetch_1): template_base::parameters_type(simd_width, local_size_0, local_size_1, 1),
|
||||
kL(KL), mS(ms), kS(ks), nS(ns), A_fetching_policy(A_fetching_policy), B_fetching_policy(B_fetching_policy),
|
||||
local_fetch_0(local_fetch_0), local_fetch_1(local_fetch_1),
|
||||
mL(ms*local_size_0), nL(ns*local_size_1){}
|
||||
|
||||
unsigned int kL;
|
||||
|
||||
unsigned int mS;
|
||||
unsigned int kS;
|
||||
unsigned int nS;
|
||||
|
||||
fetching_policy_type A_fetching_policy;
|
||||
fetching_policy_type B_fetching_policy;
|
||||
|
||||
unsigned int local_fetch_0;
|
||||
unsigned int local_fetch_1;
|
||||
|
||||
unsigned int mL;
|
||||
unsigned int nL;
|
||||
};
|
||||
|
||||
class matrix_product_template : public template_base_impl<matrix_product_template, matrix_product_parameters>
|
||||
{
|
||||
|
||||
private:
|
||||
unsigned int n_lmem_elements() const
|
||||
{
|
||||
unsigned int N = 0;
|
||||
if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
N += p_.kL * (p_.mL+1);
|
||||
if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
N += p_.nL * (p_.kL+1);
|
||||
return N;
|
||||
}
|
||||
|
||||
int check_invalid_impl(viennacl::ocl::device const & /*device*/) const
|
||||
{
|
||||
if (p_.A_fetching_policy!=FETCH_FROM_LOCAL && p_.B_fetching_policy!=FETCH_FROM_LOCAL&& (p_.local_fetch_0!=0 || p_.local_fetch_1!=0))
|
||||
return TEMPLATE_GLOBAL_MEMORY_REQUIRES_ZERO_LOCAL_FETCH;
|
||||
|
||||
if (viennacl::dense_padding_size % p_.mL > 0 || viennacl::dense_padding_size % p_.kL > 0 || viennacl::dense_padding_size % p_.nL > 0)
|
||||
return TEMPLATE_AlignmentV_MUST_BE_BLOCK_SIZE_MULTIPLE;
|
||||
|
||||
if ((p_.mS % p_.simd_width) > 0 || (p_.nS % p_.simd_width) > 0)
|
||||
return TEMPLATE_MS_NS_MUST_BE_SIMD_WIDTH_MULTIPLE;
|
||||
|
||||
if (p_.kS > p_.kL)
|
||||
return TEMPLATE_KS_MUST_BE_SMALLER_THAN_KL;
|
||||
|
||||
if (!(A_trans_=='N' && B_trans_=='T') && p_.simd_width>1)
|
||||
return TEMPLATE_SIMD_WIDTH_MUST_BE_ONE;
|
||||
|
||||
if (p_.A_fetching_policy==FETCH_FROM_LOCAL || p_.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
if ((p_.local_fetch_0*p_.local_fetch_1) !=(p_.local_size_0*p_.local_size_1))
|
||||
return TEMPLATE_LOCAL_FETCH_PRODUCT_MUST_MATCH_LOCAL_SIZE_PRODUCT;
|
||||
}
|
||||
|
||||
if (p_.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
unsigned int bound1 = (A_trans_=='N')?p_.kL:p_.mL;
|
||||
unsigned int bound0 = (A_trans_=='N')?p_.mL:p_.kL;
|
||||
|
||||
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
|
||||
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
|
||||
|
||||
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
|
||||
return A_trans_=='N'?TEMPLATE_LOCAL_FETCH_0_MUST_BE_NL_MULTIPLE:TEMPLATE_LOCAL_FETCH_0_MUST_BE_KL_MULTIPLE;
|
||||
|
||||
}
|
||||
if (p_.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
unsigned int bound1 = (B_trans_=='T')?p_.kL:p_.nL;
|
||||
unsigned int bound0 = (B_trans_=='T')?p_.nL:p_.kL;
|
||||
|
||||
if (p_.local_fetch_1>0 && (bound1 % p_.local_fetch_1)> 0)
|
||||
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
|
||||
|
||||
if (p_.local_fetch_0>0 && (bound0 % (p_.local_fetch_0*p_.simd_width)) > 0)
|
||||
return B_trans_=='T'?TEMPLATE_LOCAL_FETCH_1_MUST_BE_KL_MULTIPLE:TEMPLATE_LOCAL_FETCH_1_MUST_BE_ML_MULTIPLE;
|
||||
|
||||
}
|
||||
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
static void parse(scheduler::statement const & s,
|
||||
atidlas_int_t & C_idx, leaf_t & C_leaf, atidlas_int_t & alpha_idx, leaf_t & alpha_leaf,
|
||||
atidlas_int_t & A_idx, leaf_t & A_leaf, bool& A_trans, atidlas_int_t & B_idx, leaf_t & B_leaf, bool& B_trans,
|
||||
atidlas_int_t & beta_idx, leaf_t & beta_leaf)
|
||||
{
|
||||
using namespace tree_parsing;
|
||||
using namespace scheduler;
|
||||
|
||||
scheduler::statement::container_type const & array = s.array();
|
||||
atidlas_int_t root_idx = s.root();
|
||||
|
||||
C_idx = root_idx;
|
||||
C_leaf = LHS_NODE_TYPE;
|
||||
|
||||
atidlas_int_t node_add_idx = array[root_idx].rhs.node_index;
|
||||
|
||||
atidlas_int_t node_1_idx = array[node_add_idx].lhs.node_index;
|
||||
alpha_idx = node_1_idx;
|
||||
alpha_leaf = RHS_NODE_TYPE;
|
||||
|
||||
atidlas_int_t mat_prod_idx = array[node_1_idx].lhs.node_index;
|
||||
if (array[mat_prod_idx].lhs.type_family==MATRIX_TYPE_FAMILY)
|
||||
{
|
||||
A_trans = false;
|
||||
A_idx = mat_prod_idx;
|
||||
}
|
||||
else
|
||||
{
|
||||
A_trans = true;
|
||||
A_idx = array[mat_prod_idx].lhs.node_index;
|
||||
}
|
||||
A_leaf = LHS_NODE_TYPE;
|
||||
|
||||
if (array[mat_prod_idx].rhs.type_family==MATRIX_TYPE_FAMILY)
|
||||
{
|
||||
B_trans = false;
|
||||
B_idx = mat_prod_idx;
|
||||
B_leaf = RHS_NODE_TYPE;
|
||||
}
|
||||
else
|
||||
{
|
||||
B_trans = true;
|
||||
B_idx = array[mat_prod_idx].rhs.node_index;
|
||||
B_leaf = LHS_NODE_TYPE;
|
||||
}
|
||||
|
||||
atidlas_int_t node_2_idx = array[node_add_idx].rhs.node_index;
|
||||
beta_idx = node_2_idx;
|
||||
beta_leaf = RHS_NODE_TYPE;
|
||||
}
|
||||
|
||||
void handle_bounds(bool fallback, utils::kernel_generation_stream & stream, std::string const & inbounds, std::string const & do_if, std::string do_else) const
|
||||
{
|
||||
if (fallback)
|
||||
{
|
||||
stream << "if (" << inbounds << ")" << std::endl;
|
||||
stream.inc_tab();
|
||||
stream << do_if << ";" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "else" << std::endl;
|
||||
stream.inc_tab();
|
||||
stream << do_else << ";" << std::endl;
|
||||
stream.dec_tab();
|
||||
}
|
||||
else
|
||||
stream << do_if << ";" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
std::string generate_impl(const std::string &kernel_prefix, const statements_container &statements, const std::vector<mapping_type> &mappings, bool fallback) const
|
||||
{
|
||||
using std::string;
|
||||
using tools::to_string;
|
||||
|
||||
parameters_type pfallback(1, p_.local_size_0, p_.kL, p_.local_size_1, p_.mS, 1, p_.nS, p_.A_fetching_policy, p_.B_fetching_policy, p_.local_fetch_0, p_.local_fetch_1);
|
||||
parameters_type const & p = fallback?pfallback:p_;
|
||||
|
||||
#define MUL_STRIDE1 string(fallback?"*#stride1":"")
|
||||
#define HANDLE_BOUNDS(in_bounds, to_load) (!fallback?string(to_load):string( string(in_bounds) + "?" + string(to_load) + ":0"))
|
||||
#define VLOAD(offset, ptr) vload(p.simd_width, offset, ptr)
|
||||
#define VSTORE(value, offset, ptr) vstore(p.simd_width, value, offset, ptr)
|
||||
|
||||
string widthstr = tools::to_string(p.simd_width);
|
||||
|
||||
//////////////////
|
||||
/// INIT
|
||||
/// //////////////
|
||||
utils::kernel_generation_stream stream;
|
||||
scheduler::statement const & st = statements.data().front();
|
||||
mapping_type const & mapping = mappings.front();
|
||||
|
||||
bool A_trans = false, B_trans = false;
|
||||
atidlas_int_t C_idx=0, alpha_idx=0, A_idx=0, B_idx=0, beta_idx=0;
|
||||
leaf_t C_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
|
||||
parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
|
||||
|
||||
mapped_matrix * C = (mapped_matrix*)mapping.at(mapping_key(C_idx, C_leaf)).get();
|
||||
mapped_host_scalar * alpha = (mapped_host_scalar*)mapping.at(mapping_key(alpha_idx, alpha_leaf)).get();
|
||||
mapped_matrix * A = (mapped_matrix*)mapping.at(mapping_key(A_idx, A_leaf)).get();
|
||||
mapped_matrix * B = (mapped_matrix*)mapping.at(mapping_key(B_idx, B_leaf)).get();
|
||||
mapped_host_scalar * beta = (mapped_host_scalar*)mapping.at(mapping_key(beta_idx, beta_leaf)).get();
|
||||
|
||||
//////////////////
|
||||
/// DECLARATIONS
|
||||
/// //////////////
|
||||
|
||||
stream << " __attribute__((reqd_work_group_size(" << p.local_size_0 << "," << p.local_size_1 << ",1)))" << std::endl;
|
||||
std::map<std::string, unsigned int> widths;
|
||||
widths[A->name()] = p.simd_width;
|
||||
widths[B->name()] = p.simd_width;
|
||||
generate_prototype(stream, kernel_prefix, "unsigned int M, unsigned int N, unsigned int K, ", mappings, statements, widths);
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
if(!fallback)
|
||||
{
|
||||
stream << A->process("#start1 /= " + to_string(p.simd_width) + ";") << std::endl;
|
||||
stream << A->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
|
||||
stream << B->process("#start1/= " + to_string(p.simd_width) + ";") << std::endl;
|
||||
stream << B->process("#ld /= " + to_string(p.simd_width) + ";") << std::endl;
|
||||
}
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("matrix", "#pointer += $OFFSET{#start1, #start2};")
|
||||
("matrix", "#ld *= #nldstride;"), statements, mappings);
|
||||
|
||||
///Result Values
|
||||
stream << C->process("#scalartype rC[" + to_string(p.mS) + "][" + to_string(p.nS) + "] = {{(#scalartype)0}};") << std::endl;
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
stream << A->process("#scalartype rA[" + to_string(p.kS) + "][" + to_string(p.mS) + "];") << std::endl;
|
||||
else
|
||||
stream << A->process(utils::append_width("#scalartype",p.simd_width) + " rA[" + to_string(p.kS) + "][" + to_string(p.mS/p.simd_width) + "];") << std::endl;
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
stream << B->process("#scalartype rB[" + to_string(p.kS) + "][" + to_string(p.nS) + "];");
|
||||
else
|
||||
stream << B->process(utils::append_width("#scalartype",p.simd_width) + " rB[" + to_string(p.kS) + "][" + to_string(p.nS/p.simd_width) + "];") << std::endl;
|
||||
|
||||
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
stream << A->process("__local #scalartype lA[" + to_string(p.kL*(p.mL+1)) + "];");
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
stream << B->process("__local #scalartype lB[" + to_string(p.kL*(p.nL+1)) + "];");
|
||||
stream << std::endl;
|
||||
|
||||
stream << "uint gidx = get_group_id(0);" << std::endl;
|
||||
stream << "uint gidy = get_group_id(1);" << std::endl;
|
||||
stream << "uint idx = get_local_id(0);" << std::endl;
|
||||
stream << "uint idy = get_local_id(1);" << std::endl;
|
||||
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
stream << std::endl;
|
||||
stream << "uint idt = " << p.local_size_0 << "*idy + idx;" << std::endl;
|
||||
stream << "uint idxT = idt % " << p.local_fetch_0 << ";" << std::endl;
|
||||
stream << "uint idyT = idt / " << p.local_fetch_0 << ";" << std::endl;
|
||||
}
|
||||
stream << std::endl;
|
||||
|
||||
if (fallback)
|
||||
{
|
||||
//Bounds checking for M (in A, C)
|
||||
stream << "bool in_bounds_m[" << p.mS << "];" << std::endl;
|
||||
stream << "for(unsigned int m = 0; m < " << p.mS << "; m++)" << std::endl;
|
||||
stream.inc_tab();
|
||||
switch (p.A_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx*" << p.mS << " + m < M;" << std::endl;
|
||||
break;
|
||||
default:
|
||||
stream << "in_bounds_m[m] = gidx*" << p.mL << " + idx + m*" << p.local_size_0 << " < M;" << std::endl;
|
||||
break;
|
||||
}
|
||||
stream.dec_tab();
|
||||
|
||||
//Bounds checking for A if Local
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
unsigned int fetch_size = (A_trans_=='N'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
|
||||
stream << "bool in_bounds_m_local[" << p.mL/fetch_size << "];" << std::endl;
|
||||
stream << "for(unsigned int m = 0; m < " << p.mL/fetch_size << "; m++)" << std::endl;
|
||||
stream.inc_tab();
|
||||
stream << "in_bounds_m_local[m] = gidx*" << p.mL << " + " << (A_trans_=='N'?"idxT":"idyT") << " + m*" << fetch_size << " < M;" << std::endl;
|
||||
stream.dec_tab();
|
||||
}
|
||||
|
||||
//Bounds checking for N (in B, C)
|
||||
stream << "bool in_bounds_n[" << p.nS << "];" << std::endl;
|
||||
stream << "for(unsigned int n = 0; n < " << p.nS << "; n++)" << std::endl;
|
||||
stream.inc_tab();
|
||||
switch (p.B_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy*" << p.nS << " + n < N;" << std::endl;
|
||||
break;
|
||||
default:
|
||||
stream << "in_bounds_n[n] = gidy*" << p.nL << " + idy + n*" << p.local_size_1 << " < N;" << std::endl;
|
||||
break;
|
||||
}
|
||||
stream.dec_tab();
|
||||
|
||||
//Bounds checking for B if Local
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
unsigned int fetch_size = (B_trans_=='T'?p.local_fetch_0*p.simd_width:p.local_fetch_1);
|
||||
stream << "bool in_bounds_n_local[" << p.nL/fetch_size << "];" << std::endl;
|
||||
stream << "for(unsigned int n = 0; n < " << p.nL/fetch_size << "; n++)" << std::endl;
|
||||
stream.inc_tab();
|
||||
stream << "in_bounds_n_local[n] = gidy*" << p.nL << " + " << (B_trans_=='T'?"idxT":"idyT") << " + n*" << fetch_size << " < N;" << std::endl;
|
||||
stream.dec_tab();
|
||||
}
|
||||
}
|
||||
|
||||
switch (p.A_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + " + idxT)" + MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
|
||||
else
|
||||
stream << A->process("#pointer += idxT" + MUL_STRIDE1 + " + gidx*" + to_string(p.mL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")" + MUL_STRIDE1 + ";") << std::endl;
|
||||
else
|
||||
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx*" + to_string(p.mS/p.simd_width) + ")*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_STRIDED:
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx" + ")" + MUL_STRIDE1 + ";") << std::endl;
|
||||
else
|
||||
stream << A->process("#pointer += (gidx*" + to_string(p.mL/p.simd_width) + "+ idx)*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
default: break;
|
||||
}
|
||||
|
||||
switch (p.B_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + " + idxT" + ")" + MUL_STRIDE1 + " + idyT*#ld;") << std::endl;
|
||||
else
|
||||
stream << B->process("#pointer += idxT" + MUL_STRIDE1 + " + gidy*" + to_string(p.nL/p.simd_width) + "*#ld + idyT*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")" + MUL_STRIDE1 + ";") << std::endl;
|
||||
else
|
||||
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy*" + to_string(p.nS/p.simd_width) + ")*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_STRIDED:
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy" + ")" + MUL_STRIDE1 + ";") << std::endl;
|
||||
else
|
||||
stream << B->process("#pointer += (gidy*" + to_string(p.nL/p.simd_width) + "+ idy)*#ld;") << std::endl;
|
||||
break;
|
||||
|
||||
default: break;
|
||||
}
|
||||
|
||||
stream << std::endl;
|
||||
stream << "for(unsigned int block_k=0; block_k < K; block_k+=" << p.kL << "){" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("__local #scalartype* plA = lA + idyT*" + to_string(p.mL + 1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
|
||||
else
|
||||
stream << A->process("__local #scalartype* plA = lA + idxT*" + to_string(p.mL + 1) + " + idyT;") << std::endl;
|
||||
}
|
||||
|
||||
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("__local #scalartype* plB = lB + idyT*" + to_string(p.nL+1) + " + " + to_string(p.simd_width) + "*idxT;") << std::endl;
|
||||
else
|
||||
stream << B->process("__local #scalartype* plB = lB + idxT*" + to_string(p.nL+1) + "+ idyT;") <<std::endl;
|
||||
}
|
||||
|
||||
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
|
||||
|
||||
///Fetch LHS to Local Memory
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='N')
|
||||
for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
|
||||
for (unsigned int m = 0; m < p.mL; m += p.local_fetch_0*p.simd_width)
|
||||
{
|
||||
string in_bounds = "in_bounds_m_local[" + to_string(m/(p.local_fetch_0*p.simd_width)) + "]";
|
||||
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + MUL_STRIDE1 + "]";
|
||||
stream << A->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(k*(p.mL+1)+m))) << ";" << std::endl;
|
||||
}
|
||||
else if (p.A_fetching_policy==FETCH_FROM_LOCAL && A_trans_=='T')
|
||||
for (unsigned int k = 0; k < p.mL; k += p.local_fetch_1)
|
||||
for (unsigned int m = 0; m < p.kL; m += p.local_fetch_0*p.simd_width)
|
||||
{
|
||||
string in_bounds = "in_bounds_m_local[" + to_string(k/p.local_fetch_1) + "]";
|
||||
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(m/p.simd_width) + MUL_STRIDE1 + "]";
|
||||
stream << A->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plA + " + to_string(m*(p.mL+1)+k))) << ";" << std::endl;
|
||||
}
|
||||
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='T')
|
||||
for (unsigned int k = 0; k < p.kL; k += p.local_fetch_1)
|
||||
for (unsigned int n = 0; n < p.nL; n += p.local_fetch_0*p.simd_width)
|
||||
{
|
||||
string in_bounds = "in_bounds_n_local[" + to_string(n/(p.local_fetch_0*p.simd_width)) + "]";
|
||||
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + MUL_STRIDE1 + "]";
|
||||
stream << B->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(k*(p.nL+1)+n))) << ";" << std::endl;
|
||||
}
|
||||
else if (p.B_fetching_policy==FETCH_FROM_LOCAL && B_trans_=='N')
|
||||
for (unsigned int k = 0; k < p.nL; k += p.local_fetch_1)
|
||||
for (unsigned int n = 0; n < p.kL; n += p.local_fetch_0*p.simd_width)
|
||||
{
|
||||
string in_bounds = "in_bounds_n_local[" + to_string(k/p.local_fetch_1) + "]";
|
||||
string to_load = "#pointer[" + to_string(k) + "*#ld + " + to_string(n/p.simd_width) + MUL_STRIDE1 + "]";
|
||||
stream << B->process(VSTORE(HANDLE_BOUNDS(in_bounds, to_load), "0", "plB + " + to_string(n*(p.nL+1)+k))) << ";" << std::endl;
|
||||
}
|
||||
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.B_fetching_policy == FETCH_FROM_LOCAL)
|
||||
{
|
||||
stream << "barrier(CLK_LOCAL_MEM_FENCE);" << std::endl;
|
||||
stream << "uint offA = " << p.simd_width << "*idx;" << std::endl;
|
||||
stream << "uint offB = " << p.simd_width << "*idy;" << std::endl;
|
||||
}
|
||||
|
||||
if (fallback)
|
||||
stream << "for(unsigned int k = 0; k < " << p.kL << " && (block_k + k < K); k+=" << p.kS << "){" << std::endl;
|
||||
else
|
||||
stream << "for(unsigned int k = 0; k < " << p.kL << "; k+=" << p.kS << "){" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
///Fetch LHS to registers
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int mm = 0; mm < " << p.mS/p.simd_width << "; mm++)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
switch (p.A_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
for (unsigned int ss = 0; ss < p.simd_width; ++ss)
|
||||
stream << "rA[kk][mm*" << p.simd_width << "+" << ss << "] = lA[offA + mm*" << p.local_size_0*p.simd_width << "+" << ss << "+ kk*" << (p.mL+1) << "];" << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
{
|
||||
if (A_trans_=='N')
|
||||
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
else
|
||||
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
case FETCH_FROM_GLOBAL_STRIDED:
|
||||
{
|
||||
if (A_trans_=='N')
|
||||
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[kk*#ld + mm*" + to_string(p.local_size_0) + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
else
|
||||
stream << "rA[kk][mm] = " << A->process(HANDLE_BOUNDS("in_bounds_m[mm]", "#pointer[mm*#ld*" + to_string(p.local_size_0) + " + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
break;
|
||||
|
||||
}
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int kk = 0; kk < " << p.kS << "; kk++)" << std::endl;
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int nn = 0; nn < " << p.nS/p.simd_width << "; nn++)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
switch (p.B_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
for (unsigned int ss = 0; ss < p.simd_width; ++ss)
|
||||
stream << "rB[kk][nn*" << p.simd_width << "+" << ss << "] = lB[offB + nn*" << p.local_size_1*p.simd_width << "+" << ss << "+ kk*" << (p.nL+1) << "];" << std::endl;
|
||||
break;
|
||||
|
||||
case FETCH_FROM_GLOBAL_CONTIGUOUS:
|
||||
{
|
||||
if (B_trans_=='T')
|
||||
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
else
|
||||
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
case FETCH_FROM_GLOBAL_STRIDED:
|
||||
{
|
||||
if (B_trans_=='T')
|
||||
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[kk*#ld + nn*" + to_string(p.local_size_1) + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
else
|
||||
stream << "rB[kk][nn] = " << B->process(HANDLE_BOUNDS("in_bounds_n[nn]", "#pointer[nn*#ld*" + to_string(p.local_size_1) + " + kk" + MUL_STRIDE1 + "]")) << ";" << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
default: break;
|
||||
}
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
///Increment pointers
|
||||
switch (p.A_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
stream << "offA += " << p.kS*(p.mL+1) << ";" << std::endl;
|
||||
break;
|
||||
|
||||
default:
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
|
||||
else
|
||||
stream << A->process("#pointer += " + to_string(p.kS) + "" + MUL_STRIDE1 + ";") << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
switch (p.B_fetching_policy)
|
||||
{
|
||||
case FETCH_FROM_LOCAL:
|
||||
stream << "offB += " << p.kS*(p.nL+1) << ";" << std::endl;
|
||||
break;
|
||||
|
||||
default:
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("#pointer += " + to_string(p.kS) + "*#ld;") << std::endl;
|
||||
else
|
||||
stream << B->process("#pointer += " + to_string(p.kS) + "" + MUL_STRIDE1 + ";") << std::endl;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int kk = 0; kk <" << p.kS << "; ++kk)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
for (unsigned int nn=0; nn < p.nS; ++nn)
|
||||
for (unsigned int mm=0; mm < p.mS; ++mm)
|
||||
{
|
||||
string res_str, lhs_str, rhs_str;
|
||||
res_str = "rC[" + tools::to_string(mm) + "][" + tools::to_string(nn) + "]";
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
|
||||
lhs_str = "rA[kk][" + tools::to_string(mm) + "]";
|
||||
else
|
||||
lhs_str = "rA[kk][" + tools::to_string(mm/p.simd_width) + "].s" + tools::to_string(mm%p.simd_width);
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL || p.simd_width==1)
|
||||
rhs_str = "rB[kk]["+tools::to_string(nn)+"]";
|
||||
else
|
||||
rhs_str = "rB[kk]["+tools::to_string(nn/p.simd_width)+"].s"+tools::to_string(nn%p.simd_width);
|
||||
stream << res_str << "=" << "fma(" << lhs_str << "," << rhs_str << "," << res_str << ");" << std::endl;
|
||||
}
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
//Increment global pointer if local memory is used
|
||||
//Else, it's incremented directly when fetching
|
||||
if (p.A_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
if (A_trans_=='N')
|
||||
stream << A->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
|
||||
else
|
||||
stream << A->process("#pointer += " + to_string(p.kL) + "" + MUL_STRIDE1 + ";") << std::endl;
|
||||
}
|
||||
|
||||
if (p.B_fetching_policy==FETCH_FROM_LOCAL)
|
||||
{
|
||||
if (B_trans_=='T')
|
||||
stream << B->process("#pointer += " + to_string(p.kL) + "*#ld;") << std::endl;
|
||||
else
|
||||
stream << B->process("#pointer += " + to_string(p.kL) + "" + MUL_STRIDE1 + ";") << std::endl;
|
||||
}
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
if (C->row_major())
|
||||
{
|
||||
unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
|
||||
unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
|
||||
|
||||
stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#ld;") << std::endl;
|
||||
stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#ld;") << std::endl;
|
||||
stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#stride2;") << std::endl;
|
||||
stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#stride2;") << std::endl;
|
||||
|
||||
for (unsigned int n=0; n < p.nS; ++n)
|
||||
{
|
||||
for (unsigned int m=0; m < p.mS; ++m)
|
||||
{
|
||||
unsigned int ministride1 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_0;
|
||||
string Cj = to_string((m/p.simd_width)*(ministride1*p.simd_width) + m%p.simd_width);
|
||||
if (fallback)
|
||||
{
|
||||
stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
|
||||
stream.inc_tab();
|
||||
}
|
||||
stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + "+ #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
|
||||
if (fallback)
|
||||
stream.dec_tab();
|
||||
}
|
||||
if ((n+1)%p.simd_width>0 || p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
|
||||
stream << C->process("#pointer += #stride2;") << std::endl;
|
||||
else
|
||||
stream << C->process("#pointer += " + to_string((p.local_size_1*p.simd_width) - (p.simd_width-1)) + "*#stride2;") << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
unsigned int ministartstride0 = p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.mS:p.simd_width;
|
||||
unsigned int ministartstride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?p.nS:p.simd_width;
|
||||
|
||||
stream << C->process("#pointer += gidx*" + to_string(p.mL) + "*#stride1;") << std::endl;
|
||||
stream << C->process("#pointer += idx*" + to_string(ministartstride0) + "*#stride1;") << std::endl;
|
||||
stream << C->process("#pointer += gidy*" + to_string(p.nL) + "*#ld;") << std::endl;
|
||||
stream << C->process("#pointer += idy*" + to_string(ministartstride1) + "*#ld;") << std::endl;
|
||||
|
||||
for (unsigned int m=0; m < p.mS; ++m)
|
||||
{
|
||||
for (unsigned int n=0; n < p.nS; ++n)
|
||||
{
|
||||
unsigned int ministride1 = p.B_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS?1:p.local_size_1;
|
||||
string Cj = to_string((n/p.simd_width)*(ministride1*p.simd_width) + n%p.simd_width);
|
||||
if (fallback)
|
||||
{
|
||||
stream << "if (in_bounds_m[" + to_string(m) + "] && in_bounds_n[" + to_string(n) + "])" << std::endl;
|
||||
stream.inc_tab();
|
||||
}
|
||||
stream << C->process("#pointer[" + Cj + "*#ld] = rC[" + to_string(m) + "][" + to_string(n) + "]*" + alpha->name() + " + #pointer[" + Cj + "*#ld]*" + beta->name() + ";") << std::endl;
|
||||
if (fallback)
|
||||
stream.dec_tab();
|
||||
}
|
||||
|
||||
if ((m+1)%p.simd_width>0 || p.A_fetching_policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
|
||||
stream << C->process("#pointer += #stride1;") << std::endl;
|
||||
else
|
||||
stream << C->process("#pointer += " + to_string((p.local_size_0*p.simd_width) - (p.simd_width-1)) + "*#stride1;") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
return stream.str();
|
||||
|
||||
#undef MUL_STRIDE1
|
||||
#undef HANDLE_BOUNDS
|
||||
#undef VLOAD
|
||||
#undef VST0RE
|
||||
}
|
||||
|
||||
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
|
||||
{
|
||||
std::vector<std::string> res;
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, false));
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, true));
|
||||
return res;
|
||||
}
|
||||
|
||||
template<class NumericT>
|
||||
void enqueue_block(scheduler::statement & statement, atidlas_int_t M, atidlas_int_t N, atidlas_int_t K,
|
||||
scheduler::lhs_rhs_element& eA, scheduler::lhs_rhs_element& eB, scheduler::lhs_rhs_element& eC, scheduler::lhs_rhs_element& ebeta,
|
||||
matrix_base<NumericT> const & A, matrix_base<NumericT> const & B, matrix_base<NumericT> const & C, NumericT beta,
|
||||
std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix, int id)
|
||||
{
|
||||
if (A.size1()==0 || A.size2()==0 || B.size1()==0 || B.size2()==0 || C.size1()==0 || C.size2()==0)
|
||||
return;
|
||||
|
||||
viennacl::ocl::kernel& kernel = programs[id].program().get_kernel(kernel_prefix);
|
||||
|
||||
kernel.local_work_size(0, p_.local_size_0);
|
||||
kernel.local_work_size(1, p_.local_size_1);
|
||||
|
||||
scheduler::statement::assign_element(eA, A);
|
||||
scheduler::statement::assign_element(eB, B);
|
||||
scheduler::statement::assign_element(eC, C);
|
||||
scheduler::statement::assign_element(ebeta, beta);
|
||||
|
||||
if (id==1)
|
||||
{
|
||||
kernel.global_work_size(0, tools::align_to_multiple(tools::align_to_multiple((unsigned int)M,p_.mS)/p_.mS, p_.local_size_0));
|
||||
kernel.global_work_size(1, tools::align_to_multiple(tools::align_to_multiple((unsigned int)N,p_.nS)/p_.nS, p_.local_size_1));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel.global_work_size(0, M/p_.mS);
|
||||
kernel.global_work_size(1, N/p_.nS);
|
||||
}
|
||||
unsigned int current_arg = 0;
|
||||
kernel.arg(current_arg++, cl_uint(M));
|
||||
kernel.arg(current_arg++, cl_uint(N));
|
||||
kernel.arg(current_arg++, cl_uint(K));
|
||||
set_arguments(statement, kernel, current_arg);
|
||||
viennacl::ocl::enqueue(kernel);
|
||||
|
||||
}
|
||||
|
||||
template<class NumericT>
|
||||
matrix_slice< viennacl::matrix_base<NumericT> > create_slice(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr, scheduler::lhs_rhs_element const & element,
|
||||
atidlas_int_t s0_0, atidlas_int_t s0_1, atidlas_int_t s1_0, atidlas_int_t s1_1, bool swap)
|
||||
{
|
||||
matrix_base<NumericT> & M = *(element.*ptr);
|
||||
atidlas_int_t start1 = M.start1();
|
||||
atidlas_int_t start2 = M.start2();
|
||||
atidlas_int_t stride1 = M.stride1();
|
||||
atidlas_int_t stride2 = M.stride2();
|
||||
if (swap ^ M.row_major())
|
||||
{
|
||||
std::swap(start1, start2);
|
||||
std::swap(stride1, stride2);
|
||||
}
|
||||
slice s0(start1 + s0_0, stride1, s0_1 - s0_0);
|
||||
slice s1(start2 + s1_0, stride2, s1_1 - s1_0);
|
||||
if(swap)
|
||||
std::swap(s0, s1);
|
||||
return matrix_slice<viennacl::matrix_base<NumericT> >(M, s0, s1);
|
||||
}
|
||||
|
||||
template<class NumericT>
|
||||
void enqueue_impl(viennacl::matrix_base<NumericT>* scheduler::lhs_rhs_element::*ptr_matrix,
|
||||
scheduler::statement & statement, scheduler::lhs_rhs_element & A, scheduler::lhs_rhs_element & B, scheduler::lhs_rhs_element & C, scheduler::lhs_rhs_element & beta,
|
||||
NumericT beta_value, std::vector<lazy_program_compiler> & programs, std::string const & kernel_prefix)
|
||||
{
|
||||
using namespace device_specific::utils;
|
||||
atidlas_int_t ldstrideA = call_on_matrix(A, leading_stride_fun());
|
||||
atidlas_int_t ldstrideB = call_on_matrix(B, leading_stride_fun());
|
||||
atidlas_int_t ldstrideC = call_on_matrix(C, leading_stride_fun());
|
||||
atidlas_int_t ldstartA = call_on_matrix(A, leading_start_fun());
|
||||
atidlas_int_t ldstartB = call_on_matrix(B, leading_start_fun());
|
||||
bool swap_A = (A_trans_=='T');
|
||||
bool swap_B = (B_trans_=='T');
|
||||
|
||||
atidlas_int_t M = call_on_matrix(C, size1_fun());
|
||||
atidlas_int_t N = call_on_matrix(C, size2_fun());
|
||||
atidlas_int_t K = call_on_matrix(A, size2_fun());
|
||||
if (utils::call_on_matrix(A, row_major_fun()))
|
||||
K = A_trans_=='T'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
|
||||
else
|
||||
K = A_trans_=='N'?call_on_matrix(A, size2_fun()):call_on_matrix(A, size1_fun());
|
||||
|
||||
if (M < p_.mL || N < p_.nL || K < p_.kL || ldstrideA> 1 || ldstrideB > 1 || ldstrideC > 1 ||
|
||||
(p_.simd_width>1 && (ldstartA % p_.simd_width > 0 || ldstartB % p_.simd_width > 0)))
|
||||
{
|
||||
enqueue_block(statement, M, N, K, A, B, C, beta, create_slice(ptr_matrix, A, 0, M, 0, K, swap_A),
|
||||
create_slice(ptr_matrix, B, 0, K, 0, N, swap_B),
|
||||
create_slice(ptr_matrix, C, 0, M, 0, N, false), beta_value, programs, kernel_prefix, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
scheduler::lhs_rhs_element Acopy = A;
|
||||
scheduler::lhs_rhs_element Bcopy = B;
|
||||
scheduler::lhs_rhs_element Ccopy = C;
|
||||
|
||||
atidlas_int_t lM = M / p_.mL * p_.mL;
|
||||
atidlas_int_t lN = N / p_.nL * p_.nL;
|
||||
atidlas_int_t lK = K / p_.kL * p_.kL;
|
||||
|
||||
|
||||
enqueue_block(statement, lM, lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), beta_value, programs, kernel_prefix, 0);
|
||||
enqueue_block(statement, lM, lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
|
||||
|
||||
enqueue_block(statement, lM, N - lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), beta_value, programs, kernel_prefix, 1);
|
||||
enqueue_block(statement, lM, N - lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, 0, lM, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, 0, lM, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
|
||||
|
||||
enqueue_block(statement, M - lM, lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), beta_value, programs, kernel_prefix, 1);
|
||||
enqueue_block(statement, M - lM, lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, 0, lN, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, 0, lN, false), (NumericT)1, programs, kernel_prefix, 1);
|
||||
|
||||
enqueue_block(statement, M - lM, N - lN, lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, 0, lK, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, 0, lK, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), beta_value, programs, kernel_prefix, 1);
|
||||
enqueue_block(statement, M - lM, N - lN, K - lK, A, B, C, beta, create_slice<NumericT>(ptr_matrix, Acopy, lM, M, lK, K, swap_A), create_slice<NumericT>(ptr_matrix, Bcopy, lK, K, lN, N, swap_B), create_slice<NumericT>(ptr_matrix, Ccopy, lM, M, lN, N, false), (NumericT)1, programs, kernel_prefix, 1);
|
||||
}
|
||||
|
||||
public:
|
||||
matrix_product_template(matrix_product_template::parameters_type const & parameters, char A_trans, char B_trans) : template_base_impl<matrix_product_template, matrix_product_parameters>(parameters, BIND_ALL_UNIQUE), A_trans_(A_trans), B_trans_(B_trans){ }
|
||||
|
||||
virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
|
||||
{
|
||||
using namespace device_specific::utils;
|
||||
using namespace tree_parsing;
|
||||
|
||||
scheduler::statement const & st = statements.data().front();
|
||||
bool A_trans, B_trans;
|
||||
atidlas_int_t C_idx=0, A_idx=0, B_idx=0, alpha_idx=0, beta_idx = 0;
|
||||
leaf_t C_leaf=LHS_NODE_TYPE, A_leaf=LHS_NODE_TYPE, B_leaf=LHS_NODE_TYPE, alpha_leaf=LHS_NODE_TYPE, beta_leaf=LHS_NODE_TYPE;
|
||||
parse(st, C_idx, C_leaf, alpha_idx, alpha_leaf, A_idx, A_leaf, A_trans, B_idx, B_leaf, B_trans, beta_idx, beta_leaf);
|
||||
|
||||
scheduler::statement stcopy = st;
|
||||
scheduler::lhs_rhs_element& A = utils::lhs_rhs_element(stcopy, A_idx, A_leaf);
|
||||
scheduler::lhs_rhs_element& B = utils::lhs_rhs_element(stcopy, B_idx, B_leaf);
|
||||
scheduler::lhs_rhs_element& C = utils::lhs_rhs_element(stcopy, C_idx, C_leaf);
|
||||
scheduler::lhs_rhs_element& beta = utils::lhs_rhs_element(stcopy, beta_idx, beta_leaf);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if (C.numeric_type==scheduler::FLOAT_TYPE)
|
||||
enqueue_impl<float>(&scheduler::lhs_rhs_element::matrix_float, stcopy, A, B, C, beta, beta.host_float, programs, kernel_prefix);
|
||||
else if (C.numeric_type==scheduler::DOUBLE_TYPE)
|
||||
enqueue_impl<double>(&scheduler::lhs_rhs_element::matrix_double, stcopy, A, B, C, beta, beta.host_double, programs, kernel_prefix);
|
||||
else
|
||||
throw generator_not_supported_exception("GEMM only supported for float/double");
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
const char A_trans_;
|
||||
const char B_trans_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
351
atidlas/templates/reduction_template.hpp
Normal file
351
atidlas/templates/reduction_template.hpp
Normal file
@@ -0,0 +1,351 @@
|
||||
#ifndef ATIDLAS_REDUCTION_TEMPLATE_HPP
|
||||
#define ATIDLAS_REDUCTION_TEMPLATE_HPP
|
||||
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "viennacl/backend/opencl.hpp"
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
#include "atidlas/templates/utils.hpp"
|
||||
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
struct reduction_parameters : public template_base::parameters_type
|
||||
{
|
||||
reduction_parameters(unsigned int _simd_width,
|
||||
unsigned int _group_size, unsigned int _num_groups,
|
||||
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _group_size, 1, 2), num_groups(_num_groups), fetching_policy(_fetching_policy){ }
|
||||
|
||||
unsigned int num_groups;
|
||||
fetching_policy_type fetching_policy;
|
||||
};
|
||||
|
||||
class reduction_template : public template_base_impl<reduction_template, reduction_parameters>
|
||||
{
|
||||
|
||||
private:
|
||||
unsigned int n_lmem_elements() const
|
||||
{
|
||||
return p_.local_size_0;
|
||||
}
|
||||
|
||||
int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
|
||||
{
|
||||
if (p_.fetching_policy==FETCH_FROM_LOCAL)
|
||||
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
inline void reduce_1d_local_memory(utils::kernel_generation_stream & stream, unsigned int size, std::vector<mapped_scalar_reduction*> exprs,
|
||||
std::string const & buf_str, std::string const & buf_value_str) const
|
||||
{
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int stride = " << size/2 << "; stride >0; stride /=2)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
|
||||
stream << "if (lid < stride)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
for (unsigned int k = 0; k < exprs.size(); k++)
|
||||
if (exprs[k]->is_index_reduction())
|
||||
compute_index_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]")
|
||||
, exprs[k]->process(buf_value_str+"[lid]"), exprs[k]->process(buf_value_str+"[lid+stride]"),
|
||||
exprs[k]->root_op());
|
||||
else
|
||||
compute_reduction(stream, exprs[k]->process(buf_str+"[lid]"), exprs[k]->process(buf_str+"[lid+stride]"), exprs[k]->root_op());
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
}
|
||||
|
||||
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width) const
|
||||
{
|
||||
utils::kernel_generation_stream stream;
|
||||
|
||||
std::vector<mapped_scalar_reduction*> exprs;
|
||||
for (std::vector<mapping_type>::const_iterator it = mappings.begin(); it != mappings.end(); ++it)
|
||||
for (mapping_type::const_iterator iit = it->begin(); iit != it->end(); ++iit)
|
||||
if (mapped_scalar_reduction * p = dynamic_cast<mapped_scalar_reduction*>(iit->second.get()))
|
||||
exprs.push_back(p);
|
||||
std::size_t N = exprs.size();
|
||||
|
||||
std::string arguments = generate_value_kernel_argument("unsigned int", "N");
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
std::string numeric_type = utils::numeric_type_to_string(lhs_most(exprs[k]->statement().array(),
|
||||
exprs[k]->statement().root()).lhs.numeric_type);
|
||||
if (exprs[k]->is_index_reduction())
|
||||
{
|
||||
arguments += generate_pointer_kernel_argument("__global", "unsigned int", exprs[k]->process("#name_temp"));
|
||||
arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp_value"));
|
||||
}
|
||||
else
|
||||
arguments += generate_pointer_kernel_argument("__global", numeric_type, exprs[k]->process("#name_temp"));
|
||||
}
|
||||
|
||||
|
||||
/* ------------------------
|
||||
* First Kernel
|
||||
* -----------------------*/
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
|
||||
stream << "__kernel void " << kernel_prefix << "_0" << "(" << arguments << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
stream << "unsigned int lid = get_local_id(0);" << std::endl;
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
|
||||
("vector", "#pointer += #start;"), statements, mappings);
|
||||
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
if (exprs[k]->is_index_reduction())
|
||||
{
|
||||
stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
|
||||
stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
|
||||
}
|
||||
else
|
||||
{
|
||||
stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";") << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
class loop_body : public loop_body_base
|
||||
{
|
||||
public:
|
||||
loop_body(std::vector<mapped_scalar_reduction*> const & _exprs) : exprs(_exprs){ }
|
||||
|
||||
void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const
|
||||
{
|
||||
std::string i = (simd_width==1)?"i*#stride":"i";
|
||||
//Fetch vector entry
|
||||
for (std::vector<mapped_scalar_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
|
||||
(*it)->process_recursive(stream, PARENT_NODE_TYPE, utils::create_process_accessors("vector", utils::append_width("#scalartype",simd_width) + " #namereg = " + vload(simd_width,i,"#pointer")+";")
|
||||
("matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];")
|
||||
("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];")
|
||||
("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];"));
|
||||
|
||||
|
||||
//Update accumulators
|
||||
std::vector<std::string> str(simd_width);
|
||||
if (simd_width==1)
|
||||
str[0] = "#namereg";
|
||||
else
|
||||
for (unsigned int a = 0; a < simd_width; ++a)
|
||||
str[a] = "#namereg.s" + tools::to_string(a);
|
||||
|
||||
for (unsigned int k = 0; k < exprs.size(); ++k)
|
||||
{
|
||||
for (unsigned int a = 0; a < simd_width; ++a)
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["vector"] = str[a];
|
||||
accessors["matrix_row"] = str[a];
|
||||
accessors["matrix_column"] = str[a];
|
||||
accessors["matrix_diag"] = str[a];
|
||||
accessors["scalar"] = "#namereg";
|
||||
std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
|
||||
if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_INNER_PROD_TYPE)
|
||||
value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
|
||||
|
||||
if (exprs[k]->is_index_reduction())
|
||||
compute_index_reduction(stream, exprs[k]->process("#name_acc"), "i*"+tools::to_string(simd_width) + "+" + tools::to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
|
||||
else
|
||||
compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector<mapped_scalar_reduction*> exprs;
|
||||
};
|
||||
|
||||
element_wise_loop_1D(stream, loop_body(exprs), p_.fetching_policy, simd_width, "i", "N", "get_global_id(0)", "get_global_size(0)");
|
||||
|
||||
//Fills local memory
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
if (exprs[k]->is_index_reduction())
|
||||
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
|
||||
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
|
||||
}
|
||||
|
||||
//Reduce local memory
|
||||
reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
|
||||
|
||||
//Write to temporary buffers
|
||||
stream << "if (lid==0)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
if (exprs[k]->is_index_reduction())
|
||||
stream << exprs[k]->process("#name_temp_value[get_group_id(0)] = #name_buf_value[0];") << std::endl;
|
||||
stream << exprs[k]->process("#name_temp[get_group_id(0)] = #name_buf[0];") << std::endl;
|
||||
}
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
/* ------------------------
|
||||
* Second kernel
|
||||
* -----------------------*/
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
|
||||
stream << "__kernel void " << kernel_prefix << "_1" << "(" << arguments << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
stream << "unsigned int lid = get_local_id(0);" << std::endl;
|
||||
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
if (exprs[k]->is_index_reduction())
|
||||
{
|
||||
stream << exprs[k]->process("__local unsigned int #name_buf[" + tools::to_string(p_.local_size_0) + "];");
|
||||
stream << exprs[k]->process("unsigned int #name_acc = 0;") << std::endl;
|
||||
stream << exprs[k]->process("__local #scalartype #name_buf_value[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc_value = " + neutral_element(exprs[k]->root_op()) + ";");
|
||||
}
|
||||
else
|
||||
{
|
||||
stream << exprs[k]->process("__local #scalartype #name_buf[" + tools::to_string(p_.local_size_0) + "];") << std::endl;
|
||||
stream << exprs[k]->process("#scalartype #name_acc = " + neutral_element(exprs[k]->root_op()) + ";");
|
||||
}
|
||||
}
|
||||
|
||||
stream << "for(unsigned int i = lid; i < " << p_.num_groups << "; i += get_local_size(0))" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
if (exprs[k]->is_index_reduction())
|
||||
compute_index_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"),
|
||||
exprs[k]->process("#name_acc_value"),exprs[k]->process("#name_temp_value[i]"),exprs[k]->root_op());
|
||||
else
|
||||
compute_reduction(stream, exprs[k]->process("#name_acc"), exprs[k]->process("#name_temp[i]"), exprs[k]->root_op());
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
for (unsigned int k = 0; k < N; ++k)
|
||||
{
|
||||
if (exprs[k]->is_index_reduction())
|
||||
stream << exprs[k]->process("#name_buf_value[lid] = #name_acc_value;") << std::endl;
|
||||
stream << exprs[k]->process("#name_buf[lid] = #name_acc;") << std::endl;
|
||||
}
|
||||
|
||||
|
||||
//Reduce and write final result
|
||||
reduce_1d_local_memory(stream, p_.local_size_0, exprs, "#name_buf", "#name_buf_value");
|
||||
|
||||
stream << "if (lid==0)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["scalar_reduction"] = "#name_buf[0]";
|
||||
accessors["scalar"] = "*#pointer";
|
||||
accessors["vector"] = "#pointer[#start]";
|
||||
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
result.push_back(generate_impl(kernel_prefix + "_strided", statements, mappings, 1));
|
||||
result.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width));
|
||||
return result;
|
||||
}
|
||||
public:
|
||||
reduction_template(reduction_template::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<reduction_template, reduction_parameters>(parameters, binding_policy) { }
|
||||
|
||||
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
|
||||
{
|
||||
std::vector<scheduler::statement_node const *> reductions;
|
||||
cl_uint size = 0;
|
||||
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
|
||||
{
|
||||
std::vector<size_t> reductions_idx = tree_parsing::filter_nodes(&utils::is_reduction, *it, false);
|
||||
size = static_cast<cl_uint>(vector_size(lhs_most(it->array(), reductions_idx[0]), false));
|
||||
for (std::vector<size_t>::iterator itt = reductions_idx.begin(); itt != reductions_idx.end(); ++itt)
|
||||
reductions.push_back(&it->array()[*itt]);
|
||||
}
|
||||
|
||||
scheduler::statement const & statement = statements.data().front();
|
||||
unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
|
||||
|
||||
viennacl::ocl::kernel * kernels[2];
|
||||
if (has_strided_access(statements) && p_.simd_width > 1)
|
||||
{
|
||||
kernels[0] = &programs[0].program().get_kernel(kernel_prefix+"_strided_0");
|
||||
kernels[1] = &programs[0].program().get_kernel(kernel_prefix+"_strided_1");
|
||||
}
|
||||
else
|
||||
{
|
||||
kernels[0] = &programs[1].program().get_kernel(kernel_prefix+"_0");
|
||||
kernels[1] = &programs[1].program().get_kernel(kernel_prefix+"_1");
|
||||
}
|
||||
|
||||
kernels[0]->local_work_size(0, p_.local_size_0);
|
||||
kernels[0]->global_work_size(0,p_.local_size_0*p_.num_groups);
|
||||
|
||||
kernels[1]->local_work_size(0, p_.local_size_0);
|
||||
kernels[1]->global_work_size(0,p_.local_size_0);
|
||||
|
||||
for (unsigned int k = 0; k < 2; k++)
|
||||
{
|
||||
unsigned int n_arg = 0;
|
||||
kernels[k]->arg(n_arg++, size);
|
||||
unsigned int i = 0;
|
||||
unsigned int j = 0;
|
||||
for (std::vector<scheduler::statement_node const *>::const_iterator it = reductions.begin(); it != reductions.end(); ++it)
|
||||
{
|
||||
if (utils::is_index_reduction((*it)->op))
|
||||
{
|
||||
if (tmpidx_.size() <= j)
|
||||
tmpidx_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*4));
|
||||
kernels[k]->arg(n_arg++, tmpidx_[j]);
|
||||
j++;
|
||||
}
|
||||
|
||||
if (tmp_.size() <= i)
|
||||
tmp_.push_back(kernels[k]->context().create_memory(CL_MEM_READ_WRITE, p_.num_groups*scalartype_size));
|
||||
kernels[k]->arg(n_arg++, tmp_[i]);
|
||||
i++;
|
||||
}
|
||||
set_arguments(statements, *kernels[k], n_arg);
|
||||
}
|
||||
|
||||
for (unsigned int k = 0; k < 2; k++)
|
||||
viennacl::ocl::enqueue(*kernels[k]);
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
std::vector< viennacl::ocl::handle<cl_mem> > tmp_;
|
||||
std::vector< viennacl::ocl::handle<cl_mem> > tmpidx_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
281
atidlas/templates/row_wise_reduction_template.hpp
Normal file
281
atidlas/templates/row_wise_reduction_template.hpp
Normal file
@@ -0,0 +1,281 @@
|
||||
#ifndef VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
|
||||
#define VIENNACL_DEVICE_SPECIFIC_TEMPLATES_ROW_WISE_REDUCTION_HPP
|
||||
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/mapped_objects.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
#include "atidlas/templates/utils.hpp"
|
||||
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
|
||||
#include "viennacl/scheduler/io.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
struct row_wise_reduction_parameters : public template_base::parameters_type
|
||||
{
|
||||
row_wise_reduction_parameters(unsigned int _simd_width,
|
||||
unsigned int _local_size_0, unsigned int _local_size_1,
|
||||
unsigned int _num_groups_0, fetching_policy_type _fetch_policy): template_base::parameters_type(_simd_width, _local_size_0, _local_size_1, 1),
|
||||
num_groups_0(_num_groups_0), fetch_policy(_fetch_policy) { }
|
||||
|
||||
unsigned int num_groups_0;
|
||||
fetching_policy_type fetch_policy;
|
||||
};
|
||||
|
||||
class row_wise_reduction_template : public template_base_impl<row_wise_reduction_template, row_wise_reduction_parameters>
|
||||
{
|
||||
private:
|
||||
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
|
||||
{
|
||||
if (p_.fetch_policy==FETCH_FROM_LOCAL)
|
||||
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
unsigned int n_lmem_elements() const
|
||||
{
|
||||
return p_.local_size_0*(p_.local_size_1+1);
|
||||
}
|
||||
|
||||
static void parse(scheduler::statement const & statement, std::vector<size_t> & idx, bool & is_trans, scheduler::lhs_rhs_element & matrix)
|
||||
{
|
||||
idx = tree_parsing::filter_nodes(&utils::is_reduction, statement, false);
|
||||
is_trans = is_node_trans(statement.array(), idx[0], LHS_NODE_TYPE);
|
||||
matrix = lhs_most(statement.array(), idx[0]).lhs;
|
||||
}
|
||||
|
||||
std::string generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings, unsigned int simd_width, bool is_trans, std::vector<mapped_row_wise_reduction*> const & exprs) const
|
||||
{
|
||||
using tools::to_string;
|
||||
|
||||
unsigned int lsize0 = p_.local_size_0;
|
||||
unsigned int lsize1 = p_.local_size_1+1;
|
||||
std::string lsize1str = to_string(lsize1);
|
||||
|
||||
utils::kernel_generation_stream stream;
|
||||
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << "," << p_.local_size_1 << ",1)))" << std::endl;
|
||||
stream << "__kernel void " << kernel_prefix << "(unsigned int M, unsigned int N, " << generate_arguments("#scalartype", mappings, statements) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE,
|
||||
utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
|
||||
("matrix", "#pointer += #start1 + #start2*#ld;")
|
||||
("matrix", "#ld *= #nldstride;")
|
||||
("vector", "#pointer += #start;"), statements, mappings);
|
||||
|
||||
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
|
||||
stream << (*it)->process("__local #scalartype #name_buf[" + to_string(lsize0*lsize1) + "];") << std::endl;
|
||||
|
||||
stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
|
||||
stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
|
||||
stream << "unsigned int upper_bound_0 = ( M +" << p_.local_size_0 - 1 << ")/" << p_.local_size_0 << "*" << p_.local_size_0 << ";" << std::endl;
|
||||
stream << "for(unsigned int r = get_global_id(0); r < upper_bound_0; r += get_global_size(0)){" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
|
||||
stream << (*it)->process("#scalartype #name_acc = " + neutral_element((*it)->root_op()) + ";") << std::endl;
|
||||
|
||||
stream << "if (r < M)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
class loop_body : public loop_body_base
|
||||
{
|
||||
public:
|
||||
loop_body(std::vector<mapped_row_wise_reduction*> const & _exprs, bool _is_trans) : exprs(_exprs), is_trans(_is_trans){ }
|
||||
|
||||
void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const
|
||||
{
|
||||
std::string data_type = utils::append_width("#scalartype",simd_width);
|
||||
|
||||
for (std::vector<mapped_row_wise_reduction*>::const_iterator it = exprs.begin(); it != exprs.end(); ++it)
|
||||
{
|
||||
std::multimap<std::string, std::string> accessors;
|
||||
if (is_trans)
|
||||
accessors.insert(std::make_pair("matrix_trans", data_type + " #namereg = " + vload(simd_width, "c*#stride1", "#pointer + r*#ld")+";"));
|
||||
else
|
||||
accessors.insert(std::make_pair("matrix","#scalartype #namereg = #pointer[r*#stride1 + c*#ld];"));
|
||||
accessors.insert(std::make_pair("vector", data_type + " #namereg = " + vload(simd_width, "c*#stride", "#pointer")+";"));
|
||||
(*it)->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
||||
}
|
||||
|
||||
|
||||
//Update accumulators
|
||||
std::vector<std::string> str(simd_width);
|
||||
if (simd_width==1)
|
||||
str[0] = "#namereg";
|
||||
else
|
||||
for (unsigned int a = 0; a < simd_width; ++a)
|
||||
str[a] = "#namereg.s" + to_string(a);
|
||||
|
||||
|
||||
for (unsigned int k = 0; k < exprs.size(); ++k)
|
||||
{
|
||||
for (unsigned int a = 0; a < simd_width; ++a)
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
if (is_trans)
|
||||
accessors["matrix_trans"] = str[a];
|
||||
else
|
||||
accessors["matrix"] = str[a];
|
||||
accessors["vector"] = str[a];
|
||||
accessors["scalar"] = "#namereg";
|
||||
std::string value = exprs[k]->evaluate_recursive(LHS_NODE_TYPE, accessors);
|
||||
if (exprs[k]->root_node().op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE)
|
||||
value+= "*" + exprs[k]->evaluate_recursive(RHS_NODE_TYPE, accessors);
|
||||
|
||||
if (exprs[k]->is_index_reduction())
|
||||
compute_index_reduction(stream, exprs[k]->process("#name_acc"), "c*"+to_string(simd_width) + to_string(a), exprs[k]->process("#name_acc_value"), value,exprs[k]->root_op());
|
||||
else
|
||||
compute_reduction(stream, exprs[k]->process("#name_acc"), value,exprs[k]->root_op());
|
||||
}
|
||||
}
|
||||
}
|
||||
private:
|
||||
std::vector<mapped_row_wise_reduction*> exprs;
|
||||
bool is_trans;
|
||||
};
|
||||
|
||||
element_wise_loop_1D(stream, loop_body(exprs, is_trans), p_.fetch_policy, simd_width, "c", "N", "get_local_id(1)", "get_local_size(1)");
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
for (unsigned int k = 0; k < exprs.size(); ++k)
|
||||
stream << exprs[k]->process("#name_buf[lid0*" + lsize1str + "+ lid1] = #name_acc;") << std::endl;
|
||||
|
||||
stream << "#pragma unroll" << std::endl;
|
||||
stream << "for(unsigned int stride = " << p_.local_size_1/2 << "; stride >0; stride /=2)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
|
||||
stream << "if (lid1 < stride)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
for (unsigned int k = 0; k < exprs.size(); k++)
|
||||
if (exprs[k]->is_index_reduction())
|
||||
compute_index_reduction(stream, exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]")
|
||||
, exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf_value[lid0*" + lsize1str + " + lid1 + stride]"),
|
||||
exprs[k]->root_op());
|
||||
else
|
||||
compute_reduction(stream,exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1]"), exprs[k]->process("#name_buf[lid0*" + lsize1str + " + lid1 + stride]"), exprs[k]->root_op());
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
stream << "if (lid1 == 0 && r < M)";
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
std::map<std::string, std::string> accessors;
|
||||
accessors["row_wise_reduction"] = "#name_buf[lid0*" + lsize1str + "]";
|
||||
accessors["vector"] = "#pointer[r*#stride]";
|
||||
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, accessors, statements, mappings);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
return stream.str();
|
||||
}
|
||||
|
||||
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
|
||||
{
|
||||
std::vector<mapped_row_wise_reduction*> exprs;
|
||||
bool is_trans = false;
|
||||
bool row_major = false;
|
||||
statements_container::data_type::const_iterator sit;
|
||||
std::vector<mapping_type>::const_iterator mit;
|
||||
for (mit = mappings.begin(), sit = statements.data().begin(); mit != mappings.end(); ++mit, ++sit)
|
||||
{
|
||||
std::vector<size_t> idx;
|
||||
scheduler::lhs_rhs_element A;
|
||||
parse(*sit, idx, is_trans, A);
|
||||
row_major = utils::call_on_matrix(A, utils::row_major_fun());
|
||||
for (unsigned int j = 0; j < idx.size(); ++j)
|
||||
exprs.push_back((mapped_row_wise_reduction*)(mit->at(mapping_key(idx[j], PARENT_NODE_TYPE)).get()));
|
||||
}
|
||||
is_trans = is_trans ^ row_major;
|
||||
|
||||
std::vector<std::string> res;
|
||||
if (is_trans && p_.simd_width>1)
|
||||
{
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, p_.simd_width, is_trans, exprs));
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
|
||||
}
|
||||
else
|
||||
res.push_back(generate_impl(kernel_prefix, statements, mappings, 1, is_trans, exprs));
|
||||
|
||||
return res;
|
||||
}
|
||||
public:
|
||||
row_wise_reduction_template(row_wise_reduction_template::parameters_type const & parameters, char A_trans, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<row_wise_reduction_template, row_wise_reduction_parameters>(parameters, binding_policy), A_trans_(A_trans){ }
|
||||
|
||||
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
|
||||
{
|
||||
std::vector<size_t> idx;
|
||||
scheduler::lhs_rhs_element A;
|
||||
bool is_trans;
|
||||
parse(statements.data().front(), idx, is_trans, A);
|
||||
bool row_major = utils::call_on_matrix(A, utils::row_major_fun());
|
||||
|
||||
viennacl::ocl::kernel * kernel;
|
||||
if((is_trans ^ row_major)&& p_.simd_width>1)
|
||||
{
|
||||
if (has_strided_access(statements))
|
||||
kernel = &programs[1].program().get_kernel(kernel_prefix);
|
||||
else
|
||||
kernel = &programs[0].program().get_kernel(kernel_prefix);
|
||||
}
|
||||
else
|
||||
kernel = &programs[0].program().get_kernel(kernel_prefix);
|
||||
|
||||
kernel->local_work_size(0,p_.local_size_0);
|
||||
kernel->local_work_size(1,p_.local_size_1);
|
||||
kernel->global_work_size(0,p_.local_size_0*p_.num_groups_0);
|
||||
kernel->global_work_size(1,p_.local_size_1);
|
||||
|
||||
unsigned int current_arg = 0;
|
||||
if (is_trans)
|
||||
{
|
||||
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
|
||||
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
|
||||
}
|
||||
else
|
||||
{
|
||||
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size1_fun())));
|
||||
kernel->arg(current_arg++, cl_uint(utils::call_on_matrix(A, utils::size2_fun())));
|
||||
}
|
||||
|
||||
|
||||
set_arguments(statements, *kernel, current_arg);
|
||||
viennacl::ocl::enqueue(*kernel);
|
||||
}
|
||||
|
||||
private:
|
||||
const char A_trans_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
540
atidlas/templates/template_base.hpp
Normal file
540
atidlas/templates/template_base.hpp
Normal file
@@ -0,0 +1,540 @@
|
||||
#ifndef ATIDLAS_TEMPLATES_TEMPLATE_BASE_
|
||||
#define ATIDLAS_TEMPLATES_TEMPLATE_BASE_
|
||||
|
||||
|
||||
#include <list>
|
||||
#include <set>
|
||||
|
||||
#include "viennacl/ocl/kernel.hpp"
|
||||
#include "viennacl/ocl/device.hpp"
|
||||
#include "viennacl/ocl/device_utils.hpp"
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
#include "viennacl/scheduler/io.hpp"
|
||||
|
||||
#include "atidlas/lazy_program_compiler.hpp"
|
||||
#include "atidlas/mapped_objects.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
enum fetching_policy_type
|
||||
{
|
||||
FETCH_FROM_LOCAL,
|
||||
FETCH_FROM_GLOBAL_STRIDED,
|
||||
FETCH_FROM_GLOBAL_CONTIGUOUS
|
||||
};
|
||||
|
||||
class template_base
|
||||
{
|
||||
public:
|
||||
struct parameters_type
|
||||
{
|
||||
parameters_type(unsigned int _simd_width, unsigned int _local_size_1, unsigned int _local_size_2, unsigned int _num_kernels) : simd_width(_simd_width), local_size_0(_local_size_1), local_size_1(_local_size_2), num_kernels(_num_kernels){ }
|
||||
|
||||
unsigned int simd_width;
|
||||
unsigned int local_size_0;
|
||||
unsigned int local_size_1;
|
||||
unsigned int num_kernels;
|
||||
};
|
||||
|
||||
private:
|
||||
/** @brief Functor to map the statements to the types defined in mapped_objects.hpp */
|
||||
class map_functor : public tree_parsing::traversal_functor
|
||||
{
|
||||
|
||||
viennacl::scheduler::statement_node_numeric_type numeric_type(viennacl::scheduler::statement const * statement, atidlas_int_t root_idx) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const * root_node = &statement->array()[root_idx];
|
||||
while (root_node->lhs.numeric_type==viennacl::scheduler::INVALID_NUMERIC_TYPE)
|
||||
root_node = &statement->array()[root_node->lhs.node_index];
|
||||
return root_node->lhs.numeric_type;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef tools::shared_ptr<mapped_object> result_type;
|
||||
|
||||
map_functor(symbolic_binder & binder, mapping_type & mapping) : binder_(binder), mapping_(mapping){ }
|
||||
|
||||
/** @brief Binary leaf */
|
||||
template<class T>
|
||||
result_type binary_leaf(viennacl::scheduler::statement const * statement, atidlas_int_t root_idx, mapping_type const * mapping) const
|
||||
{
|
||||
return result_type(new T(utils::numeric_type_to_string(numeric_type(statement,root_idx)), binder_.get(NULL), mapped_object::node_info(mapping, statement, root_idx)));
|
||||
}
|
||||
|
||||
template<class NumericT>
|
||||
result_type operator()(NumericT const & /*scalar*/) const
|
||||
{
|
||||
return result_type(new mapped_host_scalar(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
|
||||
}
|
||||
|
||||
/** @brief Scalar mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::scalar<NumericT> const & scal) const
|
||||
{
|
||||
return result_type(new mapped_scalar(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(scal))));
|
||||
}
|
||||
|
||||
/** @brief Vector mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::vector_base<NumericT> const & vec) const
|
||||
{
|
||||
return result_type(new mapped_vector(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(vec))));
|
||||
}
|
||||
|
||||
/** @brief Implicit vector mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::implicit_vector_base<NumericT> const & /*vec*/) const
|
||||
{
|
||||
return result_type(new mapped_implicit_vector(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
|
||||
}
|
||||
|
||||
/** @brief Matrix mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
|
||||
{
|
||||
return result_type(new mapped_matrix(utils::type_to_string<NumericT>::value(), binder_.get(&viennacl::traits::handle(mat)),
|
||||
viennacl::traits::row_major(mat)));
|
||||
}
|
||||
|
||||
/** @brief Implicit matrix mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::implicit_matrix_base<NumericT> const & /*mat*/) const
|
||||
{
|
||||
return result_type(new mapped_implicit_matrix(utils::type_to_string<NumericT>::value(), binder_.get(NULL)));
|
||||
}
|
||||
|
||||
/** @brief Traversal functor */
|
||||
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const {
|
||||
mapping_type::key_type key(root_idx, leaf_t);
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
|
||||
if (leaf_t == LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.lhs, *this)));
|
||||
else if (leaf_t == RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
mapping_.insert(mapping_type::value_type(key, utils::call_on_element(root_node.rhs, *this)));
|
||||
else if ( leaf_t== PARENT_NODE_TYPE)
|
||||
{
|
||||
if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_vector_diag>(&statement, root_idx, &mapping_)));
|
||||
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_diag>(&statement, root_idx, &mapping_)));
|
||||
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_row>(&statement, root_idx, &mapping_)));
|
||||
else if (root_node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_column>(&statement, root_idx, &mapping_)));
|
||||
else if (is_scalar_reduction(root_node))
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_scalar_reduction>(&statement, root_idx, &mapping_)));
|
||||
else if (is_vector_reduction(root_node))
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_row_wise_reduction>(&statement, root_idx, &mapping_)));
|
||||
else if (root_node.op.type == viennacl::scheduler::OPERATION_BINARY_MAT_MAT_PROD_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_matrix_product>(&statement, root_idx, &mapping_)));
|
||||
else if (root_node.op.type == viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
|
||||
mapping_.insert(mapping_type::value_type(key, binary_leaf<mapped_trans>(&statement, root_idx, &mapping_)));
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
symbolic_binder & binder_;
|
||||
mapping_type & mapping_;
|
||||
};
|
||||
|
||||
/** @brief functor for setting the arguments of a kernel */
|
||||
class set_arguments_functor : public tree_parsing::traversal_functor
|
||||
{
|
||||
public:
|
||||
typedef void result_type;
|
||||
|
||||
set_arguments_functor(symbolic_binder & binder, unsigned int & current_arg, viennacl::ocl::kernel & kernel) : binder_(binder), current_arg_(current_arg), kernel_(kernel){ }
|
||||
|
||||
template<class NumericT>
|
||||
result_type operator()(NumericT const & scal) const
|
||||
{
|
||||
typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
|
||||
kernel_.arg(current_arg_++, cl_scalartype(scal));
|
||||
}
|
||||
|
||||
/** @brief Scalar mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::scalar<NumericT> const & scal) const
|
||||
{
|
||||
if (binder_.bind(&viennacl::traits::handle(scal)))
|
||||
kernel_.arg(current_arg_++, scal.handle().opencl_handle());
|
||||
}
|
||||
|
||||
/** @brief Vector mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::vector_base<NumericT> const & vec) const
|
||||
{
|
||||
if (binder_.bind(&viennacl::traits::handle(vec)))
|
||||
{
|
||||
kernel_.arg(current_arg_++, vec.handle().opencl_handle());
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start(vec)));
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride(vec)));
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Implicit vector mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::implicit_vector_base<NumericT> const & vec) const
|
||||
{
|
||||
typedef typename viennacl::result_of::cl_type<NumericT>::type cl_scalartype;
|
||||
kernel_.arg(current_arg_++, cl_scalartype(vec.value()));
|
||||
if (vec.has_index())
|
||||
kernel_.arg(current_arg_++, cl_uint(vec.index()));
|
||||
}
|
||||
|
||||
/** @brief Matrix mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
|
||||
{
|
||||
if (binder_.bind(&viennacl::traits::handle(mat)))
|
||||
{
|
||||
kernel_.arg(current_arg_++, mat.handle().opencl_handle());
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::ld(mat)));
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start1(mat)));
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::start2(mat)));
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride1(mat)));
|
||||
kernel_.arg(current_arg_++, cl_uint(viennacl::traits::stride2(mat)));
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief Implicit matrix mapping */
|
||||
template<class NumericT>
|
||||
result_type operator()(viennacl::implicit_matrix_base<NumericT> const & mat) const
|
||||
{
|
||||
kernel_.arg(current_arg_++, typename viennacl::result_of::cl_type<NumericT>::type(mat.value()));
|
||||
}
|
||||
|
||||
/** @brief Traversal functor: */
|
||||
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
utils::call_on_element(root_node.lhs, *this);
|
||||
else if (leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
utils::call_on_element(root_node.rhs, *this);
|
||||
}
|
||||
|
||||
private:
|
||||
symbolic_binder & binder_;
|
||||
unsigned int & current_arg_;
|
||||
viennacl::ocl::kernel & kernel_;
|
||||
};
|
||||
|
||||
protected:
|
||||
|
||||
static std::string generate_arguments(std::string const & data_type, std::vector<mapping_type> const & mappings, statements_container const & statements)
|
||||
{
|
||||
utils::kernel_generation_stream stream;
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE, utils::create_process_accessors("scalar", "__global #scalartype* #pointer,")
|
||||
("host_scalar", "#scalartype #name,")
|
||||
("matrix", "__global " + data_type + "* #pointer, uint #ld, uint #start1, uint #start2, uint #stride1, uint #stride2,")
|
||||
("vector", "__global " + data_type + "* #pointer, uint #start, uint #stride,")
|
||||
("implicit_vector", "#scalartype #name,")
|
||||
("implicit_matrix", "#scalartype #name,")
|
||||
,statements, mappings);
|
||||
std::string res = stream.str();
|
||||
res.erase(res.rfind(','));
|
||||
return res;
|
||||
}
|
||||
|
||||
void set_arguments(statements_container const & statements, viennacl::ocl::kernel & kernel, unsigned int & current_arg)
|
||||
{
|
||||
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
|
||||
for (statements_container::data_type::const_iterator itt = statements.data().begin(); itt != statements.data().end(); ++itt)
|
||||
tree_parsing::traverse(*itt, itt->root(), set_arguments_functor(*binder,current_arg,kernel), true);
|
||||
}
|
||||
|
||||
class invalid_template_exception : public std::exception
|
||||
{
|
||||
public:
|
||||
invalid_template_exception() : message_() {}
|
||||
invalid_template_exception(std::string message) :
|
||||
message_("ViennaCL: Internal error: The generator cannot apply the given template to the given statement: " + message + "\n"
|
||||
"If you are using a builtin template, please report on viennacl-support@lists.sourceforge.net! We will provide a fix as soon as possible\n"
|
||||
"If you are using your own template, please try using other parameters") {}
|
||||
virtual const char* what() const throw() { return message_.c_str(); }
|
||||
virtual ~invalid_template_exception() throw() {}
|
||||
private:
|
||||
std::string message_;
|
||||
};
|
||||
|
||||
static void fetching_loop_info(fetching_policy_type policy, std::string const & bound, utils::kernel_generation_stream & stream, std::string & init, std::string & upper_bound, std::string & inc, std::string const & domain_id, std::string const & domain_size)
|
||||
{
|
||||
if (policy==FETCH_FROM_GLOBAL_STRIDED)
|
||||
{
|
||||
init = domain_id;
|
||||
upper_bound = bound;
|
||||
inc = domain_size;
|
||||
}
|
||||
else if (policy==FETCH_FROM_GLOBAL_CONTIGUOUS)
|
||||
{
|
||||
std::string chunk_size = "chunk_size";
|
||||
std::string chunk_start = "chunk_start";
|
||||
std::string chunk_end = "chunk_end";
|
||||
|
||||
stream << "unsigned int " << chunk_size << " = (" << bound << "+" << domain_size << "-1)/" << domain_size << ";" << std::endl;
|
||||
stream << "unsigned int " << chunk_start << " =" << domain_id << "*" << chunk_size << ";" << std::endl;
|
||||
stream << "unsigned int " << chunk_end << " = min(" << chunk_start << "+" << chunk_size << ", " << bound << ");" << std::endl;
|
||||
init = chunk_start;
|
||||
upper_bound = chunk_end;
|
||||
inc = "1";
|
||||
}
|
||||
}
|
||||
|
||||
static bool is_node_trans(viennacl::scheduler::statement::container_type const & array, size_t root_idx, leaf_t leaf_type)
|
||||
{
|
||||
bool res = false;
|
||||
viennacl::scheduler::lhs_rhs_element viennacl::scheduler::statement_node::*ptr;
|
||||
if (leaf_type==LHS_NODE_TYPE)
|
||||
ptr = &viennacl::scheduler::statement_node::lhs;
|
||||
else
|
||||
ptr = &viennacl::scheduler::statement_node::rhs;
|
||||
viennacl::scheduler::statement_node const * node = &array[root_idx];
|
||||
while ((node->*ptr).type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
{
|
||||
if (array[(node->*ptr).node_index].op.type==viennacl::scheduler::OPERATION_UNARY_TRANS_TYPE)
|
||||
res = !res;
|
||||
node = &array[(node->*ptr).node_index];
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
protected:
|
||||
|
||||
static bool is_offset_modifier(viennacl::scheduler::statement_node const & node)
|
||||
{
|
||||
return node.op.type==viennacl::scheduler::OPERATION_BINARY_VECTOR_DIAG_TYPE
|
||||
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_DIAG_TYPE
|
||||
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_ROW_TYPE
|
||||
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MATRIX_COLUMN_TYPE;
|
||||
}
|
||||
|
||||
static bool has_strided_access(statements_container const & statements)
|
||||
{
|
||||
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
|
||||
{
|
||||
//checks for vectors
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> vectors = tree_parsing::filter_elements(viennacl::scheduler::DENSE_VECTOR_TYPE, *it);
|
||||
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
|
||||
if (utils::call_on_vector(*itt, utils::stride_fun())>1)
|
||||
return true;
|
||||
|
||||
//checks for matrix
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> matrices = tree_parsing::filter_elements(viennacl::scheduler::DENSE_MATRIX_TYPE, *it);
|
||||
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
|
||||
if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>2)
|
||||
return true;
|
||||
|
||||
if(tree_parsing::filter_nodes(&is_offset_modifier, *it, true).empty()==false)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
static atidlas_int_t vector_size(viennacl::scheduler::statement_node const & node, bool up_to_internal_size)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
using namespace utils;
|
||||
if (node.op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE)
|
||||
{
|
||||
atidlas_int_t size1 = up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
|
||||
atidlas_int_t size2 = up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
|
||||
return std::min<atidlas_int_t>(size1, size2);
|
||||
}
|
||||
else if (node.op.type==OPERATION_BINARY_MATRIX_ROW_TYPE)
|
||||
return up_to_internal_size?call_on_matrix(node.lhs, internal_size2_fun()):call_on_matrix(node.lhs, size2_fun());
|
||||
else if (node.op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE)
|
||||
return up_to_internal_size?call_on_matrix(node.lhs, internal_size1_fun()):call_on_matrix(node.lhs, size1_fun());
|
||||
else
|
||||
return up_to_internal_size?call_on_vector(node.lhs, internal_size_fun()):call_on_vector(node.lhs, size_fun());
|
||||
}
|
||||
|
||||
//NB : templates are not used here because declaring a functor out of the generate() functions would be harder to read
|
||||
struct loop_body_base
|
||||
{
|
||||
virtual void operator()(utils::kernel_generation_stream & stream, unsigned int simd_width) const = 0;
|
||||
};
|
||||
|
||||
static void element_wise_loop_1D(utils::kernel_generation_stream & stream, loop_body_base const & loop_body,
|
||||
fetching_policy_type fetch, unsigned int simd_width, std::string const & i, std::string const & bound, std::string const & domain_id, std::string const & domain_size)
|
||||
{
|
||||
std::string strwidth = tools::to_string(simd_width);
|
||||
std::string boundround = bound + "/" + strwidth;
|
||||
|
||||
std::string init, upper_bound, inc;
|
||||
fetching_loop_info(fetch, boundround, stream, init, upper_bound, inc, domain_id, domain_size);
|
||||
stream << "for(unsigned int " << i << " = " << init << "; " << i << " < " << upper_bound << "; " << i << " += " << inc << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
loop_body(stream, simd_width);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
if (simd_width>1)
|
||||
{
|
||||
stream << "for(unsigned int " << i << " = " << boundround << "*" << strwidth << " + " << domain_id << "; " << i << " < " << bound << "; " << i << " += " + domain_size + ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
loop_body(stream, 1);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
static std::string vstore(unsigned int simd_width, std::string const & value, std::string const & offset, std::string const & ptr)
|
||||
{
|
||||
if (simd_width==1)
|
||||
return "(" + ptr + ")[" + offset + "] = " + value;
|
||||
else
|
||||
return utils::append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
|
||||
}
|
||||
|
||||
static std::string vload(unsigned int simd_width, std::string const & offset, std::string const & ptr)
|
||||
{
|
||||
if (simd_width==1)
|
||||
return "(" + ptr + ")[" + offset + "]";
|
||||
else
|
||||
return utils::append_width("vload", simd_width) + "(" + offset + ", " + ptr + ")";
|
||||
}
|
||||
|
||||
private:
|
||||
/** @brief Generates the body of the associated kernel function */
|
||||
virtual std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mapping) const = 0;
|
||||
|
||||
public:
|
||||
template_base(binding_policy_t binding_policy) : binding_policy_(binding_policy) {}
|
||||
|
||||
virtual ~template_base(){ }
|
||||
|
||||
std::vector<std::string> generate(std::string const & kernel_prefix, statements_container const & statements, viennacl::ocl::device const & device)
|
||||
{
|
||||
statements_container::data_type::const_iterator sit;
|
||||
std::vector<mapping_type>::iterator mit;
|
||||
|
||||
if(int err = check_invalid(statements, device))
|
||||
throw generator_not_supported_exception("The supplied parameters for this template are invalid : err " + tools::to_string(err));
|
||||
|
||||
//Create mapping
|
||||
std::vector<mapping_type> mappings(statements.data().size());
|
||||
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy_);
|
||||
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++sit, ++mit)
|
||||
tree_parsing::traverse(*sit, sit->root(), map_functor(*binder,*mit), true);
|
||||
|
||||
return generate_impl(kernel_prefix, statements, mappings);
|
||||
}
|
||||
|
||||
/** @brief returns whether or not the profile has undefined behavior on particular device */
|
||||
virtual int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const = 0;
|
||||
|
||||
virtual void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements) = 0;
|
||||
|
||||
virtual tools::shared_ptr<template_base> clone() const = 0;
|
||||
private:
|
||||
binding_policy_t binding_policy_;
|
||||
};
|
||||
|
||||
|
||||
template<class TemplateType, class ParametersType>
|
||||
class template_base_impl : public template_base
|
||||
{
|
||||
private:
|
||||
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const { return TEMPLATE_VALID; }
|
||||
virtual unsigned int n_lmem_elements() const { return 0; }
|
||||
|
||||
protected:
|
||||
|
||||
bool has_misaligned_offset(statements_container const & statements)
|
||||
{
|
||||
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
|
||||
{
|
||||
//checks for vectors
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> vectors = tree_parsing::filter_elements(viennacl::scheduler::DENSE_VECTOR_TYPE, *it);
|
||||
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = vectors.begin(); itt != vectors.end(); ++itt)
|
||||
if (utils::call_on_vector(*itt, utils::stride_fun())>1)
|
||||
return true;
|
||||
|
||||
//checks for matrix
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> matrices = tree_parsing::filter_elements(viennacl::scheduler::DENSE_MATRIX_TYPE, *it);
|
||||
for (std::vector<viennacl::scheduler::lhs_rhs_element>::iterator itt = matrices.begin(); itt != matrices.end(); ++itt)
|
||||
if (utils::call_on_matrix(*itt, utils::stride1_fun())>1 || utils::call_on_matrix(*itt, utils::stride2_fun())>2)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public:
|
||||
typedef ParametersType parameters_type;
|
||||
|
||||
/** @brief The constructor */
|
||||
template_base_impl(parameters_type const & parameters, binding_policy_t binding_policy) : template_base(binding_policy), p_(parameters){ }
|
||||
|
||||
parameters_type const & parameters() const
|
||||
{
|
||||
return p_;
|
||||
}
|
||||
|
||||
tools::shared_ptr<template_base> clone() const
|
||||
{
|
||||
return tools::shared_ptr<template_base>(new TemplateType(*dynamic_cast<TemplateType const *>(this)));
|
||||
}
|
||||
|
||||
/** @brief returns whether or not the profile has undefined behavior on particular device */
|
||||
int check_invalid(statements_container const & statements, viennacl::ocl::device const & device) const
|
||||
{
|
||||
using namespace viennacl::tools;
|
||||
|
||||
viennacl::scheduler::statement const & statement = statements.data().front();
|
||||
unsigned int scalartype_size = utils::size_of(lhs_most(statement.array(), statement.root()).lhs.numeric_type);
|
||||
|
||||
//Query device informations
|
||||
size_t lmem_available = static_cast<size_t>(device.local_mem_size());
|
||||
size_t lmem_usage = scalartype_size*n_lmem_elements();
|
||||
if (lmem_usage>lmem_available)
|
||||
return TEMPLATE_LOCAL_MEMORY_OVERFLOW;
|
||||
|
||||
//Invalid work group size
|
||||
size_t max_workgroup_size = device.max_work_group_size();
|
||||
std::vector<size_t> max_work_item_sizes = device.max_work_item_sizes();
|
||||
if (p_.local_size_0*p_.local_size_1 > max_workgroup_size)
|
||||
return TEMPLATE_WORK_GROUP_SIZE_OVERFLOW;
|
||||
if (p_.local_size_0 > max_work_item_sizes[0])
|
||||
return TEMPLATE_LOCAL_SIZE_0_OVERFLOW;
|
||||
|
||||
if (p_.local_size_1 > max_work_item_sizes[1])
|
||||
return TEMPLATE_LOCAL_SIZE_1_OVERFLOW;
|
||||
|
||||
//Advice from the Intel guide
|
||||
unsigned int warp_size = 8;
|
||||
if (device.type()==CL_DEVICE_TYPE_GPU)
|
||||
{
|
||||
//Advice from the nvidia guide
|
||||
warp_size = 32;
|
||||
//Advice from the AMD guide
|
||||
if (device.vendor_id()==4098)
|
||||
warp_size = 64;
|
||||
}
|
||||
if (((p_.local_size_0*p_.local_size_1)%warp_size)>0)
|
||||
return TEMPLATE_LOCAL_SIZE_NOT_WARP_MULTIPLE;
|
||||
|
||||
//Invalid SIMD Width
|
||||
if (p_.simd_width!=1 && p_.simd_width!=2 &&
|
||||
p_.simd_width!=4 && p_.simd_width!=8 &&
|
||||
p_.simd_width!=16)
|
||||
return TEMPLATE_INVALID_SIMD_WIDTH;
|
||||
|
||||
return check_invalid_impl(device);
|
||||
}
|
||||
|
||||
protected:
|
||||
parameters_type p_;
|
||||
binding_policy_t binding_policy_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
79
atidlas/templates/utils.hpp
Normal file
79
atidlas/templates/utils.hpp
Normal file
@@ -0,0 +1,79 @@
|
||||
#ifndef ATIDLAS_TEMPLATES_REDUCTION_UTILS_HPP
|
||||
#define ATIDLAS_TEMPLATES_REDUCTION_UTILS_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
inline void compute_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
if (utils::elementwise_function(op))
|
||||
os << acc << "=" << tree_parsing::evaluate(op.type) << "(" << acc << "," << cur << ");" << std::endl;
|
||||
else
|
||||
os << acc << "= (" << acc << ")" << tree_parsing::evaluate(op.type) << "(" << cur << ");" << std::endl;
|
||||
}
|
||||
|
||||
inline void compute_index_reduction(utils::kernel_generation_stream & os, std::string acc, std::string cur, std::string const & acc_value, std::string const & cur_value, viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
// os << acc << " = " << cur_value << ">" << acc_value << "?" << cur << ":" << acc << ";" << std::endl;
|
||||
os << acc << "= select(" << acc << "," << cur << "," << cur_value << ">" << acc_value << ");" << std::endl;
|
||||
os << acc_value << "=";
|
||||
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE) os << "fmax";
|
||||
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE) os << "max";
|
||||
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE) os << "fmin";
|
||||
if (op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE) os << "min";
|
||||
os << "(" << acc_value << "," << cur_value << ");"<< std::endl;
|
||||
}
|
||||
|
||||
inline void process_all(std::string const & type_key, std::string const & str,
|
||||
utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings)
|
||||
{
|
||||
for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
|
||||
for (mapping_type::const_iterator mmit = mit->begin(); mmit != mit->end(); ++mmit)
|
||||
if (mmit->second->type_key()==type_key)
|
||||
stream << mmit->second->process(str) << std::endl;
|
||||
}
|
||||
|
||||
|
||||
inline void process_all_at(std::string const & type_key, std::string const & str,
|
||||
utils::kernel_generation_stream & stream, std::vector<mapping_type> const & mappings,
|
||||
size_t root_idx, leaf_t leaf)
|
||||
{
|
||||
for (std::vector<mapping_type>::const_iterator mit = mappings.begin(); mit != mappings.end(); ++mit)
|
||||
{
|
||||
mapped_object * obj = mit->at(mapping_key(root_idx, leaf)).get();
|
||||
if (obj->type_key()==type_key)
|
||||
stream << obj->process(str) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string neutral_element(viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
switch (op.type)
|
||||
{
|
||||
case viennacl::scheduler::OPERATION_BINARY_ADD_TYPE : return "0";
|
||||
case viennacl::scheduler::OPERATION_BINARY_MULT_TYPE : return "1";
|
||||
case viennacl::scheduler::OPERATION_BINARY_DIV_TYPE : return "1";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "-INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "-INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_MAX_TYPE : return "-INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "-INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_MIN_TYPE : return "INFINITY";
|
||||
case viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "INFINITY";
|
||||
|
||||
default: throw generator_not_supported_exception("Unsupported reduction operator : no neutral element known");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
130
atidlas/templates/vector_axpy_template.hpp
Normal file
130
atidlas/templates/vector_axpy_template.hpp
Normal file
@@ -0,0 +1,130 @@
|
||||
#ifndef ATIDLAS_TEMPLATES_VECTOR_AXPY_HPP
|
||||
#define ATIDLAS_TEMPLATES_VECTOR_AXPY_HPP
|
||||
|
||||
#include <vector>
|
||||
#include <cmath>
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/mapped_objects.hpp"
|
||||
#include "atidlas/tree_parsing.hpp"
|
||||
#include "atidlas/forwards.h"
|
||||
#include "atidlas/utils.hpp"
|
||||
|
||||
#include "atidlas/templates/template_base.hpp"
|
||||
#include "atidlas/templates/utils.hpp"
|
||||
|
||||
#include "viennacl/tools/tools.hpp"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
class vector_axpy_parameters : public template_base::parameters_type
|
||||
{
|
||||
public:
|
||||
vector_axpy_parameters(unsigned int _simd_width,
|
||||
unsigned int _group_size, unsigned int _num_groups,
|
||||
fetching_policy_type _fetching_policy) : template_base::parameters_type(_simd_width, _group_size, 1, 1), num_groups(_num_groups), fetching_policy(_fetching_policy){ }
|
||||
|
||||
|
||||
|
||||
unsigned int num_groups;
|
||||
fetching_policy_type fetching_policy;
|
||||
};
|
||||
|
||||
class vector_axpy_template : public template_base_impl<vector_axpy_template, vector_axpy_parameters>
|
||||
{
|
||||
private:
|
||||
virtual int check_invalid_impl(viennacl::ocl::device const & /*dev*/) const
|
||||
{
|
||||
if (p_.fetching_policy==FETCH_FROM_LOCAL)
|
||||
return TEMPLATE_INVALID_FETCHING_POLICY_TYPE;
|
||||
return TEMPLATE_VALID;
|
||||
}
|
||||
|
||||
std::vector<std::string> generate_impl(std::string const & kernel_prefix, statements_container const & statements, std::vector<mapping_type> const & mappings) const
|
||||
{
|
||||
std::vector<std::string> result;
|
||||
for (unsigned int i = 0; i < 2; ++i)
|
||||
{
|
||||
utils::kernel_generation_stream stream;
|
||||
unsigned int simd_width = (i==0)?1:p_.simd_width;
|
||||
std::string str_simd_width = tools::to_string(simd_width);
|
||||
std::string data_type = utils::append_width("#scalartype",simd_width);
|
||||
|
||||
stream << " __attribute__((reqd_work_group_size(" << p_.local_size_0 << ",1,1)))" << std::endl;
|
||||
stream << "__kernel void " << kernel_prefix << i << "(unsigned int N," << generate_arguments(data_type, mappings, statements) << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE,
|
||||
utils::create_process_accessors("scalar", "#scalartype #namereg = *#pointer;")
|
||||
("matrix", "#pointer += #start1 + #start2*#ld;")
|
||||
("vector", "#pointer += #start;")
|
||||
("vector", "#start/=" + str_simd_width + ";"), statements, mappings);
|
||||
|
||||
std::string init, upper_bound, inc;
|
||||
fetching_loop_info(p_.fetching_policy, "N/"+str_simd_width, stream, init, upper_bound, inc, "get_global_id(0)", "get_global_size(0)");
|
||||
stream << "for(unsigned int i = " << init << "; i < " << upper_bound << "; i += " << inc << ")" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
tree_parsing::process(stream, PARENT_NODE_TYPE,
|
||||
utils::create_process_accessors("vector", data_type + " #namereg = #pointer[i*#stride];")
|
||||
("matrix_row", "#scalartype #namereg = #pointer[$OFFSET{#row*#stride1, i*#stride2}];")
|
||||
("matrix_column", "#scalartype #namereg = #pointer[$OFFSET{i*#stride1,#column*#stride2}];")
|
||||
("matrix_diag", "#scalartype #namereg = #pointer[#diag_offset<0?$OFFSET{(i - #diag_offset)*#stride1, i*#stride2}:$OFFSET{i*#stride1, (i + #diag_offset)*#stride2}];")
|
||||
, statements, mappings);
|
||||
|
||||
tree_parsing::evaluate(stream, PARENT_NODE_TYPE, utils::create_evaluate_accessors("vector", "#namereg")
|
||||
("matrix_row", "#namereg")
|
||||
("matrix_column", "#namereg")
|
||||
("matrix_diag", "#namereg")
|
||||
("scalar", "#namereg"), statements, mappings);
|
||||
|
||||
tree_parsing::process(stream, LHS_NODE_TYPE, utils::create_process_accessors("vector", "#pointer[i*#stride] = #namereg;")
|
||||
("matrix_row", "#pointer[$OFFSET{#row, i}] = #namereg;")
|
||||
("matrix_column", "#pointer[$OFFSET{i, #column}] = #namereg;")
|
||||
("matrix_diag", "#pointer[#diag_offset<0?$OFFSET{i - #diag_offset, i}:$OFFSET{i, i + #diag_offset}] = #namereg;")
|
||||
,statements, mappings);
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
result.push_back(stream.str());
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
public:
|
||||
vector_axpy_template(vector_axpy_template::parameters_type const & parameters, binding_policy_t binding_policy = BIND_ALL_UNIQUE) : template_base_impl<vector_axpy_template, vector_axpy_parameters>(parameters, binding_policy), up_to_internal_size_(false){ }
|
||||
|
||||
void up_to_internal_size(bool v) { up_to_internal_size_ = v; }
|
||||
|
||||
void enqueue(std::string const & kernel_prefix, std::vector<lazy_program_compiler> & programs, statements_container const & statements)
|
||||
{
|
||||
viennacl::scheduler::statement const & statement = statements.data().front();
|
||||
atidlas_int_t size = vector_size(lhs_most(statement.array(), statement.root()), up_to_internal_size_);
|
||||
|
||||
viennacl::ocl::kernel * kernel;
|
||||
if(p_.simd_width > 1 && (has_strided_access(statements) || (size%p_.simd_width>0) || has_misaligned_offset(statements)))
|
||||
kernel = &programs[0].program().get_kernel(kernel_prefix+"0");
|
||||
else
|
||||
kernel = &programs[1].program().get_kernel(kernel_prefix+"1");
|
||||
kernel->local_work_size(0, p_.local_size_0);
|
||||
kernel->global_work_size(0, p_.local_size_0*p_.num_groups);
|
||||
unsigned int current_arg = 0;
|
||||
kernel->arg(current_arg++, static_cast<cl_uint>(size));
|
||||
set_arguments(statements, *kernel, current_arg);
|
||||
viennacl::ocl::enqueue(*kernel);
|
||||
}
|
||||
|
||||
private:
|
||||
bool up_to_internal_size_;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
27
atidlas/tools/find_and_replace.hpp
Normal file
27
atidlas/tools/find_and_replace.hpp
Normal file
@@ -0,0 +1,27 @@
|
||||
#ifndef ATIDLAS_TOOLS_FIND_AND_REPLACE_HPP
|
||||
#define ATIDLAS_TOOLS_FIND_AND_REPLACE_HPP
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
namespace tools
|
||||
{
|
||||
|
||||
int inline find_and_replace(std::string & source, std::string const & find, std::string const & replace)
|
||||
{
|
||||
int num=0;
|
||||
size_t fLen = find.size();
|
||||
size_t rLen = replace.size();
|
||||
for (size_t pos=0; (pos=source.find(find, pos))!=std::string::npos; pos+=rLen)
|
||||
{
|
||||
num++;
|
||||
source.replace(pos, fLen, replace);
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
178
atidlas/tools/shared_ptr.hpp
Normal file
178
atidlas/tools/shared_ptr.hpp
Normal file
@@ -0,0 +1,178 @@
|
||||
#ifndef ATIDLAS_TOOLS_SHARED_PTR_HPP
|
||||
#define ATIDLAS_TOOLS_SHARED_PTR_HPP
|
||||
|
||||
/* =========================================================================
|
||||
Copyright (c) 2010-2012, Institute for Microelectronics,
|
||||
Institute for Analysis and Scientific Computing,
|
||||
TU Wien.
|
||||
Portions of this software are copyright by UChicago Argonne, LLC.
|
||||
|
||||
-----------------
|
||||
ViennaCL - The Vienna Computing Library
|
||||
-----------------
|
||||
|
||||
Project Head: Karl Rupp rupp@iue.tuwien.ac.at
|
||||
|
||||
(A list of authors and contributors can be found in the PDF manual)
|
||||
|
||||
License: MIT (X11), see file LICENSE in the base directory
|
||||
============================================================================= */
|
||||
|
||||
/** @file tools/shared_ptr.hpp
|
||||
@brief Implementation of a shared pointer class (cf. std::shared_ptr, boost::shared_ptr). Will be used until C++11 is widely available.
|
||||
|
||||
Contributed by Philippe Tillet.
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <algorithm>
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
namespace tools
|
||||
{
|
||||
namespace detail
|
||||
{
|
||||
|
||||
/** @brief Reference counting class for the shared_ptr implementation */
|
||||
class count
|
||||
{
|
||||
public:
|
||||
count(unsigned int val) : val_(val){ }
|
||||
void dec(){ --val_; }
|
||||
void inc(){ ++val_; }
|
||||
bool is_null(){ return val_ == 0; }
|
||||
unsigned int val(){ return val_; }
|
||||
private:
|
||||
unsigned int val_;
|
||||
};
|
||||
|
||||
/** @brief Interface for the reference counter inside the shared_ptr */
|
||||
struct aux
|
||||
{
|
||||
detail::count count;
|
||||
|
||||
aux() :count(1) {}
|
||||
virtual void destroy()=0;
|
||||
virtual ~aux() {}
|
||||
};
|
||||
|
||||
/** @brief Implementation helper for the reference counting mechanism inside shared_ptr. */
|
||||
template<class U, class Deleter>
|
||||
struct auximpl: public detail::aux
|
||||
{
|
||||
U* p;
|
||||
Deleter d;
|
||||
|
||||
auximpl(U* pu, Deleter x) :p(pu), d(x) {}
|
||||
virtual void destroy() { d(p); }
|
||||
};
|
||||
|
||||
/** @brief Default deleter class for a pointer. The default is to just call 'delete' on the pointer. Provide your own implementations for 'delete[]' and 'free'. */
|
||||
template<class U>
|
||||
struct default_deleter
|
||||
{
|
||||
void operator()(U* p) const { delete p; }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
/** @brief A shared pointer class similar to boost::shared_ptr. Reimplemented in order to avoid a Boost-dependency. Will be replaced by std::shared_ptr as soon as C++11 is widely available. */
|
||||
template<class T>
|
||||
class shared_ptr
|
||||
{
|
||||
template<class U>
|
||||
friend class shared_ptr;
|
||||
|
||||
detail::aux* pa;
|
||||
T* pt;
|
||||
|
||||
public:
|
||||
|
||||
shared_ptr() :pa(NULL), pt(NULL) {}
|
||||
|
||||
template<class U, class Deleter>
|
||||
shared_ptr(U* pu, Deleter d) : pa(new detail::auximpl<U, Deleter>(pu, d)), pt(pu) {}
|
||||
|
||||
template<class U>
|
||||
explicit shared_ptr(U* pu) : pa(new detail::auximpl<U, detail::default_deleter<U> >(pu, detail::default_deleter<U>())), pt(pu) {}
|
||||
|
||||
T* get() const { return pt; }
|
||||
|
||||
T* operator->() const { return pt; }
|
||||
|
||||
T& operator*() const { return *pt; }
|
||||
|
||||
shared_ptr(const shared_ptr& s) :pa(s.pa), pt(s.pt)
|
||||
{
|
||||
inc();
|
||||
}
|
||||
|
||||
template<class U>
|
||||
shared_ptr(const shared_ptr<U>& s) :pa(s.pa), pt(s.pt)
|
||||
{
|
||||
inc();
|
||||
}
|
||||
|
||||
~shared_ptr()
|
||||
{
|
||||
dec();
|
||||
}
|
||||
|
||||
void reset()
|
||||
{
|
||||
shared_ptr<T>().swap(*this);
|
||||
}
|
||||
|
||||
void reset(T * ptr)
|
||||
{
|
||||
shared_ptr<T>(ptr).swap(*this);
|
||||
}
|
||||
|
||||
void swap(shared_ptr<T> & other)
|
||||
{
|
||||
std::swap(pt,other.pt);
|
||||
std::swap(pa, other.pa);
|
||||
}
|
||||
|
||||
|
||||
shared_ptr& operator=(const shared_ptr& s)
|
||||
{
|
||||
if (this!=&s)
|
||||
{
|
||||
dec();
|
||||
pa = s.pa;
|
||||
pt = s.pt;
|
||||
inc();
|
||||
}
|
||||
return *this;
|
||||
}
|
||||
|
||||
|
||||
|
||||
void inc()
|
||||
{
|
||||
if (pa) pa->count.inc();
|
||||
}
|
||||
|
||||
void dec()
|
||||
{
|
||||
if (pa)
|
||||
{
|
||||
pa->count.dec();
|
||||
|
||||
if (pa->count.is_null())
|
||||
{
|
||||
pa->destroy();
|
||||
delete pa;
|
||||
pa = NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
22
atidlas/tools/to_string.hpp
Normal file
22
atidlas/tools/to_string.hpp
Normal file
@@ -0,0 +1,22 @@
|
||||
#ifndef ATIDLAS_TOOLS_TO_STRING_HPP
|
||||
#define ATIDLAS_TOOLS_TO_STRING_HPP
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
namespace tools
|
||||
{
|
||||
|
||||
template<class T>
|
||||
inline std::string to_string ( T const t )
|
||||
{
|
||||
std::stringstream ss;
|
||||
ss << t;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
500
atidlas/tree_parsing.hpp
Normal file
500
atidlas/tree_parsing.hpp
Normal file
@@ -0,0 +1,500 @@
|
||||
#ifndef ATIDLAS_TREE_PARSING_HPP
|
||||
#define ATIDLAS_TREE_PARSING_HPP
|
||||
|
||||
|
||||
#include <set>
|
||||
#include "CL/cl.h"
|
||||
|
||||
#include "viennacl/forwards.h"
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "atidlas/utils.hpp"
|
||||
#include "atidlas/forwards.h"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
|
||||
namespace tree_parsing
|
||||
{
|
||||
|
||||
/** @brief base functor class for traversing a statement */
|
||||
class traversal_functor
|
||||
{
|
||||
public:
|
||||
void call_before_expansion(viennacl::scheduler::statement const &, atidlas_int_t) const { }
|
||||
void call_after_expansion(viennacl::scheduler::statement const &, atidlas_int_t) const { }
|
||||
};
|
||||
|
||||
/** @brief Recursively execute a functor on a statement */
|
||||
template<class Fun>
|
||||
inline void traverse(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, Fun const & fun, bool inspect)
|
||||
{
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
bool recurse = utils::node_leaf(root_node.op)?inspect:true;
|
||||
|
||||
fun.call_before_expansion(statement, root_idx);
|
||||
|
||||
//Lhs:
|
||||
if (recurse)
|
||||
{
|
||||
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
traverse(statement, root_node.lhs.node_index, fun, inspect);
|
||||
if (root_node.lhs.type_family != viennacl::scheduler::INVALID_TYPE_FAMILY)
|
||||
fun(statement, root_idx, LHS_NODE_TYPE);
|
||||
}
|
||||
|
||||
//Self:
|
||||
fun(statement, root_idx, PARENT_NODE_TYPE);
|
||||
|
||||
//Rhs:
|
||||
if (recurse && root_node.rhs.type_family!=viennacl::scheduler::INVALID_TYPE_FAMILY)
|
||||
{
|
||||
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
traverse(statement, root_node.rhs.node_index, fun, inspect);
|
||||
if (root_node.rhs.type_family != viennacl::scheduler::INVALID_TYPE_FAMILY)
|
||||
fun(statement, root_idx, RHS_NODE_TYPE);
|
||||
}
|
||||
|
||||
fun.call_after_expansion(statement, root_idx);
|
||||
}
|
||||
|
||||
class filter_fun : public traversal_functor
|
||||
{
|
||||
public:
|
||||
typedef bool (*pred_t)(viennacl::scheduler::statement_node const & node);
|
||||
|
||||
filter_fun(pred_t pred, std::vector<size_t> & out) : pred_(pred), out_(out){ }
|
||||
|
||||
void operator()(viennacl::scheduler::statement const & statement, size_t root_idx, leaf_t) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const * root_node = &statement.array()[root_idx];
|
||||
if (pred_(*root_node))
|
||||
out_.push_back(root_idx);
|
||||
}
|
||||
private:
|
||||
pred_t pred_;
|
||||
std::vector<size_t> & out_;
|
||||
};
|
||||
|
||||
std::vector<size_t> filter_nodes(bool (*pred)(viennacl::scheduler::statement_node const & node), viennacl::scheduler::statement const & statement, bool inspect)
|
||||
{
|
||||
std::vector<size_t> res;
|
||||
tree_parsing::traverse(statement, statement.root(), filter_fun(pred, res), inspect);
|
||||
return res;
|
||||
}
|
||||
|
||||
class filter_elements_fun : public traversal_functor
|
||||
{
|
||||
public:
|
||||
filter_elements_fun(viennacl::scheduler::statement_node_subtype subtype, std::vector<viennacl::scheduler::lhs_rhs_element> & out) : subtype_(subtype), out_(out) { }
|
||||
|
||||
void operator()(viennacl::scheduler::statement const & statement, size_t root_idx, leaf_t) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const * root_node = &statement.array()[root_idx];
|
||||
if (root_node->lhs.subtype==subtype_)
|
||||
out_.push_back(root_node->lhs);
|
||||
if (root_node->rhs.subtype==subtype_)
|
||||
out_.push_back(root_node->rhs);
|
||||
}
|
||||
private:
|
||||
viennacl::scheduler::statement_node_subtype subtype_;
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> & out_;
|
||||
};
|
||||
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> filter_elements(viennacl::scheduler::statement_node_subtype subtype, viennacl::scheduler::statement const & statement)
|
||||
{
|
||||
std::vector<viennacl::scheduler::lhs_rhs_element> res;
|
||||
tree_parsing::traverse(statement, statement.root(), filter_elements_fun(subtype, res), true);
|
||||
return res;
|
||||
}
|
||||
|
||||
/** @brief generate a string from an operation_node_type */
|
||||
inline const char * evaluate(viennacl::scheduler::operation_node_type type)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
// unary expression
|
||||
switch (type)
|
||||
{
|
||||
//Function
|
||||
case OPERATION_UNARY_ABS_TYPE : return "abs";
|
||||
case OPERATION_UNARY_ACOS_TYPE : return "acos";
|
||||
case OPERATION_UNARY_ASIN_TYPE : return "asin";
|
||||
case OPERATION_UNARY_ATAN_TYPE : return "atan";
|
||||
case OPERATION_UNARY_CEIL_TYPE : return "ceil";
|
||||
case OPERATION_UNARY_COS_TYPE : return "cos";
|
||||
case OPERATION_UNARY_COSH_TYPE : return "cosh";
|
||||
case OPERATION_UNARY_EXP_TYPE : return "exp";
|
||||
case OPERATION_UNARY_FABS_TYPE : return "fabs";
|
||||
case OPERATION_UNARY_FLOOR_TYPE : return "floor";
|
||||
case OPERATION_UNARY_LOG_TYPE : return "log";
|
||||
case OPERATION_UNARY_LOG10_TYPE : return "log10";
|
||||
case OPERATION_UNARY_SIN_TYPE : return "sin";
|
||||
case OPERATION_UNARY_SINH_TYPE : return "sinh";
|
||||
case OPERATION_UNARY_SQRT_TYPE : return "sqrt";
|
||||
case OPERATION_UNARY_TAN_TYPE : return "tan";
|
||||
case OPERATION_UNARY_TANH_TYPE : return "tanh";
|
||||
|
||||
case OPERATION_UNARY_CAST_CHAR_TYPE : return "(char)";
|
||||
case OPERATION_UNARY_CAST_UCHAR_TYPE : return "(uchar)";
|
||||
case OPERATION_UNARY_CAST_SHORT_TYPE : return "(short)";
|
||||
case OPERATION_UNARY_CAST_USHORT_TYPE : return "(ushort)";
|
||||
case OPERATION_UNARY_CAST_INT_TYPE : return "(int)";
|
||||
case OPERATION_UNARY_CAST_UINT_TYPE : return "(uint)";
|
||||
case OPERATION_UNARY_CAST_LONG_TYPE : return "(long)";
|
||||
case OPERATION_UNARY_CAST_ULONG_TYPE : return "(ulong)";
|
||||
case OPERATION_UNARY_CAST_HALF_TYPE : return "(half)";
|
||||
case OPERATION_UNARY_CAST_FLOAT_TYPE : return "(float)";
|
||||
case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "(double)";
|
||||
|
||||
case OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE : return "argfmax";
|
||||
case OPERATION_BINARY_ELEMENT_ARGMAX_TYPE : return "argmax";
|
||||
case OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE : return "argfmin";
|
||||
case OPERATION_BINARY_ELEMENT_ARGMIN_TYPE : return "argmin";
|
||||
case OPERATION_BINARY_ELEMENT_POW_TYPE : return "pow";
|
||||
|
||||
//Arithmetic
|
||||
case OPERATION_UNARY_MINUS_TYPE : return "-";
|
||||
case OPERATION_BINARY_ASSIGN_TYPE : return "=";
|
||||
case OPERATION_BINARY_INPLACE_ADD_TYPE : return "+=";
|
||||
case OPERATION_BINARY_INPLACE_SUB_TYPE : return "-=";
|
||||
case OPERATION_BINARY_ADD_TYPE : return "+";
|
||||
case OPERATION_BINARY_SUB_TYPE : return "-";
|
||||
case OPERATION_BINARY_MULT_TYPE : return "*";
|
||||
case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "*";
|
||||
case OPERATION_BINARY_DIV_TYPE : return "/";
|
||||
case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "/";
|
||||
case OPERATION_BINARY_ACCESS_TYPE : return "[]";
|
||||
|
||||
//Relational
|
||||
case OPERATION_BINARY_ELEMENT_EQ_TYPE : return "isequal";
|
||||
case OPERATION_BINARY_ELEMENT_NEQ_TYPE : return "isnotequal";
|
||||
case OPERATION_BINARY_ELEMENT_GREATER_TYPE : return "isgreater";
|
||||
case OPERATION_BINARY_ELEMENT_GEQ_TYPE : return "isgreaterequal";
|
||||
case OPERATION_BINARY_ELEMENT_LESS_TYPE : return "isless";
|
||||
case OPERATION_BINARY_ELEMENT_LEQ_TYPE : return "islessequal";
|
||||
|
||||
case OPERATION_BINARY_ELEMENT_FMAX_TYPE : return "fmax";
|
||||
case OPERATION_BINARY_ELEMENT_FMIN_TYPE : return "fmin";
|
||||
case OPERATION_BINARY_ELEMENT_MAX_TYPE : return "max";
|
||||
case OPERATION_BINARY_ELEMENT_MIN_TYPE : return "min";
|
||||
//Unary
|
||||
case OPERATION_UNARY_TRANS_TYPE : return "trans";
|
||||
|
||||
//Binary
|
||||
case OPERATION_BINARY_INNER_PROD_TYPE : return "iprod";
|
||||
case OPERATION_BINARY_MAT_MAT_PROD_TYPE : return "mmprod";
|
||||
case OPERATION_BINARY_MAT_VEC_PROD_TYPE : return "mvprod";
|
||||
case OPERATION_BINARY_VECTOR_DIAG_TYPE : return "vdiag";
|
||||
case OPERATION_BINARY_MATRIX_DIAG_TYPE : return "mdiag";
|
||||
case OPERATION_BINARY_MATRIX_ROW_TYPE : return "row";
|
||||
case OPERATION_BINARY_MATRIX_COLUMN_TYPE : return "col";
|
||||
|
||||
default : throw generator_not_supported_exception("Unsupported operator");
|
||||
}
|
||||
}
|
||||
|
||||
inline const char * operator_string(viennacl::scheduler::operation_node_type type)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
switch (type)
|
||||
{
|
||||
case OPERATION_UNARY_CAST_CHAR_TYPE : return "char";
|
||||
case OPERATION_UNARY_CAST_UCHAR_TYPE : return "uchar";
|
||||
case OPERATION_UNARY_CAST_SHORT_TYPE : return "short";
|
||||
case OPERATION_UNARY_CAST_USHORT_TYPE : return "ushort";
|
||||
case OPERATION_UNARY_CAST_INT_TYPE : return "int";
|
||||
case OPERATION_UNARY_CAST_UINT_TYPE : return "uint";
|
||||
case OPERATION_UNARY_CAST_LONG_TYPE : return "long";
|
||||
case OPERATION_UNARY_CAST_ULONG_TYPE : return "ulong";
|
||||
case OPERATION_UNARY_CAST_HALF_TYPE : return "half";
|
||||
case OPERATION_UNARY_CAST_FLOAT_TYPE : return "float";
|
||||
case OPERATION_UNARY_CAST_DOUBLE_TYPE : return "double";
|
||||
|
||||
case OPERATION_UNARY_MINUS_TYPE : return "umin";
|
||||
case OPERATION_BINARY_ASSIGN_TYPE : return "assign";
|
||||
case OPERATION_BINARY_INPLACE_ADD_TYPE : return "ip_add";
|
||||
case OPERATION_BINARY_INPLACE_SUB_TYPE : return "ip_sub";
|
||||
case OPERATION_BINARY_ADD_TYPE : return "add";
|
||||
case OPERATION_BINARY_SUB_TYPE : return "sub";
|
||||
case OPERATION_BINARY_MULT_TYPE : return "mult";
|
||||
case OPERATION_BINARY_ELEMENT_PROD_TYPE : return "eprod";
|
||||
case OPERATION_BINARY_DIV_TYPE : return "div";
|
||||
case OPERATION_BINARY_ELEMENT_DIV_TYPE : return "ediv";
|
||||
case OPERATION_BINARY_ACCESS_TYPE : return "acc";
|
||||
default : return evaluate(type);
|
||||
}
|
||||
}
|
||||
|
||||
/** @brief functor for generating the expression string from a statement */
|
||||
class evaluate_expression_traversal: public tree_parsing::traversal_functor
|
||||
{
|
||||
private:
|
||||
std::map<std::string, std::string> const & accessors_;
|
||||
std::string & str_;
|
||||
mapping_type const & mapping_;
|
||||
|
||||
public:
|
||||
evaluate_expression_traversal(std::map<std::string, std::string> const & accessors, std::string & str, mapping_type const & mapping) : accessors_(accessors), str_(str), mapping_(mapping){ }
|
||||
|
||||
void call_before_expansion(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
if ((root_node.op.type_family==viennacl::scheduler::OPERATION_UNARY_TYPE_FAMILY || utils::elementwise_function(root_node.op))
|
||||
&& !utils::node_leaf(root_node.op))
|
||||
str_+=tree_parsing::evaluate(root_node.op.type);
|
||||
str_+="(";
|
||||
|
||||
}
|
||||
|
||||
void call_after_expansion(viennacl::scheduler::statement const & /*statement*/, atidlas_int_t /*root_idx*/) const
|
||||
{
|
||||
str_+=")";
|
||||
}
|
||||
|
||||
void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
mapping_type::key_type key = std::make_pair(root_idx, leaf);
|
||||
if (leaf==PARENT_NODE_TYPE)
|
||||
{
|
||||
if (utils::node_leaf(root_node.op))
|
||||
str_ += mapping_.at(key)->evaluate(accessors_);
|
||||
else if (utils::elementwise_operator(root_node.op))
|
||||
str_ += tree_parsing::evaluate(root_node.op.type);
|
||||
else if (root_node.op.type_family!=viennacl::scheduler::OPERATION_UNARY_TYPE_FAMILY && utils::elementwise_function(root_node.op))
|
||||
str_ += ",";
|
||||
}
|
||||
else
|
||||
{
|
||||
if (leaf==LHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.lhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
str_ += mapping_.at(key)->evaluate(accessors_);
|
||||
}
|
||||
|
||||
if (leaf==RHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.rhs.type_family!=viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
str_ += mapping_.at(key)->evaluate(accessors_);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline std::string evaluate(leaf_t leaf, std::map<std::string, std::string> const & accessors,
|
||||
viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, mapping_type const & mapping)
|
||||
{
|
||||
std::string res;
|
||||
evaluate_expression_traversal traversal_functor(accessors, res, mapping);
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
|
||||
if (leaf==RHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, false);
|
||||
else
|
||||
traversal_functor(statement, root_idx, leaf);
|
||||
}
|
||||
else if (leaf==LHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, false);
|
||||
else
|
||||
traversal_functor(statement, root_idx, leaf);
|
||||
}
|
||||
else
|
||||
tree_parsing::traverse(statement, root_idx, traversal_functor, false);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
inline void evaluate(utils::kernel_generation_stream & stream, leaf_t leaf, std::map<std::string, std::string> const & accessors,
|
||||
statements_container const & statements, std::vector<mapping_type> const & mappings)
|
||||
{
|
||||
statements_container::data_type::const_iterator sit;
|
||||
std::vector<mapping_type>::const_iterator mit;
|
||||
|
||||
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
|
||||
stream << evaluate(leaf, accessors, *sit, sit->root(), *mit) << ";" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
/** @brief functor for fetching or writing-back the elements in a statement */
|
||||
class process_traversal : public tree_parsing::traversal_functor
|
||||
{
|
||||
public:
|
||||
process_traversal(std::multimap<std::string, std::string> const & accessors, utils::kernel_generation_stream & stream,
|
||||
mapping_type const & mapping, std::set<std::string> & already_processed) : accessors_(accessors), stream_(stream), mapping_(mapping), already_processed_(already_processed){ }
|
||||
|
||||
void operator()(viennacl::scheduler::statement const & /*statement*/, atidlas_int_t root_idx, leaf_t leaf) const
|
||||
{
|
||||
mapping_type::const_iterator it = mapping_.find(std::make_pair(root_idx, leaf));
|
||||
if (it!=mapping_.end())
|
||||
{
|
||||
mapped_object * obj = it->second.get();
|
||||
std::string key = obj->type_key();
|
||||
if(already_processed_.insert(obj->process("#name")).second && accessors_.find(obj->type_key())!=accessors_.end())
|
||||
for(std::multimap<std::string, std::string>::const_iterator it = accessors_.lower_bound(key) ; it != accessors_.upper_bound(key) ; ++it)
|
||||
stream_ << obj->process(it->second) << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
std::multimap<std::string, std::string> accessors_;
|
||||
utils::kernel_generation_stream & stream_;
|
||||
mapping_type const & mapping_;
|
||||
std::set<std::string> & already_processed_;
|
||||
};
|
||||
|
||||
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
|
||||
viennacl::scheduler::statement const & statement, size_t root_idx, mapping_type const & mapping, std::set<std::string> & already_processed)
|
||||
{
|
||||
process_traversal traversal_functor(accessors, stream, mapping, already_processed);
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
|
||||
if (leaf==RHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.rhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
tree_parsing::traverse(statement, root_node.rhs.node_index, traversal_functor, true);
|
||||
else
|
||||
traversal_functor(statement, root_idx, leaf);
|
||||
}
|
||||
else if (leaf==LHS_NODE_TYPE)
|
||||
{
|
||||
if (root_node.lhs.type_family==viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
tree_parsing::traverse(statement, root_node.lhs.node_index, traversal_functor, true);
|
||||
else
|
||||
traversal_functor(statement, root_idx, leaf);
|
||||
}
|
||||
else
|
||||
{
|
||||
tree_parsing::traverse(statement, root_idx, traversal_functor, true);
|
||||
}
|
||||
}
|
||||
|
||||
inline void process(utils::kernel_generation_stream & stream, leaf_t leaf, std::multimap<std::string, std::string> const & accessors,
|
||||
statements_container const & statements, std::vector<mapping_type> const & mappings)
|
||||
{
|
||||
statements_container::data_type::const_iterator sit;
|
||||
std::vector<mapping_type>::const_iterator mit;
|
||||
std::set<std::string> already_processed;
|
||||
|
||||
for (mit = mappings.begin(), sit = statements.data().begin(); sit != statements.data().end(); ++mit, ++sit)
|
||||
process(stream, leaf, accessors, *sit, sit->root(), *mit, already_processed);
|
||||
}
|
||||
|
||||
|
||||
class statement_representation_functor : public traversal_functor{
|
||||
private:
|
||||
static void append_id(char * & ptr, unsigned int val)
|
||||
{
|
||||
if (val==0)
|
||||
*ptr++='0';
|
||||
else
|
||||
while (val>0)
|
||||
{
|
||||
*ptr++= (char)('0' + (val % 10));
|
||||
val /= 10;
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
typedef void result_type;
|
||||
|
||||
statement_representation_functor(symbolic_binder & binder, char *& ptr) : binder_(binder), ptr_(ptr){ }
|
||||
|
||||
template<class NumericT>
|
||||
inline result_type operator()(NumericT const & /*scal*/) const
|
||||
{
|
||||
*ptr_++='h'; //host
|
||||
*ptr_++='s'; //scalar
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
}
|
||||
|
||||
/** @brief Scalar mapping */
|
||||
template<class NumericT>
|
||||
inline result_type operator()(viennacl::scalar<NumericT> const & scal) const
|
||||
{
|
||||
*ptr_++='s'; //scalar
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
append_id(ptr_, binder_.get(&viennacl::traits::handle(scal)));
|
||||
}
|
||||
|
||||
/** @brief Vector mapping */
|
||||
template<class NumericT>
|
||||
inline result_type operator()(viennacl::vector_base<NumericT> const & vec) const
|
||||
{
|
||||
*ptr_++='v'; //vector
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
append_id(ptr_, binder_.get(&viennacl::traits::handle(vec)));
|
||||
}
|
||||
|
||||
/** @brief Implicit vector mapping */
|
||||
template<class NumericT>
|
||||
inline result_type operator()(viennacl::implicit_vector_base<NumericT> const & /*vec*/) const
|
||||
{
|
||||
*ptr_++='i'; //implicit
|
||||
*ptr_++='v'; //vector
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
}
|
||||
|
||||
/** @brief Matrix mapping */
|
||||
template<class NumericT>
|
||||
inline result_type operator()(viennacl::matrix_base<NumericT> const & mat) const
|
||||
{
|
||||
*ptr_++='m'; //Matrix
|
||||
*ptr_++=mat.row_major()?'r':'c';
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
append_id(ptr_, binder_.get(&viennacl::traits::handle(mat)));
|
||||
}
|
||||
|
||||
/** @brief Implicit matrix mapping */
|
||||
template<class NumericT>
|
||||
inline result_type operator()(viennacl::implicit_matrix_base<NumericT> const & /*mat*/) const
|
||||
{
|
||||
*ptr_++='i'; //implicit
|
||||
*ptr_++='m'; //matrix
|
||||
*ptr_++=utils::first_letter_of_type<NumericT>::value();
|
||||
}
|
||||
|
||||
static inline void append(char*& p, const char * str)
|
||||
{
|
||||
std::size_t n = std::strlen(str);
|
||||
std::memcpy(p, str, n);
|
||||
p+=n;
|
||||
}
|
||||
|
||||
inline void operator()(viennacl::scheduler::statement const & statement, atidlas_int_t root_idx, leaf_t leaf_t) const
|
||||
{
|
||||
viennacl::scheduler::statement_node const & root_node = statement.array()[root_idx];
|
||||
if (leaf_t==LHS_NODE_TYPE && root_node.lhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
utils::call_on_element(root_node.lhs, *this);
|
||||
else if (root_node.op.type_family==viennacl::scheduler::OPERATION_BINARY_TYPE_FAMILY && leaf_t==RHS_NODE_TYPE && root_node.rhs.type_family != viennacl::scheduler::COMPOSITE_OPERATION_FAMILY)
|
||||
utils::call_on_element(root_node.rhs, *this);
|
||||
else if (leaf_t==PARENT_NODE_TYPE)
|
||||
append_id(ptr_,root_node.op.type);
|
||||
}
|
||||
|
||||
private:
|
||||
symbolic_binder & binder_;
|
||||
char *& ptr_;
|
||||
};
|
||||
|
||||
inline std::string statements_representation(statements_container const & statements, binding_policy_t binding_policy)
|
||||
{
|
||||
std::vector<char> program_name_vector(256);
|
||||
char* program_name = program_name_vector.data();
|
||||
if (statements.order()==statements_container::INDEPENDENT)
|
||||
*program_name++='i';
|
||||
else
|
||||
*program_name++='s';
|
||||
tools::shared_ptr<symbolic_binder> binder = make_binder(binding_policy);
|
||||
for (statements_container::data_type::const_iterator it = statements.data().begin(); it != statements.data().end(); ++it)
|
||||
tree_parsing::traverse(*it, it->root(), tree_parsing::statement_representation_functor(*binder, program_name),true);
|
||||
*program_name='\0';
|
||||
return std::string(program_name_vector.data());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
572
atidlas/utils.hpp
Normal file
572
atidlas/utils.hpp
Normal file
@@ -0,0 +1,572 @@
|
||||
#ifndef ATIDLAS_UTILS_HPP
|
||||
#define ATIDLAS_UTILS_HPP
|
||||
|
||||
|
||||
#include <sstream>
|
||||
|
||||
#include "viennacl/matrix_def.hpp"
|
||||
#include "viennacl/vector_def.hpp"
|
||||
|
||||
#include "viennacl/ocl/forwards.h"
|
||||
|
||||
#include "viennacl/scheduler/forwards.h"
|
||||
|
||||
#include "viennacl/traits/size.hpp"
|
||||
#include "viennacl/traits/handle.hpp"
|
||||
#include "viennacl/traits/row_major.hpp"
|
||||
|
||||
#include "atidlas/tools/to_string.hpp"
|
||||
#include "atidlas/forwards.h"
|
||||
|
||||
namespace atidlas
|
||||
{
|
||||
namespace utils
|
||||
{
|
||||
|
||||
//CUDA Conversion
|
||||
inline std::string opencl_source_to_cuda_source(std::string const & opencl_src)
|
||||
{
|
||||
std::string res = opencl_src;
|
||||
|
||||
viennacl::tools::find_and_replace(res,"__attribute__","//__attribute__");
|
||||
|
||||
//Pointer
|
||||
viennacl::tools::find_and_replace(res, "__global float*", "float*");
|
||||
viennacl::tools::find_and_replace(res, "__local float*", "float*");
|
||||
|
||||
viennacl::tools::find_and_replace(res, "__global double*", "double*");
|
||||
viennacl::tools::find_and_replace(res, "__local double*", "double*");
|
||||
|
||||
//Qualifiers
|
||||
viennacl::tools::find_and_replace(res,"__global","__device__");
|
||||
viennacl::tools::find_and_replace(res,"__kernel","__global__");
|
||||
viennacl::tools::find_and_replace(res,"__constant","__constant__");
|
||||
viennacl::tools::find_and_replace(res,"__local","__shared__");
|
||||
|
||||
//Indexing
|
||||
viennacl::tools::find_and_replace(res,"get_num_groups(0)","gridDim.x");
|
||||
viennacl::tools::find_and_replace(res,"get_num_groups(1)","gridDim.y");
|
||||
|
||||
viennacl::tools::find_and_replace(res,"get_local_size(0)","blockDim.x");
|
||||
viennacl::tools::find_and_replace(res,"get_local_size(1)","blockDim.y");
|
||||
|
||||
viennacl::tools::find_and_replace(res,"get_group_id(0)","blockIdx.x");
|
||||
viennacl::tools::find_and_replace(res,"get_group_id(1)","blockIdx.y");
|
||||
|
||||
viennacl::tools::find_and_replace(res,"get_local_id(0)","threadIdx.x");
|
||||
viennacl::tools::find_and_replace(res,"get_local_id(1)","threadIdx.y");
|
||||
|
||||
viennacl::tools::find_and_replace(res,"get_global_id(0)","(blockIdx.x*blockDim.x + threadIdx.x)");
|
||||
viennacl::tools::find_and_replace(res,"get_global_id(1)","(blockIdx.y*blockDim.y + threadIdx.y)");
|
||||
|
||||
//Synchronization
|
||||
viennacl::tools::find_and_replace(res,"barrier(CLK_LOCAL_MEM_FENCE)","__syncthreads()");
|
||||
viennacl::tools::find_and_replace(res,"barrier(CLK_GLOBAL_MEM_FENCE)","__syncthreads()");
|
||||
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
static std::string numeric_type_to_string(viennacl::scheduler::statement_node_numeric_type const & type){
|
||||
switch (type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return "char";
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return "unsigned char";
|
||||
//case viennacl::scheduler::SHORT_TYPE: return "short";
|
||||
//case viennacl::scheduler::USHORT_TYPE: return "unsigned short";
|
||||
case viennacl::scheduler::INT_TYPE: return "int";
|
||||
case viennacl::scheduler::UINT_TYPE: return "unsigned int";
|
||||
case viennacl::scheduler::LONG_TYPE: return "long";
|
||||
case viennacl::scheduler::ULONG_TYPE: return "unsigned long";
|
||||
case viennacl::scheduler::FLOAT_TYPE : return "float";
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return "double";
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_host_scalar(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.type_family == viennacl::scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a host scalar"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(element.host_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(element.host_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(element.host_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(element.host_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(element.host_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(element.host_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(element.host_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(element.host_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(element.host_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(element.host_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_scalar(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.type_family == viennacl::scheduler::SCALAR_TYPE_FAMILY && bool("Must be called on a scalar"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.scalar_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.scalar_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.scalar_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.scalar_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(*element.scalar_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(*element.scalar_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(*element.scalar_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(*element.scalar_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.scalar_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.scalar_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_vector(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.type_family == viennacl::scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a vector"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.vector_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.vector_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.vector_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.vector_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(*element.vector_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(*element.vector_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(*element.vector_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(*element.vector_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.vector_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.vector_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_implicit_vector(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.type_family == viennacl::scheduler::VECTOR_TYPE_FAMILY && bool("Must be called on a implicit_vector"));
|
||||
assert(element.subtype == viennacl::scheduler::IMPLICIT_VECTOR_TYPE && bool("Must be called on a implicit_vector"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.implicit_vector_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.implicit_vector_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.implicit_vector_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.implicit_vector_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(*element.implicit_vector_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(*element.implicit_vector_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(*element.implicit_vector_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(*element.implicit_vector_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.implicit_vector_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.implicit_vector_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_matrix(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.type_family == viennacl::scheduler::MATRIX_TYPE_FAMILY && bool("Must be called on a matrix"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.matrix_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.matrix_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.matrix_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.matrix_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(*element.matrix_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(*element.matrix_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(*element.matrix_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(*element.matrix_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.matrix_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.matrix_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_implicit_matrix(viennacl::scheduler::lhs_rhs_element element, Fun const & fun){
|
||||
assert(element.subtype == viennacl::scheduler::IMPLICIT_MATRIX_TYPE && bool("Must be called on a implicit matrix"));
|
||||
switch (element.numeric_type)
|
||||
{
|
||||
//case viennacl::scheduler::CHAR_TYPE: return fun(*element.implicit_matrix_char);
|
||||
//case viennacl::scheduler::UCHAR_TYPE: return fun(*element.implicit_matrix_uchar);
|
||||
//case viennacl::scheduler::SHORT_TYPE: return fun(*element.implicit_matrix_short);
|
||||
//case viennacl::scheduler::USHORT_TYPE: return fun(*element.implicit_matrix_ushort);
|
||||
case viennacl::scheduler::INT_TYPE: return fun(*element.implicit_matrix_int);
|
||||
case viennacl::scheduler::UINT_TYPE: return fun(*element.implicit_matrix_uint);
|
||||
case viennacl::scheduler::LONG_TYPE: return fun(*element.implicit_matrix_long);
|
||||
case viennacl::scheduler::ULONG_TYPE: return fun(*element.implicit_matrix_ulong);
|
||||
case viennacl::scheduler::FLOAT_TYPE : return fun(*element.implicit_matrix_float);
|
||||
case viennacl::scheduler::DOUBLE_TYPE : return fun(*element.implicit_matrix_double);
|
||||
default : throw generator_not_supported_exception("Unsupported Scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
template<class Fun>
|
||||
static typename Fun::result_type call_on_element(viennacl::scheduler::lhs_rhs_element const & element, Fun const & fun){
|
||||
switch (element.type_family)
|
||||
{
|
||||
case viennacl::scheduler::SCALAR_TYPE_FAMILY:
|
||||
if (element.subtype == viennacl::scheduler::HOST_SCALAR_TYPE)
|
||||
return call_on_host_scalar(element, fun);
|
||||
else
|
||||
return call_on_scalar(element, fun);
|
||||
case viennacl::scheduler::VECTOR_TYPE_FAMILY :
|
||||
if (element.subtype == viennacl::scheduler::IMPLICIT_VECTOR_TYPE)
|
||||
return call_on_implicit_vector(element, fun);
|
||||
else
|
||||
return call_on_vector(element, fun);
|
||||
case viennacl::scheduler::MATRIX_TYPE_FAMILY:
|
||||
if (element.subtype == viennacl::scheduler::IMPLICIT_MATRIX_TYPE)
|
||||
return call_on_implicit_matrix(element, fun);
|
||||
else
|
||||
return call_on_matrix(element,fun);
|
||||
default:
|
||||
throw generator_not_supported_exception("Unsupported datastructure type : Not among {Scalar, Vector, Matrix}");
|
||||
}
|
||||
}
|
||||
|
||||
struct scalartype_size_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
result_type operator()(float const &) const { return sizeof(float); }
|
||||
result_type operator()(double const &) const { return sizeof(double); }
|
||||
template<class T> result_type operator()(T const &) const { return sizeof(typename viennacl::result_of::cpu_value_type<T>::type); }
|
||||
};
|
||||
|
||||
struct internal_size_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::internal_size(t); }
|
||||
};
|
||||
|
||||
struct size_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::size(t); }
|
||||
};
|
||||
|
||||
struct start_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start(t); }
|
||||
};
|
||||
|
||||
|
||||
struct stride_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride(t); }
|
||||
};
|
||||
|
||||
struct start1_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start1(t); }
|
||||
};
|
||||
|
||||
struct start2_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::start2(t); }
|
||||
};
|
||||
|
||||
struct leading_stride_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::stride2(t):viennacl::traits::stride1(t); }
|
||||
};
|
||||
|
||||
struct leading_start_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::row_major(t)?viennacl::traits::start2(t):viennacl::traits::start1(t); }
|
||||
};
|
||||
|
||||
struct stride1_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride1(t); }
|
||||
};
|
||||
|
||||
struct stride2_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T> result_type operator()(T const &t) const { return viennacl::traits::stride2(t); }
|
||||
};
|
||||
|
||||
struct handle_fun
|
||||
{
|
||||
typedef cl_mem result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::opencl_handle(t); }
|
||||
};
|
||||
|
||||
struct internal_size1_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::internal_size1(t); }
|
||||
};
|
||||
|
||||
struct row_major_fun
|
||||
{
|
||||
typedef bool result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::row_major(t); }
|
||||
};
|
||||
|
||||
struct internal_size2_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::internal_size2(t); }
|
||||
};
|
||||
|
||||
struct size1_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::size1(t); }
|
||||
};
|
||||
|
||||
struct size2_fun
|
||||
{
|
||||
typedef atidlas_int_t result_type;
|
||||
template<class T>
|
||||
result_type operator()(T const &t) const { return viennacl::traits::size2(t); }
|
||||
};
|
||||
|
||||
template<class T, class U>
|
||||
struct is_same_type { enum { value = 0 }; };
|
||||
|
||||
template<class T>
|
||||
struct is_same_type<T,T> { enum { value = 1 }; };
|
||||
|
||||
inline bool is_reduction(viennacl::scheduler::statement_node const & node)
|
||||
{
|
||||
return node.op.type_family==viennacl::scheduler::OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
|
||||
|| node.op.type_family==viennacl::scheduler::OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY
|
||||
|| node.op.type_family==viennacl::scheduler::OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|
||||
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_INNER_PROD_TYPE
|
||||
|| node.op.type==viennacl::scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE;
|
||||
}
|
||||
|
||||
inline bool is_index_reduction(viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
return op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMAX_TYPE
|
||||
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMAX_TYPE
|
||||
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGFMIN_TYPE
|
||||
|| op.type==viennacl::scheduler::OPERATION_BINARY_ELEMENT_ARGMIN_TYPE;
|
||||
}
|
||||
template<class T>
|
||||
struct type_to_string;
|
||||
template<> struct type_to_string<unsigned char> { static const char * value() { return "unsigned char"; } };
|
||||
template<> struct type_to_string<char> { static const char * value() { return "char"; } };
|
||||
template<> struct type_to_string<unsigned short> { static const char * value() { return "unsigned short"; } };
|
||||
template<> struct type_to_string<short> { static const char * value() { return "short"; } };
|
||||
template<> struct type_to_string<unsigned int> { static const char * value() { return "unsigned int"; } };
|
||||
template<> struct type_to_string<int> { static const char * value() { return "int"; } };
|
||||
template<> struct type_to_string<unsigned long> { static const char * value() { return "unsigned long"; } };
|
||||
template<> struct type_to_string<long> { static const char * value() { return "long"; } };
|
||||
template<> struct type_to_string<float> { static const char * value() { return "float"; } };
|
||||
template<> struct type_to_string<double> { static const char * value() { return "double"; } };
|
||||
|
||||
|
||||
template<class T>
|
||||
struct first_letter_of_type;
|
||||
template<> struct first_letter_of_type<char> { static char value() { return 'c'; } };
|
||||
template<> struct first_letter_of_type<unsigned char> { static char value() { return 'd'; } };
|
||||
template<> struct first_letter_of_type<short> { static char value() { return 's'; } };
|
||||
template<> struct first_letter_of_type<unsigned short> { static char value() { return 't'; } };
|
||||
template<> struct first_letter_of_type<int> { static char value() { return 'i'; } };
|
||||
template<> struct first_letter_of_type<unsigned int> { static char value() { return 'j'; } };
|
||||
template<> struct first_letter_of_type<long> { static char value() { return 'l'; } };
|
||||
template<> struct first_letter_of_type<unsigned long> { static char value() { return 'm'; } };
|
||||
template<> struct first_letter_of_type<float> { static char value() { return 'f'; } };
|
||||
template<> struct first_letter_of_type<double> { static char value() { return 'd'; } };
|
||||
|
||||
class kernel_generation_stream : public std::ostream
|
||||
{
|
||||
class kgenstream : public std::stringbuf
|
||||
{
|
||||
public:
|
||||
kgenstream(std::ostringstream& oss,unsigned int const & tab_count) : oss_(oss), tab_count_(tab_count){ }
|
||||
int sync() {
|
||||
for (unsigned int i=0; i<tab_count_;++i)
|
||||
oss_ << " ";
|
||||
oss_ << str();
|
||||
str("");
|
||||
return !oss_;
|
||||
}
|
||||
~kgenstream() { pubsync(); }
|
||||
private:
|
||||
std::ostream& oss_;
|
||||
unsigned int const & tab_count_;
|
||||
};
|
||||
|
||||
public:
|
||||
kernel_generation_stream() : std::ostream(new kgenstream(oss,tab_count_)), tab_count_(0){ }
|
||||
~kernel_generation_stream(){ delete rdbuf(); }
|
||||
|
||||
std::string str(){ return oss.str(); }
|
||||
void inc_tab(){ ++tab_count_; }
|
||||
void dec_tab(){ --tab_count_; }
|
||||
private:
|
||||
unsigned int tab_count_;
|
||||
std::ostringstream oss;
|
||||
};
|
||||
|
||||
inline bool node_leaf(viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
return op.type==OPERATION_UNARY_NORM_1_TYPE
|
||||
|| op.type==OPERATION_UNARY_NORM_2_TYPE
|
||||
|| op.type==OPERATION_UNARY_NORM_INF_TYPE
|
||||
|| op.type==OPERATION_UNARY_TRANS_TYPE
|
||||
|| op.type==OPERATION_BINARY_MAT_VEC_PROD_TYPE
|
||||
|| op.type==OPERATION_BINARY_MAT_MAT_PROD_TYPE
|
||||
|| op.type==OPERATION_BINARY_INNER_PROD_TYPE
|
||||
|| op.type==OPERATION_BINARY_MATRIX_DIAG_TYPE
|
||||
|| op.type==OPERATION_BINARY_VECTOR_DIAG_TYPE
|
||||
|| op.type==OPERATION_BINARY_MATRIX_ROW_TYPE
|
||||
|| op.type==OPERATION_BINARY_MATRIX_COLUMN_TYPE
|
||||
|| op.type_family==OPERATION_VECTOR_REDUCTION_TYPE_FAMILY
|
||||
|| op.type_family==OPERATION_ROWS_REDUCTION_TYPE_FAMILY
|
||||
|| op.type_family==OPERATION_COLUMNS_REDUCTION_TYPE_FAMILY;
|
||||
}
|
||||
|
||||
inline bool elementwise_operator(viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
return op.type== OPERATION_BINARY_ASSIGN_TYPE
|
||||
|| op.type== OPERATION_BINARY_INPLACE_ADD_TYPE
|
||||
|| op.type== OPERATION_BINARY_INPLACE_SUB_TYPE
|
||||
|| op.type== OPERATION_BINARY_ADD_TYPE
|
||||
|| op.type== OPERATION_BINARY_SUB_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_PROD_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_DIV_TYPE
|
||||
|| op.type== OPERATION_BINARY_MULT_TYPE
|
||||
|| op.type== OPERATION_BINARY_DIV_TYPE;
|
||||
}
|
||||
|
||||
inline bool elementwise_function(viennacl::scheduler::op_element const & op)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
return
|
||||
|
||||
op.type == OPERATION_UNARY_CAST_CHAR_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_UCHAR_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_SHORT_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_USHORT_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_INT_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_UINT_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_LONG_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_ULONG_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_HALF_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_FLOAT_TYPE
|
||||
|| op.type == OPERATION_UNARY_CAST_DOUBLE_TYPE
|
||||
|
||||
|| op.type== OPERATION_UNARY_ABS_TYPE
|
||||
|| op.type== OPERATION_UNARY_ACOS_TYPE
|
||||
|| op.type== OPERATION_UNARY_ASIN_TYPE
|
||||
|| op.type== OPERATION_UNARY_ATAN_TYPE
|
||||
|| op.type== OPERATION_UNARY_CEIL_TYPE
|
||||
|| op.type== OPERATION_UNARY_COS_TYPE
|
||||
|| op.type== OPERATION_UNARY_COSH_TYPE
|
||||
|| op.type== OPERATION_UNARY_EXP_TYPE
|
||||
|| op.type== OPERATION_UNARY_FABS_TYPE
|
||||
|| op.type== OPERATION_UNARY_FLOOR_TYPE
|
||||
|| op.type== OPERATION_UNARY_LOG_TYPE
|
||||
|| op.type== OPERATION_UNARY_LOG10_TYPE
|
||||
|| op.type== OPERATION_UNARY_SIN_TYPE
|
||||
|| op.type== OPERATION_UNARY_SINH_TYPE
|
||||
|| op.type== OPERATION_UNARY_SQRT_TYPE
|
||||
|| op.type== OPERATION_UNARY_TAN_TYPE
|
||||
|| op.type== OPERATION_UNARY_TANH_TYPE
|
||||
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_POW_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_EQ_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_NEQ_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_GREATER_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_LESS_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_GEQ_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_LEQ_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_FMAX_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_FMIN_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_MAX_TYPE
|
||||
|| op.type== OPERATION_BINARY_ELEMENT_MIN_TYPE;
|
||||
|
||||
}
|
||||
|
||||
inline viennacl::scheduler::lhs_rhs_element & lhs_rhs_element(viennacl::scheduler::statement const & st, atidlas_int_t idx, leaf_t leaf)
|
||||
{
|
||||
using namespace tree_parsing;
|
||||
assert(leaf==LHS_NODE_TYPE || leaf==RHS_NODE_TYPE);
|
||||
if (leaf==LHS_NODE_TYPE)
|
||||
return const_cast<viennacl::scheduler::lhs_rhs_element &>(st.array()[idx].lhs);
|
||||
return const_cast<viennacl::scheduler::lhs_rhs_element &>(st.array()[idx].rhs);
|
||||
}
|
||||
|
||||
inline unsigned int size_of(viennacl::scheduler::statement_node_numeric_type type)
|
||||
{
|
||||
using namespace viennacl::scheduler;
|
||||
switch (type)
|
||||
{
|
||||
case UCHAR_TYPE:
|
||||
case CHAR_TYPE: return 1;
|
||||
|
||||
case USHORT_TYPE:
|
||||
case SHORT_TYPE:
|
||||
case HALF_TYPE: return 2;
|
||||
|
||||
case UINT_TYPE:
|
||||
case INT_TYPE:
|
||||
case FLOAT_TYPE: return 4;
|
||||
|
||||
case ULONG_TYPE:
|
||||
case LONG_TYPE:
|
||||
case DOUBLE_TYPE: return 8;
|
||||
|
||||
default: throw generator_not_supported_exception("Unsupported scalartype");
|
||||
}
|
||||
}
|
||||
|
||||
inline std::string append_width(std::string const & str, unsigned int width)
|
||||
{
|
||||
if (width==1)
|
||||
return str;
|
||||
return str + tools::to_string(width);
|
||||
}
|
||||
|
||||
template<typename MapT>
|
||||
class create_map
|
||||
{
|
||||
typedef typename MapT::key_type T;
|
||||
typedef typename MapT::mapped_type U;
|
||||
public:
|
||||
create_map(const T& key, const U& val)
|
||||
{
|
||||
map_.insert(std::make_pair(key,val));
|
||||
}
|
||||
|
||||
create_map<MapT>& operator()(const T& key, const U& val)
|
||||
{
|
||||
map_.insert(std::make_pair(key,val));
|
||||
return *this;
|
||||
}
|
||||
|
||||
operator MapT()
|
||||
{
|
||||
return map_;
|
||||
}
|
||||
private:
|
||||
MapT map_;
|
||||
};
|
||||
|
||||
typedef create_map<std::multimap<std::string, std::string> > create_process_accessors;
|
||||
typedef create_map<std::map<std::string, std::string> > create_evaluate_accessors;
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
#endif
|
88
cmake_modules/FindOpenCL.cmake
Normal file
88
cmake_modules/FindOpenCL.cmake
Normal file
@@ -0,0 +1,88 @@
|
||||
# - Find the OpenCL headers and library
|
||||
#
|
||||
# Defines the following if found:
|
||||
# OPENCL_FOUND : TRUE if found, FALSE otherwise
|
||||
# OPENCL_INCLUDE_DIRS : Include directories for OpenCL
|
||||
# OPENCL_LIBRARIES : The libraries to link against
|
||||
#
|
||||
# The user can set the OPENCLROOT environment variable to help finding OpenCL
|
||||
# if it is installed in a non-standard place.
|
||||
|
||||
set(ENV_ATISTREAMSDKROOT $ENV{ATISTREAMSDKROOT})
|
||||
if(ENV_ATISTREAMSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{ATISTREAMSDKROOT})
|
||||
endif(ENV_ATISTREAMSDKROOT)
|
||||
|
||||
set(ENV_AMDAPPSDKROOT $ENV{AMDAPPSDKROOT})
|
||||
if(ENV_AMDAPPSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{AMDAPPSDKROOT})
|
||||
endif(ENV_AMDAPPSDKROOT)
|
||||
|
||||
set(ENV_INTELOCLSDKROOT $ENV{INTELOCLSDKROOT})
|
||||
if(ENV_INTELOCLSDKROOT)
|
||||
set(ENV_OPENCLROOT $ENV{INTELOCLSDKROOT})
|
||||
endif(ENV_INTELOCLSDKROOT)
|
||||
|
||||
set(ENV_OPENCLROOT2 $ENV{OPENCLROOT})
|
||||
if(ENV_OPENCLROOT2)
|
||||
set(ENV_OPENCLROOT $ENV{OPENCLROOT})
|
||||
endif(ENV_OPENCLROOT2)
|
||||
|
||||
if(ENV_OPENCLROOT)
|
||||
find_path(
|
||||
OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h
|
||||
PATHS ${ENV_OPENCLROOT}/include
|
||||
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
|
||||
)
|
||||
|
||||
if (("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
|
||||
if(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH
|
||||
${OPENCL_LIB_SEARCH_PATH}
|
||||
${ENV_OPENCLROOT}/lib/x86)
|
||||
else(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
set(OPENCL_LIB_SEARCH_PATH
|
||||
${OPENCL_LIB_SEARCH_PATH}
|
||||
${ENV_OPENCLROOT}/lib/x86_64)
|
||||
endif(CMAKE_SIZEOF_VOID_P EQUAL 4)
|
||||
endif(("${CMAKE_SYSTEM_NAME}" MATCHES "Linux") OR (${CMAKE_SYSTEM_NAME} MATCHES "Windows"))
|
||||
find_library(
|
||||
OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
PATHS ${OPENCL_LIB_SEARCH_PATH}
|
||||
#NO_DEFAULT_PATH #uncomment this is you wish to surpress the use of default paths for OpenCL
|
||||
)
|
||||
else(ENV_OPENCLROOT)
|
||||
find_path(
|
||||
OPENCL_INCLUDE_DIR
|
||||
NAMES CL/cl.h OpenCL/cl.h
|
||||
PATHS ${PROJECT_SOURCE_DIR} #use the CL/ include folder provided with ViennaCL
|
||||
)
|
||||
|
||||
find_library(
|
||||
OPENCL_LIBRARY
|
||||
NAMES OpenCL
|
||||
)
|
||||
endif(ENV_OPENCLROOT)
|
||||
|
||||
include(FindPackageHandleStandardArgs)
|
||||
find_package_handle_standard_args(
|
||||
OPENCL
|
||||
DEFAULT_MSG
|
||||
OPENCL_LIBRARY OPENCL_INCLUDE_DIR
|
||||
)
|
||||
|
||||
if(OPENCL_FOUND)
|
||||
set(OPENCL_INCLUDE_DIRS ${OPENCL_INCLUDE_DIR})
|
||||
set(OPENCL_LIBRARIES ${OPENCL_LIBRARY})
|
||||
else(OPENCL_FOUND)
|
||||
set(OPENCL_INCLUDE_DIRS)
|
||||
set(OPENCL_LIBRARIES)
|
||||
endif(OPENCL_FOUND)
|
||||
|
||||
mark_as_advanced(
|
||||
OPENCL_INCLUDE_DIR
|
||||
OPENCL_LIBRARY
|
||||
)
|
||||
|
7
cmake_modules/FindViennaCL.cmake
Normal file
7
cmake_modules/FindViennaCL.cmake
Normal file
@@ -0,0 +1,7 @@
|
||||
find_path(
|
||||
VIENNACL_INCLUDE_DIR
|
||||
NAMES viennacl/vector.hpp
|
||||
)
|
||||
|
||||
set(VIENNACL_INCLUDE_DIRS ${VIENNACL_INCLUDE_DIR})
|
||||
mark_as_advanced(VIENNACL_INCLUDE_DIRS)
|
6
tests/CMakeLists.txt
Normal file
6
tests/CMakeLists.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
foreach(PROG blas1 blas3)
|
||||
add_executable(${PROG}-test ${PROG}.cpp)
|
||||
add_test(${PROG} ${PROG}-test)
|
||||
target_link_libraries(${PROG}-test ${OPENCL_LIBRARIES})
|
||||
set_target_properties(${PROG}-test PROPERTIES COMPILE_FLAGS "-DVIENNACL_WITH_OPENCL -Wall -Wextra")
|
||||
endforeach(PROG)
|
147
tests/blas1.cpp
Normal file
147
tests/blas1.cpp
Normal file
@@ -0,0 +1,147 @@
|
||||
|
||||
#include "common.hpp"
|
||||
#include "viennacl/vector.hpp"
|
||||
|
||||
#include "atidlas/templates/vector_axpy_template.hpp"
|
||||
#include "atidlas/execute.hpp"
|
||||
|
||||
template<typename NumericT, class XType, class YType, class ZType>
|
||||
int test_vectors(NumericT epsilon, atidlas::vector_axpy_parameters const & vector_axpy_parameters,
|
||||
XType & cx, YType & cy, ZType & cz)
|
||||
{
|
||||
int failure_count = 0;
|
||||
ZType buffer = cz;
|
||||
|
||||
NumericT a = 3.12, b = 3.5;
|
||||
viennacl::scalar<NumericT> da(a), db(b);
|
||||
|
||||
viennacl::vector<NumericT> xtmp(cx.internal_size());
|
||||
viennacl::vector<NumericT> ytmp(cy.internal_size());
|
||||
viennacl::vector<NumericT> ztmp(cz.internal_size());
|
||||
|
||||
typename vector_maker<XType>::result_type x = vector_maker<XType>::make(xtmp, cx);
|
||||
typename vector_maker<YType>::result_type y = vector_maker<YType>::make(ytmp, cy);
|
||||
typename vector_maker<ZType>::result_type z = vector_maker<ZType>::make(ztmp, cz);
|
||||
|
||||
|
||||
#define RUN_TEST_VECTOR_AXPY(NAME, CPU_LOOP, GPU_STATEMENT) \
|
||||
std::cout << NAME "..." << std::flush;\
|
||||
for(int_t i = 0 ; i < cz.size() ; ++i)\
|
||||
CPU_LOOP;\
|
||||
atidlas::execute(atidlas::vector_axpy_template(vector_axpy_parameters),\
|
||||
GPU_STATEMENT,\
|
||||
viennacl::ocl::current_context(), true);\
|
||||
viennacl::copy(z, buffer);\
|
||||
if(failure_vector(cz, buffer, epsilon))\
|
||||
{\
|
||||
failure_count++;\
|
||||
std::cout << " [Failure!]" << std::endl;\
|
||||
}\
|
||||
else\
|
||||
std::cout << std::endl;
|
||||
|
||||
RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x))
|
||||
RUN_TEST_VECTOR_AXPY("z = x + y", cz[i] = cx[i] + cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x + y))
|
||||
RUN_TEST_VECTOR_AXPY("z = x - y", cz[i] = cx[i] - cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x - y))
|
||||
RUN_TEST_VECTOR_AXPY("z = x + y + z", cz[i] = cx[i] + cy[i] + cz[i], viennacl::scheduler::statement(z, viennacl::op_assign(), x + y + z))
|
||||
|
||||
RUN_TEST_VECTOR_AXPY("z = a*x", cz[i] = a*cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x))
|
||||
RUN_TEST_VECTOR_AXPY("z = da*x", cz[i] = a*cx[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x))
|
||||
RUN_TEST_VECTOR_AXPY("z = a*x + b*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x + b*y))
|
||||
RUN_TEST_VECTOR_AXPY("z = da*x + b*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x + b*y))
|
||||
RUN_TEST_VECTOR_AXPY("z = a*x + db*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), a*x + db*y))
|
||||
RUN_TEST_VECTOR_AXPY("z = da*x + db*y", cz[i] = a*cx[i] + b*cy[i], viennacl::scheduler::statement(z, viennacl::op_assign(), da*x + db*y))
|
||||
#undef RUN_TEST_VECTOR_AXPY
|
||||
|
||||
return failure_count;
|
||||
}
|
||||
|
||||
template<typename NumericT>
|
||||
int test_impl(NumericT epsilon)
|
||||
{
|
||||
int_t N = 24378;
|
||||
int x_start = 4, y_start = 7, z_start = 15;
|
||||
int x_stride = 5, y_stride = 8, z_stride = 12;
|
||||
viennacl::range xr(x_start, N + x_start), yr(y_start, N + y_start), zr(z_start, N + z_start);
|
||||
viennacl::slice xs(x_start, x_stride, N), ys(y_start, y_stride, N), zs(z_start, z_stride, N);
|
||||
|
||||
simple_vector<NumericT> x_vector(N), y_vector(N), z_vector(N);
|
||||
init_rand(x_vector);
|
||||
init_rand(y_vector);
|
||||
init_rand(z_vector);
|
||||
|
||||
simple_vector<NumericT> x_range_holder(N + x_start);
|
||||
simple_vector<NumericT> x_slice_holder(x_start + N*x_stride);
|
||||
init_rand(x_range_holder);
|
||||
init_rand(x_slice_holder);
|
||||
simple_vector_range< simple_vector<NumericT> > x_range(x_range_holder, xr);
|
||||
simple_vector_slice< simple_vector<NumericT> > x_slice(x_slice_holder, xs);
|
||||
|
||||
simple_vector<NumericT> y_range_holder(N + y_start);
|
||||
simple_vector<NumericT> y_slice_holder(y_start + N*y_stride);
|
||||
init_rand(y_range_holder);
|
||||
init_rand(y_slice_holder);
|
||||
simple_vector_range< simple_vector<NumericT> > y_range(y_range_holder, yr);
|
||||
simple_vector_slice< simple_vector<NumericT> > y_slice(y_slice_holder, ys);
|
||||
|
||||
simple_vector<NumericT> z_range_holder(N + z_start);
|
||||
simple_vector<NumericT> z_slice_holder(z_start + N*z_stride);
|
||||
init_rand(z_range_holder);
|
||||
init_rand(z_slice_holder);
|
||||
simple_vector_range< simple_vector<NumericT> > z_range(z_range_holder, zr);
|
||||
simple_vector_slice< simple_vector<NumericT> > z_slice(z_slice_holder, zs);
|
||||
|
||||
int_t failure_count = 0;
|
||||
|
||||
atidlas::vector_axpy_parameters vector_axpy_parameters(4, 32, 128, atidlas::FETCH_FROM_GLOBAL_CONTIGUOUS);
|
||||
|
||||
|
||||
#define TEST_OPERATIONS(XTYPE, YTYPE, ZTYPE)\
|
||||
std::cout << "> x : " #XTYPE " | y : " #YTYPE " | z : " #ZTYPE << std::endl;\
|
||||
failure_count += test_vectors(epsilon, vector_axpy_parameters, x_ ## XTYPE, y_ ## YTYPE, z_ ## ZTYPE);\
|
||||
|
||||
TEST_OPERATIONS(vector, vector, vector)
|
||||
TEST_OPERATIONS(vector, vector, range)
|
||||
TEST_OPERATIONS(vector, vector, slice)
|
||||
TEST_OPERATIONS(vector, range, vector)
|
||||
TEST_OPERATIONS(vector, range, range)
|
||||
TEST_OPERATIONS(vector, range, slice)
|
||||
TEST_OPERATIONS(vector, slice, vector)
|
||||
TEST_OPERATIONS(vector, slice, range)
|
||||
TEST_OPERATIONS(vector, slice, slice)
|
||||
|
||||
TEST_OPERATIONS(range, vector, vector)
|
||||
TEST_OPERATIONS(range, vector, range)
|
||||
TEST_OPERATIONS(range, vector, slice)
|
||||
TEST_OPERATIONS(range, range, vector)
|
||||
TEST_OPERATIONS(range, range, range)
|
||||
TEST_OPERATIONS(range, range, slice)
|
||||
TEST_OPERATIONS(range, slice, vector)
|
||||
TEST_OPERATIONS(range, slice, range)
|
||||
TEST_OPERATIONS(range, slice, slice)
|
||||
|
||||
TEST_OPERATIONS(slice, vector, vector)
|
||||
TEST_OPERATIONS(slice, vector, range)
|
||||
TEST_OPERATIONS(slice, vector, slice)
|
||||
TEST_OPERATIONS(slice, range, vector)
|
||||
TEST_OPERATIONS(slice, range, range)
|
||||
TEST_OPERATIONS(slice, range, slice)
|
||||
TEST_OPERATIONS(slice, slice, vector)
|
||||
TEST_OPERATIONS(slice, slice, range)
|
||||
TEST_OPERATIONS(slice, slice, slice)
|
||||
|
||||
return failure_count;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n_failures = 0;
|
||||
std::cout << ">> float" << std::endl;
|
||||
n_failures += test_impl<float>(1e-5);
|
||||
std::cout << ">> double" << std::endl;
|
||||
n_failures += test_impl<double>(1e-9);
|
||||
|
||||
if(n_failures>0)
|
||||
return EXIT_FAILURE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
223
tests/blas2.cpp
Normal file
223
tests/blas2.cpp
Normal file
@@ -0,0 +1,223 @@
|
||||
#include <cstddef>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "viennacl/matrix.hpp"
|
||||
#include "viennacl/matrix_proxy.hpp"
|
||||
#include "viennacl/linalg/prod.hpp"
|
||||
|
||||
|
||||
template<typename T, typename CType, typename AType, typename BType>
|
||||
int test_layout(CType & C, AType const & A, AType const & AT, BType const & B, BType const & BT,
|
||||
simple_matrix<T> const & ground, T epsilon)
|
||||
{
|
||||
using viennacl::linalg::prod;
|
||||
using viennacl::trans;
|
||||
int failures_count = 0;
|
||||
|
||||
simple_matrix<T> tmp(C.size1(), C.size2());
|
||||
|
||||
#define TEST_OP(NAME, OPERATION)\
|
||||
std::cout << NAME " ..." << std::flush;\
|
||||
OPERATION;\
|
||||
viennacl::copy(C, tmp);\
|
||||
if (failure(ground, tmp, epsilon))\
|
||||
{\
|
||||
std::cout << " [Failure!]" << std::endl;\
|
||||
failures_count++;\
|
||||
}\
|
||||
else\
|
||||
std::cout << std::endl;
|
||||
|
||||
TEST_OP("C = A.B", C = prod(A, B));
|
||||
TEST_OP("C = A'.B", C = prod(trans(AT), B));
|
||||
TEST_OP("C = A.B'", C = prod(A, trans(BT)));
|
||||
TEST_OP("C = A'.B'", C = prod(trans(AT), trans(BT)));
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
template<typename T, typename RefAType, typename RefBType, typename RefCType>
|
||||
int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & cA, RefAType & cAT, int BK, int BN, RefBType & cB, RefBType & cBT, T epsilon)
|
||||
{
|
||||
|
||||
viennacl::matrix<T, viennacl::row_major> ArowTmp(AM, AK);
|
||||
viennacl::matrix<T, viennacl::row_major> ATrowTmp(AK, AM);
|
||||
viennacl::matrix<T, viennacl::row_major> BrowTmp(BK, BN);
|
||||
viennacl::matrix<T, viennacl::row_major> BTrowTmp(BN, BK);
|
||||
viennacl::matrix<T, viennacl::row_major> CrowTmp(CM, CN);
|
||||
|
||||
viennacl::matrix<T, viennacl::column_major> AcolTmp(AM, AK);
|
||||
viennacl::matrix<T, viennacl::column_major> ATcolTmp(AK, AM);
|
||||
viennacl::matrix<T, viennacl::column_major> BcolTmp(BK, BN);
|
||||
viennacl::matrix<T, viennacl::column_major> BTcolTmp(BN, BK);
|
||||
viennacl::matrix<T, viennacl::column_major> CcolTmp(CM, CN);
|
||||
|
||||
|
||||
typename matrix_maker<RefCType, viennacl::row_major>::result_type Crow = matrix_maker<RefCType, viennacl::row_major>::make(CrowTmp, cC);
|
||||
typename matrix_maker<RefAType, viennacl::row_major>::result_type Arow = matrix_maker<RefAType, viennacl::row_major>::make(ArowTmp, cA);
|
||||
typename matrix_maker<RefAType, viennacl::row_major>::result_type ATrow = matrix_maker<RefAType, viennacl::row_major>::make(ATrowTmp, cAT);
|
||||
typename matrix_maker<RefBType, viennacl::row_major>::result_type Brow = matrix_maker<RefBType, viennacl::row_major>::make(BrowTmp, cB);
|
||||
typename matrix_maker<RefBType, viennacl::row_major>::result_type BTrow = matrix_maker<RefBType, viennacl::row_major>::make(BTrowTmp, cBT);
|
||||
|
||||
typename matrix_maker<RefCType, viennacl::column_major>::result_type Ccol = matrix_maker<RefCType, viennacl::column_major>::make(CcolTmp, cC);
|
||||
typename matrix_maker<RefAType, viennacl::column_major>::result_type Acol = matrix_maker<RefAType, viennacl::column_major>::make(AcolTmp, cA);
|
||||
typename matrix_maker<RefAType, viennacl::column_major>::result_type ATcol = matrix_maker<RefAType, viennacl::column_major>::make(ATcolTmp, cAT);
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
|
||||
|
||||
|
||||
simple_matrix<T> ground = simple_prod<T>(cA, cB);
|
||||
|
||||
int failures_count = 0;
|
||||
|
||||
#define TEST_LAYOUT(Clayout, Alayout, Blayout) \
|
||||
std::cout << "> " #Clayout " = " #Alayout "." #Blayout << std::endl; \
|
||||
failures_count += test_layout(C ## Clayout, A ## Alayout, AT ## Alayout, B ## Blayout, BT ## Blayout, ground, epsilon);
|
||||
|
||||
TEST_LAYOUT(row, row, row);
|
||||
TEST_LAYOUT(row, row, col);
|
||||
TEST_LAYOUT(row, col, row);
|
||||
TEST_LAYOUT(row, col, col);
|
||||
TEST_LAYOUT(col, row, row);
|
||||
TEST_LAYOUT(col, row, col);
|
||||
TEST_LAYOUT(col, col, row);
|
||||
TEST_LAYOUT(col, col, col);
|
||||
|
||||
#undef TEST_LAYOUT
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
template<class MatrixType>
|
||||
void init_rand(MatrixType & A)
|
||||
{
|
||||
typedef typename MatrixType::value_type T;
|
||||
for (unsigned int i = 0; i < A.size1(); ++i)
|
||||
for (unsigned int j = 0; j < A.size2(); ++j)
|
||||
A(i, j) = static_cast<T>(0.1) * rand()/RAND_MAX;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
int run_test(T epsilon)
|
||||
{
|
||||
typedef viennacl::range range_type;
|
||||
typedef viennacl::slice slice_type;
|
||||
typedef simple_matrix<T> matrix_type;
|
||||
typedef simple_matrix_range<matrix_type> matrix_range_type;
|
||||
typedef simple_matrix_slice<matrix_type> matrix_slice_type;
|
||||
|
||||
int matrix_holder_M = 143;
|
||||
int matrix_holder_N = 124;
|
||||
int matrix_holder_K = 184;
|
||||
|
||||
int start_M = 14;
|
||||
int start_N = 20;
|
||||
int start_K = 73;
|
||||
|
||||
int range_holder_M = start_M + matrix_holder_M;
|
||||
int range_holder_N = start_N + matrix_holder_N;
|
||||
int range_holder_K = start_K + matrix_holder_K;
|
||||
|
||||
range_type range_M(start_M, range_holder_M);
|
||||
range_type range_N(start_N, range_holder_N);
|
||||
range_type range_K(start_K, range_holder_K);
|
||||
|
||||
int stride_M = 9;
|
||||
int stride_N = 13;
|
||||
int stride_K = 4;
|
||||
|
||||
int slice_holder_M = start_M + stride_M*matrix_holder_M;
|
||||
int slice_holder_N = start_N + stride_N*matrix_holder_N;
|
||||
int slice_holder_K = start_K + stride_K*matrix_holder_K;
|
||||
|
||||
slice_type slice_M(start_M, stride_M, matrix_holder_M);
|
||||
slice_type slice_N(start_N, stride_N, matrix_holder_N);
|
||||
slice_type slice_K(start_K, stride_K, matrix_holder_K);
|
||||
|
||||
int failures_count = 0;
|
||||
|
||||
#define DECLARE(NAME, size1, size2) \
|
||||
matrix_type NAME ## _matrix(matrix_holder_ ## size1, matrix_holder_ ## size2);\
|
||||
init_rand(NAME ## _matrix);\
|
||||
matrix_type NAME ## T_matrix = simple_trans(NAME ## _matrix);\
|
||||
\
|
||||
matrix_type NAME ## _range_holder(range_holder_ ## size1, range_holder_ ## size2);\
|
||||
init_rand(NAME ## _range_holder);\
|
||||
matrix_range_type NAME ## _range(NAME ## _range_holder, range_ ## size1, range_ ## size2);\
|
||||
matrix_type NAME ## T_range_holder = simple_trans(NAME ## _range_holder);\
|
||||
matrix_range_type NAME ## T_range(NAME ## T_range_holder, range_ ## size2, range_ ## size1);\
|
||||
\
|
||||
matrix_type NAME ## _slice_holder(slice_holder_ ## size1, slice_holder_ ## size2);\
|
||||
init_rand(NAME ## _slice_holder);\
|
||||
matrix_slice_type NAME ## _slice(NAME ## _slice_holder, slice_ ## size1, slice_ ## size2);\
|
||||
matrix_type NAME ## T_slice_holder = simple_trans(NAME ## _slice_holder);\
|
||||
matrix_slice_type NAME ## T_slice(NAME ## T_slice_holder, slice_ ## size2, slice_ ## size1);\
|
||||
|
||||
DECLARE(A, M, K);
|
||||
DECLARE(B, K, N);
|
||||
DECLARE(C, M, N);
|
||||
#undef DECLARE
|
||||
|
||||
#define TEST_ALL_LAYOUTS(C_TYPE, A_TYPE, B_TYPE)\
|
||||
std::cout << ">> " #C_TYPE " = " #A_TYPE "." #B_TYPE << std::endl;\
|
||||
failures_count += test_all_layouts<T>(C_TYPE ## _holder_M, C_TYPE ## _holder_N, C_ ## C_TYPE,\
|
||||
A_TYPE ## _holder_M, A_TYPE ## _holder_K, A_ ## A_TYPE, AT_ ## A_TYPE,\
|
||||
B_TYPE ## _holder_K, B_TYPE ## _holder_N, B_ ## B_TYPE, BT_ ## B_TYPE, epsilon);
|
||||
\
|
||||
// //C=matrix
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, range)
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(matrix, range, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, range, range)
|
||||
TEST_ALL_LAYOUTS(matrix, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(matrix, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, slice, range)
|
||||
TEST_ALL_LAYOUTS(matrix, slice, slice)
|
||||
|
||||
// C = range
|
||||
TEST_ALL_LAYOUTS(range, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(range, matrix, range)
|
||||
TEST_ALL_LAYOUTS(range, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(range, range, matrix)
|
||||
TEST_ALL_LAYOUTS(range, range, range)
|
||||
TEST_ALL_LAYOUTS(range, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(range, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(range, slice, range)
|
||||
TEST_ALL_LAYOUTS(range, slice, slice)
|
||||
|
||||
// C = slice
|
||||
TEST_ALL_LAYOUTS(slice, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, matrix, range)
|
||||
TEST_ALL_LAYOUTS(slice, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(slice, range, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, range, range)
|
||||
TEST_ALL_LAYOUTS(slice, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(slice, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, slice, range)
|
||||
TEST_ALL_LAYOUTS(slice, slice, slice)
|
||||
|
||||
#undef TEST_ALL_LAYOUTS
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n_failures = 0;
|
||||
std::cout << ">>> float" << std::endl;
|
||||
n_failures += run_test<float>(1e-5);
|
||||
std::cout << ">>> double" << std::endl;
|
||||
n_failures += run_test<double>(1e-9);
|
||||
|
||||
if(n_failures>0)
|
||||
return EXIT_FAILURE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
214
tests/blas3.cpp
Normal file
214
tests/blas3.cpp
Normal file
@@ -0,0 +1,214 @@
|
||||
#include <cstddef>
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "viennacl/matrix.hpp"
|
||||
#include "viennacl/matrix_proxy.hpp"
|
||||
#include "viennacl/linalg/prod.hpp"
|
||||
|
||||
|
||||
template<typename T, typename CType, typename AType, typename BType>
|
||||
int test_layout(CType & C, AType const & A, AType const & AT, BType const & B, BType const & BT,
|
||||
simple_matrix<T> const & ground, T epsilon)
|
||||
{
|
||||
using viennacl::linalg::prod;
|
||||
using viennacl::trans;
|
||||
int failures_count = 0;
|
||||
|
||||
simple_matrix<T> tmp(C.size1(), C.size2());
|
||||
|
||||
#define TEST_OP(NAME, OPERATION)\
|
||||
std::cout << NAME " ..." << std::flush;\
|
||||
OPERATION;\
|
||||
viennacl::copy(C, tmp);\
|
||||
if (failure(ground, tmp, epsilon))\
|
||||
{\
|
||||
std::cout << " [Failure!]" << std::endl;\
|
||||
failures_count++;\
|
||||
}\
|
||||
else\
|
||||
std::cout << std::endl;
|
||||
|
||||
TEST_OP("C = A.B", C = prod(A, B));
|
||||
TEST_OP("C = A'.B", C = prod(trans(AT), B));
|
||||
TEST_OP("C = A.B'", C = prod(A, trans(BT)));
|
||||
TEST_OP("C = A'.B'", C = prod(trans(AT), trans(BT)));
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
template<typename T, typename RefAType, typename RefBType, typename RefCType>
|
||||
int test_all_layouts(int CM, int CN, RefCType & cC, int AM, int AK, RefAType & cA, RefAType & cAT, int BK, int BN, RefBType & cB, RefBType & cBT, T epsilon)
|
||||
{
|
||||
|
||||
viennacl::matrix<T, viennacl::row_major> ArowTmp(AM, AK);
|
||||
viennacl::matrix<T, viennacl::row_major> ATrowTmp(AK, AM);
|
||||
viennacl::matrix<T, viennacl::row_major> BrowTmp(BK, BN);
|
||||
viennacl::matrix<T, viennacl::row_major> BTrowTmp(BN, BK);
|
||||
viennacl::matrix<T, viennacl::row_major> CrowTmp(CM, CN);
|
||||
|
||||
viennacl::matrix<T, viennacl::column_major> AcolTmp(AM, AK);
|
||||
viennacl::matrix<T, viennacl::column_major> ATcolTmp(AK, AM);
|
||||
viennacl::matrix<T, viennacl::column_major> BcolTmp(BK, BN);
|
||||
viennacl::matrix<T, viennacl::column_major> BTcolTmp(BN, BK);
|
||||
viennacl::matrix<T, viennacl::column_major> CcolTmp(CM, CN);
|
||||
|
||||
|
||||
typename matrix_maker<RefCType, viennacl::row_major>::result_type Crow = matrix_maker<RefCType, viennacl::row_major>::make(CrowTmp, cC);
|
||||
typename matrix_maker<RefAType, viennacl::row_major>::result_type Arow = matrix_maker<RefAType, viennacl::row_major>::make(ArowTmp, cA);
|
||||
typename matrix_maker<RefAType, viennacl::row_major>::result_type ATrow = matrix_maker<RefAType, viennacl::row_major>::make(ATrowTmp, cAT);
|
||||
typename matrix_maker<RefBType, viennacl::row_major>::result_type Brow = matrix_maker<RefBType, viennacl::row_major>::make(BrowTmp, cB);
|
||||
typename matrix_maker<RefBType, viennacl::row_major>::result_type BTrow = matrix_maker<RefBType, viennacl::row_major>::make(BTrowTmp, cBT);
|
||||
|
||||
typename matrix_maker<RefCType, viennacl::column_major>::result_type Ccol = matrix_maker<RefCType, viennacl::column_major>::make(CcolTmp, cC);
|
||||
typename matrix_maker<RefAType, viennacl::column_major>::result_type Acol = matrix_maker<RefAType, viennacl::column_major>::make(AcolTmp, cA);
|
||||
typename matrix_maker<RefAType, viennacl::column_major>::result_type ATcol = matrix_maker<RefAType, viennacl::column_major>::make(ATcolTmp, cAT);
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type Bcol = matrix_maker<RefBType, viennacl::column_major>::make(BcolTmp, cB);
|
||||
typename matrix_maker<RefBType, viennacl::column_major>::result_type BTcol = matrix_maker<RefBType, viennacl::column_major>::make(BTcolTmp, cBT);
|
||||
|
||||
|
||||
simple_matrix<T> ground = simple_prod<T>(cA, cB);
|
||||
|
||||
int failures_count = 0;
|
||||
|
||||
#define TEST_LAYOUT(Clayout, Alayout, Blayout) \
|
||||
std::cout << "> " #Clayout " = " #Alayout "." #Blayout << std::endl; \
|
||||
failures_count += test_layout(C ## Clayout, A ## Alayout, AT ## Alayout, B ## Blayout, BT ## Blayout, ground, epsilon);
|
||||
|
||||
TEST_LAYOUT(row, row, row);
|
||||
TEST_LAYOUT(row, row, col);
|
||||
TEST_LAYOUT(row, col, row);
|
||||
TEST_LAYOUT(row, col, col);
|
||||
TEST_LAYOUT(col, row, row);
|
||||
TEST_LAYOUT(col, row, col);
|
||||
TEST_LAYOUT(col, col, row);
|
||||
TEST_LAYOUT(col, col, col);
|
||||
|
||||
#undef TEST_LAYOUT
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
int run_test(T epsilon)
|
||||
{
|
||||
typedef viennacl::range range_type;
|
||||
typedef viennacl::slice slice_type;
|
||||
typedef simple_matrix<T> matrix_type;
|
||||
typedef simple_matrix_range<matrix_type> matrix_range_type;
|
||||
typedef simple_matrix_slice<matrix_type> matrix_slice_type;
|
||||
|
||||
int matrix_holder_M = 143;
|
||||
int matrix_holder_N = 124;
|
||||
int matrix_holder_K = 184;
|
||||
|
||||
int start_M = 14;
|
||||
int start_N = 20;
|
||||
int start_K = 73;
|
||||
|
||||
int range_holder_M = start_M + matrix_holder_M;
|
||||
int range_holder_N = start_N + matrix_holder_N;
|
||||
int range_holder_K = start_K + matrix_holder_K;
|
||||
|
||||
range_type range_M(start_M, range_holder_M);
|
||||
range_type range_N(start_N, range_holder_N);
|
||||
range_type range_K(start_K, range_holder_K);
|
||||
|
||||
int stride_M = 9;
|
||||
int stride_N = 13;
|
||||
int stride_K = 4;
|
||||
|
||||
int slice_holder_M = start_M + stride_M*matrix_holder_M;
|
||||
int slice_holder_N = start_N + stride_N*matrix_holder_N;
|
||||
int slice_holder_K = start_K + stride_K*matrix_holder_K;
|
||||
|
||||
slice_type slice_M(start_M, stride_M, matrix_holder_M);
|
||||
slice_type slice_N(start_N, stride_N, matrix_holder_N);
|
||||
slice_type slice_K(start_K, stride_K, matrix_holder_K);
|
||||
|
||||
int failures_count = 0;
|
||||
|
||||
#define DECLARE(NAME, size1, size2) \
|
||||
matrix_type NAME ## _matrix(matrix_holder_ ## size1, matrix_holder_ ## size2);\
|
||||
init_rand(NAME ## _matrix);\
|
||||
matrix_type NAME ## T_matrix = simple_trans(NAME ## _matrix);\
|
||||
\
|
||||
matrix_type NAME ## _range_holder(range_holder_ ## size1, range_holder_ ## size2);\
|
||||
init_rand(NAME ## _range_holder);\
|
||||
matrix_range_type NAME ## _range(NAME ## _range_holder, range_ ## size1, range_ ## size2);\
|
||||
matrix_type NAME ## T_range_holder = simple_trans(NAME ## _range_holder);\
|
||||
matrix_range_type NAME ## T_range(NAME ## T_range_holder, range_ ## size2, range_ ## size1);\
|
||||
\
|
||||
matrix_type NAME ## _slice_holder(slice_holder_ ## size1, slice_holder_ ## size2);\
|
||||
init_rand(NAME ## _slice_holder);\
|
||||
matrix_slice_type NAME ## _slice(NAME ## _slice_holder, slice_ ## size1, slice_ ## size2);\
|
||||
matrix_type NAME ## T_slice_holder = simple_trans(NAME ## _slice_holder);\
|
||||
matrix_slice_type NAME ## T_slice(NAME ## T_slice_holder, slice_ ## size2, slice_ ## size1);\
|
||||
|
||||
DECLARE(A, M, K);
|
||||
DECLARE(B, K, N);
|
||||
DECLARE(C, M, N);
|
||||
#undef DECLARE
|
||||
|
||||
#define TEST_ALL_LAYOUTS(C_TYPE, A_TYPE, B_TYPE)\
|
||||
std::cout << ">> " #C_TYPE " = " #A_TYPE "." #B_TYPE << std::endl;\
|
||||
failures_count += test_all_layouts<T>(C_TYPE ## _holder_M, C_TYPE ## _holder_N, C_ ## C_TYPE,\
|
||||
A_TYPE ## _holder_M, A_TYPE ## _holder_K, A_ ## A_TYPE, AT_ ## A_TYPE,\
|
||||
B_TYPE ## _holder_K, B_TYPE ## _holder_N, B_ ## B_TYPE, BT_ ## B_TYPE, epsilon);
|
||||
\
|
||||
// //C=matrix
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, range)
|
||||
TEST_ALL_LAYOUTS(matrix, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(matrix, range, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, range, range)
|
||||
TEST_ALL_LAYOUTS(matrix, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(matrix, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(matrix, slice, range)
|
||||
TEST_ALL_LAYOUTS(matrix, slice, slice)
|
||||
|
||||
// C = range
|
||||
TEST_ALL_LAYOUTS(range, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(range, matrix, range)
|
||||
TEST_ALL_LAYOUTS(range, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(range, range, matrix)
|
||||
TEST_ALL_LAYOUTS(range, range, range)
|
||||
TEST_ALL_LAYOUTS(range, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(range, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(range, slice, range)
|
||||
TEST_ALL_LAYOUTS(range, slice, slice)
|
||||
|
||||
// C = slice
|
||||
TEST_ALL_LAYOUTS(slice, matrix, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, matrix, range)
|
||||
TEST_ALL_LAYOUTS(slice, matrix, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(slice, range, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, range, range)
|
||||
TEST_ALL_LAYOUTS(slice, range, slice)
|
||||
|
||||
TEST_ALL_LAYOUTS(slice, slice, matrix)
|
||||
TEST_ALL_LAYOUTS(slice, slice, range)
|
||||
TEST_ALL_LAYOUTS(slice, slice, slice)
|
||||
|
||||
#undef TEST_ALL_LAYOUTS
|
||||
|
||||
return failures_count;
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
int n_failures = 0;
|
||||
std::cout << ">>> float" << std::endl;
|
||||
n_failures += run_test<float>(1e-5);
|
||||
std::cout << ">>> double" << std::endl;
|
||||
n_failures += run_test<double>(1e-9);
|
||||
|
||||
if(n_failures>0)
|
||||
return EXIT_FAILURE;
|
||||
return EXIT_SUCCESS;
|
||||
}
|
314
tests/common.hpp
Normal file
314
tests/common.hpp
Normal file
@@ -0,0 +1,314 @@
|
||||
#ifndef TEST_COMMON_HPP_
|
||||
#define TEST_COMMON_HPP_
|
||||
|
||||
#include "vector"
|
||||
#include "viennacl/matrix.hpp"
|
||||
#include "viennacl/matrix_proxy.hpp"
|
||||
#include "atidlas/forwards.h"
|
||||
|
||||
typedef atidlas::atidlas_int_t int_t;
|
||||
|
||||
/*---------
|
||||
* Vector
|
||||
* -------*/
|
||||
|
||||
template<class NumericT>
|
||||
class simple_vector
|
||||
{
|
||||
public:
|
||||
typedef NumericT value_type;
|
||||
typedef size_t size_type;
|
||||
|
||||
simple_vector(size_t N) : N_(N), data_(N){ }
|
||||
size_t size() const { return N_; }
|
||||
value_type & operator[](size_t i) { return data_[i]; }
|
||||
value_type operator[](size_t i) const { return data_[i]; }
|
||||
size_t internal_size() const { return data_.size(); }
|
||||
typename std::vector<value_type>::iterator begin() { return data_.begin(); }
|
||||
typename std::vector<value_type>::iterator end() { return data_.begin() + size(); }
|
||||
typename std::vector<value_type>::const_iterator begin() const { return data_.begin(); }
|
||||
typename std::vector<value_type>::const_iterator end() const { return data_.begin() + size(); }
|
||||
private:
|
||||
size_t N_;
|
||||
std::vector<value_type> data_;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class simple_vector_range
|
||||
{
|
||||
public:
|
||||
typedef typename T::value_type value_type;
|
||||
typedef typename T::size_type size_type;
|
||||
|
||||
simple_vector_range(simple_vector<value_type> & data, viennacl::range const & r) : data_(data), r_(r) { }
|
||||
size_t size() const { return r_.size(); }
|
||||
viennacl::range const & range() const { return r_; }
|
||||
value_type & operator[](size_t i) { return data_[i]; }
|
||||
value_type operator[](size_t i) const { return data_[i]; }
|
||||
size_t internal_size() const { return data_.size(); }
|
||||
typename std::vector<value_type>::iterator begin() { return data_.begin(); }
|
||||
typename std::vector<value_type>::iterator end() { return data_.begin() + size(); }
|
||||
typename std::vector<value_type>::const_iterator begin() const { return data_.begin(); }
|
||||
typename std::vector<value_type>::const_iterator end() const { return data_.begin() + size(); }
|
||||
private:
|
||||
simple_vector<value_type> & data_;
|
||||
viennacl::range r_;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class simple_vector_slice
|
||||
{
|
||||
public:
|
||||
typedef typename T::value_type value_type;
|
||||
typedef typename T::size_type size_type;
|
||||
|
||||
simple_vector_slice(simple_vector<value_type> & data, viennacl::slice const & s) : data_(data), s_(s) { }
|
||||
size_type size() const { return s_.size(); }
|
||||
size_t internal_size() const { return data_.size(); }
|
||||
viennacl::slice const & slice() const { return s_; }
|
||||
value_type & operator[](size_t i) { return data_[i]; }
|
||||
value_type operator[](size_t i) const { return data_[i]; }
|
||||
private:
|
||||
simple_vector<value_type> & data_;
|
||||
viennacl::slice s_;
|
||||
};
|
||||
|
||||
|
||||
//Helper to initialize a viennacl vector from a simple type
|
||||
|
||||
template<class SimpleType>
|
||||
struct vector_maker;
|
||||
|
||||
template<class NumericT>
|
||||
struct vector_maker< simple_vector<NumericT> >
|
||||
{
|
||||
typedef viennacl::vector<NumericT> result_type;
|
||||
static result_type make(viennacl::vector<NumericT> const &, simple_vector<NumericT> & base)
|
||||
{
|
||||
viennacl::vector<NumericT> result(base.size());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<class NumericT>
|
||||
struct vector_maker< simple_vector_range< simple_vector<NumericT> > >
|
||||
{
|
||||
typedef viennacl::vector_range< viennacl::vector<NumericT> > result_type;
|
||||
static result_type make(viennacl::vector<NumericT> & x, simple_vector_range< simple_vector<NumericT> > & base)
|
||||
{
|
||||
result_type result(x, base.range());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<class NumericT>
|
||||
struct vector_maker< simple_vector_slice<simple_vector<NumericT> > >
|
||||
{
|
||||
typedef viennacl::vector_slice< viennacl::vector<NumericT> > result_type;
|
||||
static result_type make(viennacl::vector<NumericT> & M, simple_vector_slice< simple_vector<NumericT> > & base)
|
||||
{
|
||||
result_type result(M, base.slice());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/*---------
|
||||
* Matrix
|
||||
* -------*/
|
||||
|
||||
template<class NumericT>
|
||||
class simple_matrix
|
||||
{
|
||||
public:
|
||||
typedef NumericT value_type;
|
||||
typedef size_t size_type;
|
||||
|
||||
simple_matrix(size_t M, size_t N) : data_(M*N), M_(M), N_(N){ }
|
||||
size_t size1() const { return M_; }
|
||||
size_t size2() const { return N_; }
|
||||
value_type & operator()(size_t i, size_t j) { return data_[i + M_*j]; }
|
||||
value_type operator()(size_t i, size_t j) const { return data_[i + M_*j]; }
|
||||
private:
|
||||
std::vector<value_type> data_;
|
||||
size_t M_;
|
||||
size_t N_;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class simple_matrix_range
|
||||
{
|
||||
public:
|
||||
typedef typename T::value_type value_type;
|
||||
|
||||
simple_matrix_range(T & A, viennacl::range const & r1, viennacl::range const & r2) : A_(A), r1_(r1), r2_(r2){ }
|
||||
size_t size1() const { return r1_.size(); }
|
||||
size_t size2() const { return r2_.size(); }
|
||||
viennacl::range const & r1() const { return r1_; }
|
||||
viennacl::range const & r2() const { return r2_; }
|
||||
value_type & operator()(size_t i, size_t j) { return A_(i+r1_.start(), j+r2_.start()); }
|
||||
value_type operator()(size_t i, size_t j) const { return A_(i+r1_.start(), j+r2_.start()); }
|
||||
private:
|
||||
T & A_;
|
||||
viennacl::range r1_;
|
||||
viennacl::range r2_;
|
||||
};
|
||||
|
||||
template<class T>
|
||||
class simple_matrix_slice
|
||||
{
|
||||
public:
|
||||
typedef typename T::value_type value_type;
|
||||
|
||||
simple_matrix_slice(T & A, viennacl::slice const & s1, viennacl::slice const & s2) : A_(A), s1_(s1), s2_(s2){ }
|
||||
viennacl::slice::size_type size1() const { return s1_.size(); }
|
||||
viennacl::slice::size_type size2() const { return s2_.size(); }
|
||||
viennacl::slice const & s1() const { return s1_; }
|
||||
viennacl::slice const & s2() const { return s2_; }
|
||||
value_type & operator()(size_t i, size_t j) { return A_(i*s1_.stride() + s1_.start(), j*s2_.stride() + s2_.start()); }
|
||||
value_type operator()(size_t i, size_t j) const { return A_(i+s1_.start(), j+s2_.start()); }
|
||||
private:
|
||||
T & A_;
|
||||
viennacl::slice s1_;
|
||||
viennacl::slice s2_;
|
||||
};
|
||||
|
||||
|
||||
/*-------
|
||||
* Helpers
|
||||
*-------*/
|
||||
|
||||
template<typename T>
|
||||
void init_rand(simple_matrix<T> & A)
|
||||
{
|
||||
for (unsigned int i = 0; i < A.size1(); ++i)
|
||||
for (unsigned int j = 0; j < A.size2(); ++j)
|
||||
A(i, j) = T(0.1) * rand()/RAND_MAX;
|
||||
}
|
||||
|
||||
|
||||
template<typename T>
|
||||
void init_rand(simple_vector<T> & x)
|
||||
{
|
||||
for (unsigned int i = 0; i < x.size(); ++i)
|
||||
x[i] = T(0.1) * rand()/RAND_MAX;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
simple_matrix<T> simple_trans(simple_matrix<T> const & A)
|
||||
{
|
||||
int M = A.size1();
|
||||
int N = A.size2();
|
||||
simple_matrix<T> result(N, M);
|
||||
|
||||
for(int i = 0; i < N; ++i)
|
||||
for(int j = 0; j < M; ++j)
|
||||
result(i,j) = A(j,i);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<class T, class U, class V>
|
||||
simple_matrix<T> simple_prod(U const & A, V const & B)
|
||||
{
|
||||
int M = A.size1();
|
||||
int N = B.size2();
|
||||
int K = A.size2();
|
||||
simple_matrix<T> result(M, N);
|
||||
|
||||
for(int i = 0 ; i < M ; ++i)
|
||||
for(int j = 0 ; j < N ; ++j)
|
||||
{
|
||||
T val = 0;
|
||||
for(int k = 0 ; k < K ; ++k)
|
||||
val+= A(i, k)*B(k,j);
|
||||
result(i, j) = val;
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
template<class SimpleType, class F>
|
||||
struct matrix_maker;
|
||||
|
||||
template<class T, class F>
|
||||
struct matrix_maker< simple_matrix<T>, F>
|
||||
{
|
||||
typedef viennacl::matrix<T, F> result_type;
|
||||
static result_type make(viennacl::matrix<T, F> const &, simple_matrix<T> & base)
|
||||
{
|
||||
viennacl::matrix<T, F> result(base.size1(), base.size2());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<class MatrixT, class F>
|
||||
struct matrix_maker< simple_matrix_range<MatrixT>, F>
|
||||
{
|
||||
typedef typename MatrixT::value_type T;
|
||||
typedef viennacl::matrix_range< viennacl::matrix<T, F> > result_type;
|
||||
|
||||
static result_type make(viennacl::matrix<T, F> & M, simple_matrix_range<MatrixT> & base)
|
||||
{
|
||||
result_type result(M, base.r1(), base.r2());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<class MatrixT, class F>
|
||||
struct matrix_maker< simple_matrix_slice<MatrixT>, F>
|
||||
{
|
||||
typedef typename MatrixT::value_type T;
|
||||
typedef viennacl::matrix_slice< viennacl::matrix<T, F> > result_type;
|
||||
|
||||
static result_type make(viennacl::matrix<T, F> & M, simple_matrix_slice<MatrixT> & base)
|
||||
{
|
||||
result_type result(M, base.s1(), base.s2());
|
||||
viennacl::copy(base, result);
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
template<class VectorType>
|
||||
bool failure_vector(VectorType const & x, VectorType const & y, typename VectorType::value_type epsilon)
|
||||
{
|
||||
typedef typename VectorType::value_type value_type;
|
||||
for(int_t i = 0 ; i < x.size() ; ++i)
|
||||
{
|
||||
value_type delta = std::abs(x[i] - y[i]);
|
||||
if(delta > epsilon)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template<class NumericT>
|
||||
bool failure(simple_matrix<NumericT> const & A, simple_matrix<NumericT> const & B, NumericT epsilon)
|
||||
{
|
||||
int M = A.size1();
|
||||
int N = A.size2();
|
||||
for(int i = 0 ; i < M ; ++i)
|
||||
for(int j = 0 ; j < N ; ++j)
|
||||
{
|
||||
NumericT delta = std::abs(A(i,j) - B(i,j));
|
||||
if(delta > epsilon)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
namespace viennacl
|
||||
{
|
||||
namespace traits
|
||||
{
|
||||
template<class T> int size1(simple_matrix<T> const & M) { return M.size1(); }
|
||||
template<class T> int size2(simple_matrix<T> const & M) { return M.size2(); }
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
Reference in New Issue
Block a user