Backend: A lot of bugfixes in dot() for handling shapes better
This commit is contained in:
@@ -482,13 +482,17 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_LEQ_TYPE, operator <=, INT_TYPE)
|
|||||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_EQ_TYPE, operator ==, INT_TYPE)
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_EQ_TYPE, operator ==, INT_TYPE)
|
||||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_NEQ_TYPE, operator !=, INT_TYPE)
|
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_NEQ_TYPE, operator !=, INT_TYPE)
|
||||||
|
|
||||||
|
#define DEFINE_OUTER(LTYPE, RTYPE) \
|
||||||
|
array_expression outer(LTYPE const & x, RTYPE const & y)\
|
||||||
|
{\
|
||||||
|
assert(x.nshape()==1 && y.nshape()==1);\
|
||||||
|
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );\
|
||||||
|
}\
|
||||||
|
|
||||||
array_expression outer(array const & x, array const & y)
|
DEFINE_OUTER(array, array)
|
||||||
{
|
DEFINE_OUTER(array_expression, array)
|
||||||
assert(x.nshape()==1 && y.nshape()==1);
|
DEFINE_OUTER(array, array_expression)
|
||||||
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );
|
DEFINE_OUTER(array_expression, array_expression)
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#undef DEFINE_ELEMENT_BINARY_OPERATOR
|
#undef DEFINE_ELEMENT_BINARY_OPERATOR
|
||||||
//---------------------------------------
|
//---------------------------------------
|
||||||
@@ -705,6 +709,10 @@ namespace detail
|
|||||||
int_t N = A.shape()[1];
|
int_t N = A.shape()[1];
|
||||||
array_expression::node & A_root = const_cast<array_expression::node &>(A.tree()[A.root()]);
|
array_expression::node & A_root = const_cast<array_expression::node &>(A.tree()[A.root()]);
|
||||||
bool A_trans = A_root.op.type==OPERATOR_TRANS_TYPE;
|
bool A_trans = A_root.op.type==OPERATOR_TRANS_TYPE;
|
||||||
|
while(A_root.lhs.type_family==COMPOSITE_OPERATOR_FAMILY){
|
||||||
|
A_root = A.tree()[A_root.lhs.node_index];
|
||||||
|
A_trans ^= A_root.op.type==OPERATOR_TRANS_TYPE;
|
||||||
|
}
|
||||||
if(A_trans)
|
if(A_trans)
|
||||||
{
|
{
|
||||||
array_expression tmp(A, repmat(x, 1, M), op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ELEMENT_PROD_TYPE), A.context(), A.dtype(), size4(N, M));
|
array_expression tmp(A, repmat(x, 1, M), op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ELEMENT_PROD_TYPE), A.context(), A.dtype(), size4(N, M));
|
||||||
@@ -717,12 +725,6 @@ namespace detail
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
array_expression matvecprod(array_expression const & A, array_expression const & x)
|
|
||||||
{
|
|
||||||
return matvecprod(A, array(x));
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
array_expression reshape(array const & x, int_t shape0, int_t shape1)
|
array_expression reshape(array const & x, int_t shape0, int_t shape1)
|
||||||
@@ -735,22 +737,23 @@ array_expression reshape(array_expression const & x, int_t shape0, int_t shape1)
|
|||||||
#define DEFINE_DOT(LTYPE, RTYPE) \
|
#define DEFINE_DOT(LTYPE, RTYPE) \
|
||||||
array_expression dot(LTYPE const & x, RTYPE const & y)\
|
array_expression dot(LTYPE const & x, RTYPE const & y)\
|
||||||
{\
|
{\
|
||||||
if(x.nshape()==1 && y.nshape()==1)\
|
if(x.nshape()<1 || y.nshape()<1){\
|
||||||
{\
|
return x*y;\
|
||||||
return sum(x*y);\
|
}\
|
||||||
|
if(x.nshape()==1 && y.nshape()==1){\
|
||||||
|
if(x.shape()[1]==1 && y.shape()[0]==1)\
|
||||||
|
return outer(x, y);\
|
||||||
|
else if(x.shape()[0]==1 && y.shape()[1]==1)\
|
||||||
|
return sum(x*trans(y));\
|
||||||
|
else\
|
||||||
|
return sum(x*y);\
|
||||||
}\
|
}\
|
||||||
else if(x.nshape()==2 && y.nshape()==1)\
|
else if(x.nshape()==2 && y.nshape()==1)\
|
||||||
{\
|
|
||||||
return detail::matvecprod(x, y);\
|
return detail::matvecprod(x, y);\
|
||||||
}\
|
|
||||||
else if(x.nshape()==1 && y.nshape()==2)\
|
else if(x.nshape()==1 && y.nshape()==2)\
|
||||||
{\
|
return trans(detail::matvecprod(trans(y), trans(x)));\
|
||||||
return detail::matvecprod(trans(y), x);\
|
|
||||||
}\
|
|
||||||
else /*if(x.nshape()==2 && y.nshape()==2)*/\
|
else /*if(x.nshape()==2 && y.nshape()==2)*/\
|
||||||
{\
|
|
||||||
return detail::matmatprod(x, y);\
|
return detail::matmatprod(x, y);\
|
||||||
}\
|
|
||||||
}
|
}
|
||||||
|
|
||||||
DEFINE_DOT(array, array)
|
DEFINE_DOT(array, array)
|
||||||
|
@@ -47,7 +47,8 @@ namespace detail
|
|||||||
|
|
||||||
bool bypass(op_element const & op)
|
bool bypass(op_element const & op)
|
||||||
{
|
{
|
||||||
return op.type == OPERATOR_RESHAPE_TYPE;
|
return op.type == OPERATOR_RESHAPE_TYPE
|
||||||
|
||op.type == OPERATOR_TRANS_TYPE;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_cast(op_element const & op)
|
bool is_cast(op_element const & op)
|
||||||
@@ -68,8 +69,7 @@ namespace detail
|
|||||||
|
|
||||||
bool is_node_leaf(op_element const & op)
|
bool is_node_leaf(op_element const & op)
|
||||||
{
|
{
|
||||||
return op.type==OPERATOR_TRANS_TYPE
|
return op.type==OPERATOR_MATRIX_DIAG_TYPE
|
||||||
|| op.type==OPERATOR_MATRIX_DIAG_TYPE
|
|
||||||
|| op.type==OPERATOR_VDIAG_TYPE
|
|| op.type==OPERATOR_VDIAG_TYPE
|
||||||
|| op.type==OPERATOR_REPEAT_TYPE
|
|| op.type==OPERATOR_REPEAT_TYPE
|
||||||
|| op.type==OPERATOR_MATRIX_ROW_TYPE
|
|| op.type==OPERATOR_MATRIX_ROW_TYPE
|
||||||
@@ -212,8 +212,6 @@ const char * evaluate(operation_node_type type)
|
|||||||
case OPERATOR_ELEMENT_FMIN_TYPE : return "fmin";
|
case OPERATOR_ELEMENT_FMIN_TYPE : return "fmin";
|
||||||
case OPERATOR_ELEMENT_MAX_TYPE : return "max";
|
case OPERATOR_ELEMENT_MAX_TYPE : return "max";
|
||||||
case OPERATOR_ELEMENT_MIN_TYPE : return "min";
|
case OPERATOR_ELEMENT_MIN_TYPE : return "min";
|
||||||
//Unary
|
|
||||||
case OPERATOR_TRANS_TYPE : return "trans";
|
|
||||||
|
|
||||||
//Binary
|
//Binary
|
||||||
case OPERATOR_MATRIX_PRODUCT_NN_TYPE : return "prodNN";
|
case OPERATOR_MATRIX_PRODUCT_NN_TYPE : return "prodNN";
|
||||||
|
@@ -82,6 +82,13 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
|||||||
stream.dec_tab();
|
stream.dec_tab();
|
||||||
stream << "}" << std::endl;
|
stream << "}" << std::endl;
|
||||||
|
|
||||||
|
stream << "if(" << GlobalIdx0(backend) << "==0 &&" << GlobalIdx1(backend) << "==0)" << std::endl;
|
||||||
|
stream << "{" << std::endl;
|
||||||
|
stream.inc_tab();
|
||||||
|
process(stream, LHS_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array0", "#pointer[#start] = #namereg;"), expressions, mappings);
|
||||||
|
stream.dec_tab();
|
||||||
|
stream << "}" << std::endl;
|
||||||
|
|
||||||
stream.dec_tab();
|
stream.dec_tab();
|
||||||
stream << "}" << std::endl;
|
stream << "}" << std::endl;
|
||||||
|
|
||||||
|
@@ -567,7 +567,6 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
value_scalar const & alpha, value_scalar const & beta,
|
value_scalar const & alpha, value_scalar const & beta,
|
||||||
driver::Program & program, const char * suffix, execution_options_type const & options)
|
driver::Program & program, const char * suffix, execution_options_type const & options)
|
||||||
{
|
{
|
||||||
|
|
||||||
if(M==0 || N==0 || K==0)
|
if(M==0 || N==0 || K==0)
|
||||||
return;
|
return;
|
||||||
|
|
||||||
@@ -588,8 +587,10 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
driver::Kernel gemm(program, gemm_name);
|
driver::Kernel gemm(program, gemm_name);
|
||||||
driver::NDRange local(p_.local_size_0, p_.local_size_1);
|
driver::NDRange local(p_.local_size_0, p_.local_size_1);
|
||||||
|
|
||||||
|
|
||||||
using tools::align;
|
using tools::align;
|
||||||
driver::NDRange global = (strcmp(suffix,"fallback")==0)?driver::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth):driver::NDRange(M/p_.mS, N/p_.nS, p_.depth);
|
driver::NDRange global = (strcmp(suffix,"fallback")==0)?driver::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth):driver::NDRange(M/p_.mS, N/p_.nS, p_.depth);
|
||||||
|
|
||||||
unsigned int current_arg = 0;
|
unsigned int current_arg = 0;
|
||||||
set_arguments_functor helper(binder, current_arg, gemm);
|
set_arguments_functor helper(binder, current_arg, gemm);
|
||||||
gemm.setSizeArg(current_arg++, M);
|
gemm.setSizeArg(current_arg++, M);
|
||||||
@@ -611,9 +612,14 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
|||||||
gemm.setSizeArg(current_arg++, B.start()[0] + B.start()[1]*B.ld()/p_.simd_width);
|
gemm.setSizeArg(current_arg++, B.start()[0] + B.start()[1]*B.ld()/p_.simd_width);
|
||||||
gemm.setSizeArg(current_arg++, B.stride()[0]);
|
gemm.setSizeArg(current_arg++, B.stride()[0]);
|
||||||
|
|
||||||
|
// std::cout << "before " << *out << std::endl;
|
||||||
|
|
||||||
helper.set_arguments(beta.dtype(), beta.values());
|
helper.set_arguments(beta.dtype(), beta.values());
|
||||||
options.enqueue(program.context(), gemm, global, local);
|
options.enqueue(program.context(), gemm, global, local);
|
||||||
|
|
||||||
|
options.queue(program.context()).synchronize();
|
||||||
|
// std::cout << "after " << *out << std::endl;
|
||||||
|
|
||||||
if(p_.depth > 1)
|
if(p_.depth > 1)
|
||||||
{
|
{
|
||||||
unsigned int current_arg = 0;
|
unsigned int current_arg = 0;
|
||||||
|
@@ -33,6 +33,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
|||||||
{
|
{
|
||||||
using tools::to_string;
|
using tools::to_string;
|
||||||
|
|
||||||
|
|
||||||
std::vector<mapped_mreduction*> reductions;
|
std::vector<mapped_mreduction*> reductions;
|
||||||
expressions_tuple::data_type::const_iterator sit;
|
expressions_tuple::data_type::const_iterator sit;
|
||||||
std::vector<mapping_type>::const_iterator mit;
|
std::vector<mapping_type>::const_iterator mit;
|
||||||
@@ -114,6 +115,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
|||||||
{
|
{
|
||||||
std::string data_type = append_width("#scalartype",simd_width);
|
std::string data_type = append_width("#scalartype",simd_width);
|
||||||
|
|
||||||
|
|
||||||
for (const auto & e : reductions)
|
for (const auto & e : reductions)
|
||||||
{
|
{
|
||||||
std::map<std::string, std::string> accessors;
|
std::map<std::string, std::string> accessors;
|
||||||
@@ -130,7 +132,6 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
|||||||
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
//Update accumulators
|
//Update accumulators
|
||||||
std::vector<std::string> str(simd_width);
|
std::vector<std::string> str(simd_width);
|
||||||
if (simd_width==1)
|
if (simd_width==1)
|
||||||
@@ -240,6 +241,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
|||||||
stream << _size_t << " gsize1 = " << GlobalSize1(backend) <<";" << std::endl;
|
stream << _size_t << " gsize1 = " << GlobalSize1(backend) <<";" << std::endl;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
stream << _size_t << " upper_bound_1 = ( M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << ";" << std::endl;
|
stream << _size_t << " upper_bound_1 = ( M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << ";" << std::endl;
|
||||||
stream << "for(" << _size_t << " r = gid1; r < upper_bound_1; r += gsize1){" << std::endl;
|
stream << "for(" << _size_t << " r = gid1; r < upper_bound_1; r += gsize1){" << std::endl;
|
||||||
stream.inc_tab();
|
stream.inc_tab();
|
||||||
|
@@ -256,8 +256,8 @@ std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<base> > ini
|
|||||||
res[std::make_pair(MATRIX_AXPY_TYPE, DTYPE)] = ptr_t(new maxpy(1,8,8,8,8,FETCH_FROM_GLOBAL_STRIDED));
|
res[std::make_pair(MATRIX_AXPY_TYPE, DTYPE)] = ptr_t(new maxpy(1,8,8,8,8,FETCH_FROM_GLOBAL_STRIDED));
|
||||||
res[std::make_pair(ROW_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_rows(1, 8, 8, 4, 16, FETCH_FROM_GLOBAL_STRIDED));
|
res[std::make_pair(ROW_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_rows(1, 8, 8, 4, 16, FETCH_FROM_GLOBAL_STRIDED));
|
||||||
res[std::make_pair(COL_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_cols(1, 8, 8, 64, 8, FETCH_FROM_GLOBAL_STRIDED));
|
res[std::make_pair(COL_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_cols(1, 8, 8, 64, 8, FETCH_FROM_GLOBAL_STRIDED));
|
||||||
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 1, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||||
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 1, 1, 1, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||||
res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new mproduct_nt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new mproduct_nt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||||
res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new mproduct_tt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new mproduct_tt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||||
}
|
}
|
||||||
|
@@ -59,7 +59,7 @@ extern "C"
|
|||||||
clRetainMemObject(mx); \
|
clRetainMemObject(mx); \
|
||||||
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
|
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
|
||||||
clRetainMemObject(my); \
|
clRetainMemObject(my); \
|
||||||
execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
execute(is::assign(y, alpha*x + y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
||||||
return clblasSuccess; \
|
return clblasSuccess; \
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -157,15 +157,14 @@ extern "C"
|
|||||||
std::swap(M, N);\
|
std::swap(M, N);\
|
||||||
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
|
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
|
||||||
}\
|
}\
|
||||||
is::int_t As1 = M, As2 = N;\
|
is::array A(M, N, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
|
||||||
if(transA==clblasTrans) std::swap(As1, As2);\
|
|
||||||
is::array A(As1, As2, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
|
|
||||||
clRetainMemObject(mA);\
|
clRetainMemObject(mA);\
|
||||||
\
|
\
|
||||||
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
is::int_t sx = N, sy = M;\
|
||||||
|
if(transA) std::swap(sx, sy);\
|
||||||
|
is::array x(sx, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
||||||
clRetainMemObject(mx);\
|
clRetainMemObject(mx);\
|
||||||
\
|
is::array y(sy, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
||||||
is::array y(M, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
|
||||||
clRetainMemObject(my);\
|
clRetainMemObject(my);\
|
||||||
\
|
\
|
||||||
is::driver::Context const & context = A.context();\
|
is::driver::Context const & context = A.context();\
|
||||||
@@ -182,6 +181,7 @@ extern "C"
|
|||||||
//*****************
|
//*****************
|
||||||
//BLAS3
|
//BLAS3
|
||||||
//*****************
|
//*****************
|
||||||
|
|
||||||
#define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CL) \
|
#define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CL) \
|
||||||
clblasStatus clblas ## TYPE_CHAR ## gemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,\
|
clblasStatus clblas ## TYPE_CHAR ## gemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,\
|
||||||
size_t M, size_t N, size_t K,\
|
size_t M, size_t N, size_t K,\
|
||||||
@@ -198,8 +198,7 @@ extern "C"
|
|||||||
std::swap(offA, offB);\
|
std::swap(offA, offB);\
|
||||||
std::swap(lda, ldb);\
|
std::swap(lda, ldb);\
|
||||||
std::swap(M, N);\
|
std::swap(M, N);\
|
||||||
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
|
std::swap(transA, transB);\
|
||||||
transB = (transB==clblasTrans)?clblasNoTrans:clblasTrans;\
|
|
||||||
}\
|
}\
|
||||||
is::int_t As1 = M, As2 = K;\
|
is::int_t As1 = M, As2 = K;\
|
||||||
is::int_t Bs1 = K, Bs2 = N;\
|
is::int_t Bs1 = K, Bs2 = N;\
|
||||||
@@ -214,9 +213,8 @@ extern "C"
|
|||||||
clRetainMemObject(mC);\
|
clRetainMemObject(mC);\
|
||||||
is::driver::Context const & context = C.context();\
|
is::driver::Context const & context = C.context();\
|
||||||
/*Operation*/\
|
/*Operation*/\
|
||||||
if((transA==clblasTrans) && (transB==clblasTrans)){\
|
if((transA==clblasTrans) && (transB==clblasTrans))\
|
||||||
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
}\
|
|
||||||
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
|
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
|
||||||
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||||
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
|
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
|
||||||
@@ -229,4 +227,6 @@ extern "C"
|
|||||||
MAKE_GEMM(S, is::FLOAT_TYPE, cl_float)
|
MAKE_GEMM(S, is::FLOAT_TYPE, cl_float)
|
||||||
MAKE_GEMM(D, is::DOUBLE_TYPE, cl_double)
|
MAKE_GEMM(D, is::DOUBLE_TYPE, cl_double)
|
||||||
|
|
||||||
|
#undef DOT
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -115,7 +115,7 @@ def main():
|
|||||||
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
||||||
|
|
||||||
#Source files
|
#Source files
|
||||||
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
src = 'src/lib/array.cpp src/lib/wrap/clBLAS.cpp src/lib/value_scalar.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/driver/program.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/check.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/device.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/ndrange.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/backend/parse.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/stream.cpp src/lib/backend/mapped_object.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
||||||
boostsrc = 'external/boost/libs/'
|
boostsrc = 'external/boost/libs/'
|
||||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||||
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
||||||
|
@@ -29,7 +29,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
|||||||
T cij = 0;
|
T cij = 0;
|
||||||
for(int k = 0 ; k < K ; ++k)
|
for(int k = 0 ; k < K ; ++k)
|
||||||
cij += cA(i,k)*cB(k,j);
|
cij += cA(i,k)*cB(k,j);
|
||||||
cC(i,j) = cij;
|
cC(i,j) = alpha*cij + beta*cC(i, j);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -43,6 +43,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
|||||||
#define RUN_TEST(NAME, GPU_OP)\
|
#define RUN_TEST(NAME, GPU_OP)\
|
||||||
std::cout << "[" << prefix << "] \t" << NAME << "..." << std::flush;\
|
std::cout << "[" << prefix << "] \t" << NAME << "..." << std::flush;\
|
||||||
GPU_OP;\
|
GPU_OP;\
|
||||||
|
queue.synchronize();\
|
||||||
ad::copy(C, buffer);\
|
ad::copy(C, buffer);\
|
||||||
if(diff(buffer, cCbuffer, epsilon))\
|
if(diff(buffer, cCbuffer, epsilon))\
|
||||||
{\
|
{\
|
||||||
@@ -57,20 +58,22 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
|||||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||||
|
|
||||||
//Row-major
|
//Row-major
|
||||||
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||||
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
RUN_TEST("GEMM(ROW, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
RUN_TEST("GEMM(ROW, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||||
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
RUN_TEST("GEMM(ROW, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
RUN_TEST("GEMM(ROW, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||||
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||||
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
//Column-major
|
//Column-major
|
||||||
RUN_TEST("GEMM(COL, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
RUN_TEST("GEMM(COL, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
RUN_TEST("GEMM(COL, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
RUN_TEST("GEMM(COL, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
RUN_TEST("GEMM(COL, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
RUN_TEST("GEMM(COL, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
||||||
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||||
RUN_TEST("GEMM(COL, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
RUN_TEST("GEMM(COL, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
||||||
@@ -92,9 +95,10 @@ template<typename T>
|
|||||||
void test_impl(T epsilon, ad::driver::Context const & ctx)
|
void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||||
{
|
{
|
||||||
|
|
||||||
int_t M = 412;
|
int_t M = 427;
|
||||||
int_t N = 248;
|
int_t N = 248;
|
||||||
int_t K = 376;
|
int_t K = 376;
|
||||||
|
|
||||||
|
|
||||||
int_t SUBM = 61;
|
int_t SUBM = 61;
|
||||||
int_t SUBN = 75;
|
int_t SUBN = 75;
|
||||||
@@ -120,6 +124,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
|||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
clblasSetup();
|
||||||
auto data = ad::driver::queues.contexts();
|
auto data = ad::driver::queues.contexts();
|
||||||
for(const auto & elem : data)
|
for(const auto & elem : data)
|
||||||
{
|
{
|
||||||
@@ -132,5 +137,6 @@ int main()
|
|||||||
test_impl<double>(1e-9, elem.first);
|
test_impl<double>(1e-9, elem.first);
|
||||||
std::cout << "---" << std::endl;
|
std::cout << "---" << std::endl;
|
||||||
}
|
}
|
||||||
|
clblasTeardown();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@@ -18,6 +18,8 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
|||||||
simple_vector<T> bufy(M);
|
simple_vector<T> bufy(M);
|
||||||
simple_vector<T> bufx(N);
|
simple_vector<T> bufx(N);
|
||||||
|
|
||||||
|
T alpha = 4.2, beta = 1.8;
|
||||||
|
|
||||||
ad::driver::CommandQueue queue = ad::driver::queues[y.context()][0];
|
ad::driver::CommandQueue queue = ad::driver::queues[y.context()][0];
|
||||||
|
|
||||||
T yi = 0, xi = 0;
|
T yi = 0, xi = 0;
|
||||||
@@ -32,6 +34,7 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
|||||||
ASSIGNMENT;\
|
ASSIGNMENT;\
|
||||||
}\
|
}\
|
||||||
GPU_REDUCTION;\
|
GPU_REDUCTION;\
|
||||||
|
queue.synchronize();\
|
||||||
ad::copy(RES, BUF.data());\
|
ad::copy(RES, BUF.data());\
|
||||||
if(diff(CRES, BUF, epsilon))\
|
if(diff(CRES, BUF, epsilon))\
|
||||||
{\
|
{\
|
||||||
@@ -47,24 +50,24 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
|||||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||||
|
|
||||||
|
|
||||||
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
|
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
|
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
|
CHANDLE(x), x.start()[0], x.stride()[0], beta, CHANDLE(y), y.start()[0], y.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||||
|
|
||||||
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
|
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
|
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
|
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||||
|
|
||||||
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
|
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
|
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
|
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||||
|
|
||||||
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
|
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
|
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||||
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
|
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
@@ -102,6 +105,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
|||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
clblasSetup();
|
||||||
auto data = ad::driver::queues.contexts();
|
auto data = ad::driver::queues.contexts();
|
||||||
for(const auto & elem : data)
|
for(const auto & elem : data)
|
||||||
{
|
{
|
||||||
@@ -114,5 +118,6 @@ int main()
|
|||||||
test_impl<double>(1e-9, elem.first);
|
test_impl<double>(1e-9, elem.first);
|
||||||
std::cout << "---" << std::endl;
|
std::cout << "---" << std::endl;
|
||||||
}
|
}
|
||||||
|
clblasTeardown();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@@ -17,6 +17,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
|||||||
int_t N = cx.size();
|
int_t N = cx.size();
|
||||||
ad::driver::CommandQueue queue = ad::driver::queues[ctx][0];
|
ad::driver::CommandQueue queue = ad::driver::queues[ctx][0];
|
||||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||||
|
ad::array scratch(N, x.dtype());
|
||||||
|
|
||||||
unsigned int failure_count = 0;
|
unsigned int failure_count = 0;
|
||||||
|
|
||||||
@@ -33,6 +34,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
|||||||
CPU_REDUCTION;\
|
CPU_REDUCTION;\
|
||||||
cs= ASSIGNMENT ;\
|
cs= ASSIGNMENT ;\
|
||||||
GPU_REDUCTION;\
|
GPU_REDUCTION;\
|
||||||
|
queue.synchronize();\
|
||||||
tmp = ds;\
|
tmp = ds;\
|
||||||
if((std::abs(cs - tmp)/std::max(cs, tmp)) > epsilon)\
|
if((std::abs(cs - tmp)/std::max(cs, tmp)) > epsilon)\
|
||||||
{\
|
{\
|
||||||
@@ -45,10 +47,9 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
|||||||
#define PREFIX "[C]"
|
#define PREFIX "[C]"
|
||||||
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
||||||
0, 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||||
0, 1, &clqueue, 0, NULL, NULL));
|
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||||
#undef PREFIX
|
#undef PREFIX
|
||||||
#define PREFIX "[C++]"
|
#define PREFIX "[C++]"
|
||||||
|
|
||||||
@@ -70,11 +71,11 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
|||||||
{
|
{
|
||||||
using isaac::_;
|
using isaac::_;
|
||||||
|
|
||||||
int_t N = 24378;
|
int_t N =2 ;
|
||||||
int_t SUBN = 531;
|
int_t SUBN = 2;
|
||||||
|
|
||||||
INIT_VECTOR(N, SUBN, 2, 4, cx, x, ctx);
|
INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx);
|
||||||
INIT_VECTOR(N, SUBN, 5, 8, cy, y, ctx);
|
INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx);
|
||||||
|
|
||||||
#define TEST_OPERATIONS(TYPE)\
|
#define TEST_OPERATIONS(TYPE)\
|
||||||
test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
|
test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
|
||||||
@@ -88,6 +89,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
|||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
clblasSetup();
|
||||||
auto data = ad::driver::queues.contexts();
|
auto data = ad::driver::queues.contexts();
|
||||||
for(const auto & elem : data)
|
for(const auto & elem : data)
|
||||||
{
|
{
|
||||||
@@ -100,5 +102,6 @@ int main()
|
|||||||
test_impl<double>(1e-9, elem.first);
|
test_impl<double>(1e-9, elem.first);
|
||||||
std::cout << "---" << std::endl;
|
std::cout << "---" << std::endl;
|
||||||
}
|
}
|
||||||
|
clblasTeardown();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
@@ -20,7 +20,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
|||||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||||
int_t N = cz.size();
|
int_t N = cz.size();
|
||||||
|
|
||||||
T aa = 3.12, bb=3.5;
|
T aa = 4.378, bb=3.5;
|
||||||
isaac::value_scalar a(aa), b(bb);
|
isaac::value_scalar a(aa), b(bb);
|
||||||
isaac::scalar da(a, ctx), db(b, ctx);
|
isaac::scalar da(a, ctx), db(b, ctx);
|
||||||
|
|
||||||
@@ -32,6 +32,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
|||||||
for(int_t i = 0 ; i < N ; ++i)\
|
for(int_t i = 0 ; i < N ; ++i)\
|
||||||
CPU_LOOP;\
|
CPU_LOOP;\
|
||||||
GPU_EXPR;\
|
GPU_EXPR;\
|
||||||
|
queue.synchronize();\
|
||||||
isaac::copy(z, buffer.data());\
|
isaac::copy(z, buffer.data());\
|
||||||
CONVERT;\
|
CONVERT;\
|
||||||
if(diff(cz, buffer, epsilon))\
|
if(diff(cz, buffer, epsilon))\
|
||||||
@@ -44,16 +45,18 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
|||||||
}
|
}
|
||||||
|
|
||||||
#define PREFIX "[C]"
|
#define PREFIX "[C]"
|
||||||
RUN_TEST_VECTOR_AXPY("AXPY", cy[i] = cx[i] + a*cy[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL));
|
1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
RUN_TEST_VECTOR_AXPY("COPY", cy[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL));
|
1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
RUN_TEST_VECTOR_AXPY("SCAL", cx[i] = a*cx[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||||
1, &clqueue, 0, NULL, NULL));
|
1, &clqueue, 0, NULL, NULL));
|
||||||
|
|
||||||
|
|
||||||
#undef PREFIX
|
#undef PREFIX
|
||||||
#define PREFIX "[C++]"
|
#define PREFIX "[C++]"
|
||||||
RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, ctx))
|
RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, ctx))
|
||||||
@@ -136,6 +139,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
|||||||
|
|
||||||
int main()
|
int main()
|
||||||
{
|
{
|
||||||
|
clblasSetup();
|
||||||
auto data = ad::driver::queues.contexts();
|
auto data = ad::driver::queues.contexts();
|
||||||
for(const auto & elem : data)
|
for(const auto & elem : data)
|
||||||
{
|
{
|
||||||
@@ -148,5 +152,6 @@ int main()
|
|||||||
test_impl<double>(1e-9, elem.first);
|
test_impl<double>(1e-9, elem.first);
|
||||||
std::cout << "---" << std::endl;
|
std::cout << "---" << std::endl;
|
||||||
}
|
}
|
||||||
|
clblasTeardown();
|
||||||
return EXIT_SUCCESS;
|
return EXIT_SUCCESS;
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user