Backend: A lot of bugfixes in dot() for handling shapes better
This commit is contained in:
@@ -482,13 +482,17 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_LEQ_TYPE, operator <=, INT_TYPE)
|
||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_EQ_TYPE, operator ==, INT_TYPE)
|
||||
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_NEQ_TYPE, operator !=, INT_TYPE)
|
||||
|
||||
#define DEFINE_OUTER(LTYPE, RTYPE) \
|
||||
array_expression outer(LTYPE const & x, RTYPE const & y)\
|
||||
{\
|
||||
assert(x.nshape()==1 && y.nshape()==1);\
|
||||
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );\
|
||||
}\
|
||||
|
||||
array_expression outer(array const & x, array const & y)
|
||||
{
|
||||
assert(x.nshape()==1 && y.nshape()==1);
|
||||
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );
|
||||
}
|
||||
|
||||
DEFINE_OUTER(array, array)
|
||||
DEFINE_OUTER(array_expression, array)
|
||||
DEFINE_OUTER(array, array_expression)
|
||||
DEFINE_OUTER(array_expression, array_expression)
|
||||
|
||||
#undef DEFINE_ELEMENT_BINARY_OPERATOR
|
||||
//---------------------------------------
|
||||
@@ -705,6 +709,10 @@ namespace detail
|
||||
int_t N = A.shape()[1];
|
||||
array_expression::node & A_root = const_cast<array_expression::node &>(A.tree()[A.root()]);
|
||||
bool A_trans = A_root.op.type==OPERATOR_TRANS_TYPE;
|
||||
while(A_root.lhs.type_family==COMPOSITE_OPERATOR_FAMILY){
|
||||
A_root = A.tree()[A_root.lhs.node_index];
|
||||
A_trans ^= A_root.op.type==OPERATOR_TRANS_TYPE;
|
||||
}
|
||||
if(A_trans)
|
||||
{
|
||||
array_expression tmp(A, repmat(x, 1, M), op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ELEMENT_PROD_TYPE), A.context(), A.dtype(), size4(N, M));
|
||||
@@ -717,12 +725,6 @@ namespace detail
|
||||
|
||||
}
|
||||
|
||||
array_expression matvecprod(array_expression const & A, array_expression const & x)
|
||||
{
|
||||
return matvecprod(A, array(x));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
array_expression reshape(array const & x, int_t shape0, int_t shape1)
|
||||
@@ -735,22 +737,23 @@ array_expression reshape(array_expression const & x, int_t shape0, int_t shape1)
|
||||
#define DEFINE_DOT(LTYPE, RTYPE) \
|
||||
array_expression dot(LTYPE const & x, RTYPE const & y)\
|
||||
{\
|
||||
if(x.nshape()==1 && y.nshape()==1)\
|
||||
{\
|
||||
if(x.nshape()<1 || y.nshape()<1){\
|
||||
return x*y;\
|
||||
}\
|
||||
if(x.nshape()==1 && y.nshape()==1){\
|
||||
if(x.shape()[1]==1 && y.shape()[0]==1)\
|
||||
return outer(x, y);\
|
||||
else if(x.shape()[0]==1 && y.shape()[1]==1)\
|
||||
return sum(x*trans(y));\
|
||||
else\
|
||||
return sum(x*y);\
|
||||
}\
|
||||
else if(x.nshape()==2 && y.nshape()==1)\
|
||||
{\
|
||||
return detail::matvecprod(x, y);\
|
||||
}\
|
||||
else if(x.nshape()==1 && y.nshape()==2)\
|
||||
{\
|
||||
return detail::matvecprod(trans(y), x);\
|
||||
}\
|
||||
return trans(detail::matvecprod(trans(y), trans(x)));\
|
||||
else /*if(x.nshape()==2 && y.nshape()==2)*/\
|
||||
{\
|
||||
return detail::matmatprod(x, y);\
|
||||
}\
|
||||
}
|
||||
|
||||
DEFINE_DOT(array, array)
|
||||
|
@@ -47,7 +47,8 @@ namespace detail
|
||||
|
||||
bool bypass(op_element const & op)
|
||||
{
|
||||
return op.type == OPERATOR_RESHAPE_TYPE;
|
||||
return op.type == OPERATOR_RESHAPE_TYPE
|
||||
||op.type == OPERATOR_TRANS_TYPE;
|
||||
}
|
||||
|
||||
bool is_cast(op_element const & op)
|
||||
@@ -68,8 +69,7 @@ namespace detail
|
||||
|
||||
bool is_node_leaf(op_element const & op)
|
||||
{
|
||||
return op.type==OPERATOR_TRANS_TYPE
|
||||
|| op.type==OPERATOR_MATRIX_DIAG_TYPE
|
||||
return op.type==OPERATOR_MATRIX_DIAG_TYPE
|
||||
|| op.type==OPERATOR_VDIAG_TYPE
|
||||
|| op.type==OPERATOR_REPEAT_TYPE
|
||||
|| op.type==OPERATOR_MATRIX_ROW_TYPE
|
||||
@@ -212,8 +212,6 @@ const char * evaluate(operation_node_type type)
|
||||
case OPERATOR_ELEMENT_FMIN_TYPE : return "fmin";
|
||||
case OPERATOR_ELEMENT_MAX_TYPE : return "max";
|
||||
case OPERATOR_ELEMENT_MIN_TYPE : return "min";
|
||||
//Unary
|
||||
case OPERATOR_TRANS_TYPE : return "trans";
|
||||
|
||||
//Binary
|
||||
case OPERATOR_MATRIX_PRODUCT_NN_TYPE : return "prodNN";
|
||||
|
@@ -82,6 +82,13 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream << "if(" << GlobalIdx0(backend) << "==0 &&" << GlobalIdx1(backend) << "==0)" << std::endl;
|
||||
stream << "{" << std::endl;
|
||||
stream.inc_tab();
|
||||
process(stream, LHS_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array0", "#pointer[#start] = #namereg;"), expressions, mappings);
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
stream.dec_tab();
|
||||
stream << "}" << std::endl;
|
||||
|
||||
|
@@ -567,7 +567,6 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
value_scalar const & alpha, value_scalar const & beta,
|
||||
driver::Program & program, const char * suffix, execution_options_type const & options)
|
||||
{
|
||||
|
||||
if(M==0 || N==0 || K==0)
|
||||
return;
|
||||
|
||||
@@ -588,8 +587,10 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
driver::Kernel gemm(program, gemm_name);
|
||||
driver::NDRange local(p_.local_size_0, p_.local_size_1);
|
||||
|
||||
|
||||
using tools::align;
|
||||
driver::NDRange global = (strcmp(suffix,"fallback")==0)?driver::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth):driver::NDRange(M/p_.mS, N/p_.nS, p_.depth);
|
||||
|
||||
unsigned int current_arg = 0;
|
||||
set_arguments_functor helper(binder, current_arg, gemm);
|
||||
gemm.setSizeArg(current_arg++, M);
|
||||
@@ -611,9 +612,14 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
|
||||
gemm.setSizeArg(current_arg++, B.start()[0] + B.start()[1]*B.ld()/p_.simd_width);
|
||||
gemm.setSizeArg(current_arg++, B.stride()[0]);
|
||||
|
||||
// std::cout << "before " << *out << std::endl;
|
||||
|
||||
helper.set_arguments(beta.dtype(), beta.values());
|
||||
options.enqueue(program.context(), gemm, global, local);
|
||||
|
||||
options.queue(program.context()).synchronize();
|
||||
// std::cout << "after " << *out << std::endl;
|
||||
|
||||
if(p_.depth > 1)
|
||||
{
|
||||
unsigned int current_arg = 0;
|
||||
|
@@ -33,6 +33,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
{
|
||||
using tools::to_string;
|
||||
|
||||
|
||||
std::vector<mapped_mreduction*> reductions;
|
||||
expressions_tuple::data_type::const_iterator sit;
|
||||
std::vector<mapping_type>::const_iterator mit;
|
||||
@@ -114,6 +115,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
{
|
||||
std::string data_type = append_width("#scalartype",simd_width);
|
||||
|
||||
|
||||
for (const auto & e : reductions)
|
||||
{
|
||||
std::map<std::string, std::string> accessors;
|
||||
@@ -130,7 +132,6 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
||||
}
|
||||
|
||||
|
||||
//Update accumulators
|
||||
std::vector<std::string> str(simd_width);
|
||||
if (simd_width==1)
|
||||
@@ -240,6 +241,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
|
||||
stream << _size_t << " gsize1 = " << GlobalSize1(backend) <<";" << std::endl;
|
||||
|
||||
|
||||
|
||||
stream << _size_t << " upper_bound_1 = ( M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << ";" << std::endl;
|
||||
stream << "for(" << _size_t << " r = gid1; r < upper_bound_1; r += gsize1){" << std::endl;
|
||||
stream.inc_tab();
|
||||
|
@@ -256,8 +256,8 @@ std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<base> > ini
|
||||
res[std::make_pair(MATRIX_AXPY_TYPE, DTYPE)] = ptr_t(new maxpy(1,8,8,8,8,FETCH_FROM_GLOBAL_STRIDED));
|
||||
res[std::make_pair(ROW_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_rows(1, 8, 8, 4, 16, FETCH_FROM_GLOBAL_STRIDED));
|
||||
res[std::make_pair(COL_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_cols(1, 8, 8, 64, 8, FETCH_FROM_GLOBAL_STRIDED));
|
||||
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 1, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 1, 1, 1, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new mproduct_nt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new mproduct_tt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
|
||||
}
|
||||
|
@@ -59,7 +59,7 @@ extern "C"
|
||||
clRetainMemObject(mx); \
|
||||
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
|
||||
clRetainMemObject(my); \
|
||||
execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
||||
execute(is::assign(y, alpha*x + y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
|
||||
return clblasSuccess; \
|
||||
}
|
||||
|
||||
@@ -157,15 +157,14 @@ extern "C"
|
||||
std::swap(M, N);\
|
||||
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
|
||||
}\
|
||||
is::int_t As1 = M, As2 = N;\
|
||||
if(transA==clblasTrans) std::swap(As1, As2);\
|
||||
is::array A(As1, As2, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
|
||||
is::array A(M, N, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
|
||||
clRetainMemObject(mA);\
|
||||
\
|
||||
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
||||
is::int_t sx = N, sy = M;\
|
||||
if(transA) std::swap(sx, sy);\
|
||||
is::array x(sx, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
|
||||
clRetainMemObject(mx);\
|
||||
\
|
||||
is::array y(M, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
||||
is::array y(sy, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
|
||||
clRetainMemObject(my);\
|
||||
\
|
||||
is::driver::Context const & context = A.context();\
|
||||
@@ -182,6 +181,7 @@ extern "C"
|
||||
//*****************
|
||||
//BLAS3
|
||||
//*****************
|
||||
|
||||
#define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CL) \
|
||||
clblasStatus clblas ## TYPE_CHAR ## gemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,\
|
||||
size_t M, size_t N, size_t K,\
|
||||
@@ -198,8 +198,7 @@ extern "C"
|
||||
std::swap(offA, offB);\
|
||||
std::swap(lda, ldb);\
|
||||
std::swap(M, N);\
|
||||
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
|
||||
transB = (transB==clblasTrans)?clblasNoTrans:clblasTrans;\
|
||||
std::swap(transA, transB);\
|
||||
}\
|
||||
is::int_t As1 = M, As2 = K;\
|
||||
is::int_t Bs1 = K, Bs2 = N;\
|
||||
@@ -214,9 +213,8 @@ extern "C"
|
||||
clRetainMemObject(mC);\
|
||||
is::driver::Context const & context = C.context();\
|
||||
/*Operation*/\
|
||||
if((transA==clblasTrans) && (transB==clblasTrans)){\
|
||||
if((transA==clblasTrans) && (transB==clblasTrans))\
|
||||
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||
}\
|
||||
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
|
||||
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
|
||||
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
|
||||
@@ -229,4 +227,6 @@ extern "C"
|
||||
MAKE_GEMM(S, is::FLOAT_TYPE, cl_float)
|
||||
MAKE_GEMM(D, is::DOUBLE_TYPE, cl_double)
|
||||
|
||||
#undef DOT
|
||||
|
||||
}
|
||||
|
@@ -115,7 +115,7 @@ def main():
|
||||
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
|
||||
|
||||
#Source files
|
||||
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
||||
src = 'src/lib/array.cpp src/lib/wrap/clBLAS.cpp src/lib/value_scalar.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/driver/program.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/check.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/device.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/ndrange.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/backend/parse.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/stream.cpp src/lib/backend/mapped_object.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
|
||||
boostsrc = 'external/boost/libs/'
|
||||
for s in ['numpy','python','smart_ptr','system','thread']:
|
||||
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
|
||||
|
@@ -29,7 +29,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
||||
T cij = 0;
|
||||
for(int k = 0 ; k < K ; ++k)
|
||||
cij += cA(i,k)*cB(k,j);
|
||||
cC(i,j) = cij;
|
||||
cC(i,j) = alpha*cij + beta*cC(i, j);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,6 +43,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
||||
#define RUN_TEST(NAME, GPU_OP)\
|
||||
std::cout << "[" << prefix << "] \t" << NAME << "..." << std::flush;\
|
||||
GPU_OP;\
|
||||
queue.synchronize();\
|
||||
ad::copy(C, buffer);\
|
||||
if(diff(buffer, cCbuffer, epsilon))\
|
||||
{\
|
||||
@@ -57,20 +58,22 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
|
||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||
|
||||
//Row-major
|
||||
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("GEMM(ROW, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("GEMM(ROW, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
|
||||
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
|
||||
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
//Column-major
|
||||
RUN_TEST("GEMM(COL, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
RUN_TEST("GEMM(COL, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
RUN_TEST("GEMM(COL, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
||||
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("GEMM(COL, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
|
||||
@@ -92,10 +95,11 @@ template<typename T>
|
||||
void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
{
|
||||
|
||||
int_t M = 412;
|
||||
int_t M = 427;
|
||||
int_t N = 248;
|
||||
int_t K = 376;
|
||||
|
||||
|
||||
int_t SUBM = 61;
|
||||
int_t SUBN = 75;
|
||||
int_t SUBK = 83;
|
||||
@@ -120,6 +124,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
|
||||
int main()
|
||||
{
|
||||
clblasSetup();
|
||||
auto data = ad::driver::queues.contexts();
|
||||
for(const auto & elem : data)
|
||||
{
|
||||
@@ -132,5 +137,6 @@ int main()
|
||||
test_impl<double>(1e-9, elem.first);
|
||||
std::cout << "---" << std::endl;
|
||||
}
|
||||
clblasTeardown();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@@ -18,6 +18,8 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
||||
simple_vector<T> bufy(M);
|
||||
simple_vector<T> bufx(N);
|
||||
|
||||
T alpha = 4.2, beta = 1.8;
|
||||
|
||||
ad::driver::CommandQueue queue = ad::driver::queues[y.context()][0];
|
||||
|
||||
T yi = 0, xi = 0;
|
||||
@@ -32,6 +34,7 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
||||
ASSIGNMENT;\
|
||||
}\
|
||||
GPU_REDUCTION;\
|
||||
queue.synchronize();\
|
||||
ad::copy(RES, BUF.data());\
|
||||
if(diff(CRES, BUF, epsilon))\
|
||||
{\
|
||||
@@ -47,24 +50,24 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
|
||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||
|
||||
|
||||
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
|
||||
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(x), x.start()[0], x.stride()[0], beta, CHANDLE(y), y.start()[0], y.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
|
||||
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
|
||||
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
|
||||
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
|
||||
|
||||
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
|
||||
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
|
||||
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
|
||||
}
|
||||
else
|
||||
@@ -102,6 +105,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
|
||||
int main()
|
||||
{
|
||||
clblasSetup();
|
||||
auto data = ad::driver::queues.contexts();
|
||||
for(const auto & elem : data)
|
||||
{
|
||||
@@ -114,5 +118,6 @@ int main()
|
||||
test_impl<double>(1e-9, elem.first);
|
||||
std::cout << "---" << std::endl;
|
||||
}
|
||||
clblasTeardown();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@@ -17,6 +17,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
||||
int_t N = cx.size();
|
||||
ad::driver::CommandQueue queue = ad::driver::queues[ctx][0];
|
||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||
ad::array scratch(N, x.dtype());
|
||||
|
||||
unsigned int failure_count = 0;
|
||||
|
||||
@@ -33,6 +34,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
||||
CPU_REDUCTION;\
|
||||
cs= ASSIGNMENT ;\
|
||||
GPU_REDUCTION;\
|
||||
queue.synchronize();\
|
||||
tmp = ds;\
|
||||
if((std::abs(cs - tmp)/std::max(cs, tmp)) > epsilon)\
|
||||
{\
|
||||
@@ -45,10 +47,9 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
|
||||
#define PREFIX "[C]"
|
||||
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
||||
0, 1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
0, 1, &clqueue, 0, NULL, NULL));
|
||||
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
|
||||
#undef PREFIX
|
||||
#define PREFIX "[C++]"
|
||||
|
||||
@@ -70,11 +71,11 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
{
|
||||
using isaac::_;
|
||||
|
||||
int_t N = 24378;
|
||||
int_t SUBN = 531;
|
||||
int_t N =2 ;
|
||||
int_t SUBN = 2;
|
||||
|
||||
INIT_VECTOR(N, SUBN, 2, 4, cx, x, ctx);
|
||||
INIT_VECTOR(N, SUBN, 5, 8, cy, y, ctx);
|
||||
INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx);
|
||||
INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx);
|
||||
|
||||
#define TEST_OPERATIONS(TYPE)\
|
||||
test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
|
||||
@@ -88,6 +89,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
|
||||
int main()
|
||||
{
|
||||
clblasSetup();
|
||||
auto data = ad::driver::queues.contexts();
|
||||
for(const auto & elem : data)
|
||||
{
|
||||
@@ -100,5 +102,6 @@ int main()
|
||||
test_impl<double>(1e-9, elem.first);
|
||||
std::cout << "---" << std::endl;
|
||||
}
|
||||
clblasTeardown();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
@@ -20,7 +20,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
||||
cl_command_queue clqueue = (*queue.handle().cl)();
|
||||
int_t N = cz.size();
|
||||
|
||||
T aa = 3.12, bb=3.5;
|
||||
T aa = 4.378, bb=3.5;
|
||||
isaac::value_scalar a(aa), b(bb);
|
||||
isaac::scalar da(a, ctx), db(b, ctx);
|
||||
|
||||
@@ -32,6 +32,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
||||
for(int_t i = 0 ; i < N ; ++i)\
|
||||
CPU_LOOP;\
|
||||
GPU_EXPR;\
|
||||
queue.synchronize();\
|
||||
isaac::copy(z, buffer.data());\
|
||||
CONVERT;\
|
||||
if(diff(cz, buffer, epsilon))\
|
||||
@@ -44,16 +45,18 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
|
||||
}
|
||||
|
||||
#define PREFIX "[C]"
|
||||
RUN_TEST_VECTOR_AXPY("AXPY", cy[i] = cx[i] + a*cy[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
||||
RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
RUN_TEST_VECTOR_AXPY("COPY", cy[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
|
||||
RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
RUN_TEST_VECTOR_AXPY("SCAL", cx[i] = a*cx[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
|
||||
RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
|
||||
1, &clqueue, 0, NULL, NULL));
|
||||
|
||||
|
||||
#undef PREFIX
|
||||
#define PREFIX "[C++]"
|
||||
RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, ctx))
|
||||
@@ -136,6 +139,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
|
||||
|
||||
int main()
|
||||
{
|
||||
clblasSetup();
|
||||
auto data = ad::driver::queues.contexts();
|
||||
for(const auto & elem : data)
|
||||
{
|
||||
@@ -148,5 +152,6 @@ int main()
|
||||
test_impl<double>(1e-9, elem.first);
|
||||
std::cout << "---" << std::endl;
|
||||
}
|
||||
clblasTeardown();
|
||||
return EXIT_SUCCESS;
|
||||
}
|
||||
|
Reference in New Issue
Block a user