Backend: numerous bugfixes in dot() for more robust shape handling

Philippe Tillet
2015-06-30 17:55:57 -04:00
parent e7cabf65ac
commit cf2dba43ef
12 changed files with 108 additions and 73 deletions

View File

@@ -482,13 +482,17 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_LEQ_TYPE, operator <=, INT_TYPE)
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_EQ_TYPE, operator ==, INT_TYPE)
DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_NEQ_TYPE, operator !=, INT_TYPE)
#define DEFINE_OUTER(LTYPE, RTYPE) \
array_expression outer(LTYPE const & x, RTYPE const & y)\
{\
assert(x.nshape()==1 && y.nshape()==1);\
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );\
}\
array_expression outer(array const & x, array const & y)
{
assert(x.nshape()==1 && y.nshape()==1);
return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );
}
DEFINE_OUTER(array, array)
DEFINE_OUTER(array_expression, array)
DEFINE_OUTER(array, array_expression)
DEFINE_OUTER(array_expression, array_expression)
#undef DEFINE_ELEMENT_BINARY_OPERATOR
//---------------------------------------
@@ -705,6 +709,10 @@ namespace detail
int_t N = A.shape()[1];
array_expression::node & A_root = const_cast<array_expression::node &>(A.tree()[A.root()]);
bool A_trans = A_root.op.type==OPERATOR_TRANS_TYPE;
while(A_root.lhs.type_family==COMPOSITE_OPERATOR_FAMILY){
A_root = A.tree()[A_root.lhs.node_index];
A_trans ^= A_root.op.type==OPERATOR_TRANS_TYPE;
}
if(A_trans)
{
array_expression tmp(A, repmat(x, 1, M), op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ELEMENT_PROD_TYPE), A.context(), A.dtype(), size4(N, M));
@@ -717,12 +725,6 @@ namespace detail
}
array_expression matvecprod(array_expression const & A, array_expression const & x)
{
return matvecprod(A, array(x));
}
}
array_expression reshape(array const & x, int_t shape0, int_t shape1)
@@ -735,22 +737,23 @@ array_expression reshape(array_expression const & x, int_t shape0, int_t shape1)
#define DEFINE_DOT(LTYPE, RTYPE) \
array_expression dot(LTYPE const & x, RTYPE const & y)\
{\
if(x.nshape()==1 && y.nshape()==1)\
{\
return sum(x*y);\
if(x.nshape()<1 || y.nshape()<1){\
return x*y;\
}\
if(x.nshape()==1 && y.nshape()==1){\
if(x.shape()[1]==1 && y.shape()[0]==1)\
return outer(x, y);\
else if(x.shape()[0]==1 && y.shape()[1]==1)\
return sum(x*trans(y));\
else\
return sum(x*y);\
}\
else if(x.nshape()==2 && y.nshape()==1)\
{\
return detail::matvecprod(x, y);\
}\
else if(x.nshape()==1 && y.nshape()==2)\
{\
return detail::matvecprod(trans(y), x);\
}\
return trans(detail::matvecprod(trans(y), trans(x)));\
else /*if(x.nshape()==2 && y.nshape()==2)*/\
{\
return detail::matmatprod(x, y);\
}\
}
DEFINE_DOT(array, array)
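
For reference, a standalone sketch of the shape dispatch that the rewritten DEFINE_DOT macro above encodes. It uses a toy shape type instead of the library's array/array_expression classes, and ndims() stands in for what nshape() is assumed to return (the number of non-unit dimensions):

#include <cstdio>
#include <utility>

using shape_t = std::pair<long, long>;             // (rows, cols); (n, 1) is a column vector
static int ndims(shape_t s) { return (s.first > 1) + (s.second > 1); }

static const char * dispatch(shape_t x, shape_t y)
{
  if (ndims(x) < 1 || ndims(y) < 1)                 // at least one scalar operand
    return "x*y (element-wise)";
  if (ndims(x) == 1 && ndims(y) == 1) {             // two vectors
    if (x.second == 1 && y.first == 1)              // column times row
      return "outer(x, y)";
    if (x.first == 1 && y.second == 1)              // row times column
      return "sum(x*trans(y))";
    return "sum(x*y)";                              // same orientation
  }
  if (ndims(x) == 2 && ndims(y) == 1)               // matrix times vector
    return "matvecprod(x, y)";
  if (ndims(x) == 1 && ndims(y) == 2)               // vector times matrix
    return "trans(matvecprod(trans(y), trans(x)))";
  return "matmatprod(x, y)";                        // matrix times matrix
}

int main()
{
  std::printf("%s\n", dispatch({4, 1}, {1, 3}));    // outer product
  std::printf("%s\n", dispatch({1, 4}, {4, 1}));    // inner product of a row and a column
  std::printf("%s\n", dispatch({4, 3}, {3, 1}));    // matrix-vector product
  std::printf("%s\n", dispatch({4, 3}, {3, 5}));    // matrix-matrix product
  return 0;
}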

View File

@@ -47,7 +47,8 @@ namespace detail
bool bypass(op_element const & op)
{
return op.type == OPERATOR_RESHAPE_TYPE;
return op.type == OPERATOR_RESHAPE_TYPE
||op.type == OPERATOR_TRANS_TYPE;
}
bool is_cast(op_element const & op)
@@ -68,8 +69,7 @@ namespace detail
bool is_node_leaf(op_element const & op)
{
return op.type==OPERATOR_TRANS_TYPE
|| op.type==OPERATOR_MATRIX_DIAG_TYPE
return op.type==OPERATOR_MATRIX_DIAG_TYPE
|| op.type==OPERATOR_VDIAG_TYPE
|| op.type==OPERATOR_REPEAT_TYPE
|| op.type==OPERATOR_MATRIX_ROW_TYPE
@@ -212,8 +212,6 @@ const char * evaluate(operation_node_type type)
case OPERATOR_ELEMENT_FMIN_TYPE : return "fmin";
case OPERATOR_ELEMENT_MAX_TYPE : return "max";
case OPERATOR_ELEMENT_MIN_TYPE : return "min";
//Unary
case OPERATOR_TRANS_TYPE : return "trans";
//Binary
case OPERATOR_MATRIX_PRODUCT_NN_TYPE : return "prodNN";
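
Since OPERATOR_TRANS_TYPE is now bypassed during parsing rather than treated as a kernel-level leaf, callers such as matvecprod() have to fold any transpositions found along the left-hand chain themselves, which is what the new while-loop in the first file does. A toy illustration of that folding, with an assumed minimal node layout rather than the real expression-tree types:

#include <cstdio>
#include <vector>

enum op_t { TRANS, RESHAPE, LEAF };
struct node { op_t op; int lhs; };    // lhs: index of the child node, -1 for a leaf

int main()
{
  // trans(reshape(trans(A))) as a small expression tree, root stored last
  std::vector<node> tree = {{LEAF, -1}, {TRANS, 0}, {RESHAPE, 1}, {TRANS, 2}};
  int cur = 3;                        // root
  bool trans = tree[cur].op == TRANS;
  while (tree[cur].lhs != -1) {       // walk down through composite nodes
    cur = tree[cur].lhs;
    trans ^= tree[cur].op == TRANS;   // each extra transposition flips the flag
  }
  std::printf("net transposition: %s\n", trans ? "yes" : "no"); // the two TRANS cancel out
  return 0;
}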

View File

@@ -82,6 +82,13 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
stream.dec_tab();
stream << "}" << std::endl;
stream << "if(" << GlobalIdx0(backend) << "==0 &&" << GlobalIdx1(backend) << "==0)" << std::endl;
stream << "{" << std::endl;
stream.inc_tab();
process(stream, LHS_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array0", "#pointer[#start] = #namereg;"), expressions, mappings);
stream.dec_tab();
stream << "}" << std::endl;
stream.dec_tab();
stream << "}" << std::endl;

View File

@@ -567,7 +567,6 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
value_scalar const & alpha, value_scalar const & beta,
driver::Program & program, const char * suffix, execution_options_type const & options)
{
if(M==0 || N==0 || K==0)
return;
@@ -588,8 +587,10 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
driver::Kernel gemm(program, gemm_name);
driver::NDRange local(p_.local_size_0, p_.local_size_1);
using tools::align;
driver::NDRange global = (strcmp(suffix,"fallback")==0)?driver::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth):driver::NDRange(M/p_.mS, N/p_.nS, p_.depth);
unsigned int current_arg = 0;
set_arguments_functor helper(binder, current_arg, gemm);
gemm.setSizeArg(current_arg++, M);
@@ -611,9 +612,14 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
gemm.setSizeArg(current_arg++, B.start()[0] + B.start()[1]*B.ld()/p_.simd_width);
gemm.setSizeArg(current_arg++, B.stride()[0]);
// std::cout << "before " << *out << std::endl;
helper.set_arguments(beta.dtype(), beta.values());
options.enqueue(program.context(), gemm, global, local);
options.queue(program.context()).synchronize();
// std::cout << "after " << *out << std::endl;
if(p_.depth > 1)
{
unsigned int current_arg = 0;
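
A small sketch of the launch-size arithmetic in the fallback branch above, assuming tools::align(x, a) rounds x up to the next multiple of a (the non-fallback path divides M/p_.mS and N/p_.nS directly, presumably relying on sizes that divide evenly):

#include <cstdio>

// assumed semantics of tools::align: round x up to the next multiple of a
static unsigned align(unsigned x, unsigned a) { return (x + a - 1) / a * a; }

int main()
{
  unsigned M = 427, mS = 4, local_size_0 = 8;      // example values, not taken from any profile
  unsigned tiles = align(M, mS) / mS;              // ceil(427/4) = 107 row tiles
  unsigned g0    = align(tiles, local_size_0);     // rounded up to a multiple of the work-group size: 112
  std::printf("global size 0 = %u\n", g0);
  return 0;
}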

View File

@@ -33,6 +33,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
{
using tools::to_string;
std::vector<mapped_mreduction*> reductions;
expressions_tuple::data_type::const_iterator sit;
std::vector<mapping_type>::const_iterator mit;
@@ -114,6 +115,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
{
std::string data_type = append_width("#scalartype",simd_width);
for (const auto & e : reductions)
{
std::map<std::string, std::string> accessors;
@@ -130,7 +132,6 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
}
//Update accumulators
std::vector<std::string> str(simd_width);
if (simd_width==1)
@@ -240,6 +241,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple con
stream << _size_t << " gsize1 = " << GlobalSize1(backend) <<";" << std::endl;
stream << _size_t << " upper_bound_1 = ( M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << ";" << std::endl;
stream << "for(" << _size_t << " r = gid1; r < upper_bound_1; r += gsize1){" << std::endl;
stream.inc_tab();

View File

@@ -256,8 +256,8 @@ std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<base> > ini
res[std::make_pair(MATRIX_AXPY_TYPE, DTYPE)] = ptr_t(new maxpy(1,8,8,8,8,FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(ROW_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_rows(1, 8, 8, 4, 16, FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(COL_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_cols(1, 8, 8, 64, 8, FETCH_FROM_GLOBAL_STRIDED));
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 1, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 1, 1, 1, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new mproduct_nt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new mproduct_tt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
}

View File

@@ -59,7 +59,7 @@ extern "C"
clRetainMemObject(mx); \
is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
clRetainMemObject(my); \
execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
execute(is::assign(y, alpha*x + y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
return clblasSuccess; \
}
@@ -157,15 +157,14 @@ extern "C"
std::swap(M, N);\
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
}\
is::int_t As1 = M, As2 = N;\
if(transA==clblasTrans) std::swap(As1, As2);\
is::array A(As1, As2, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
is::array A(M, N, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
clRetainMemObject(mA);\
\
is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
is::int_t sx = N, sy = M;\
if(transA) std::swap(sx, sy);\
is::array x(sx, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
clRetainMemObject(mx);\
\
is::array y(M, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
is::array y(sy, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
clRetainMemObject(my);\
\
is::driver::Context const & context = A.context();\
@@ -182,6 +181,7 @@ extern "C"
//*****************
//BLAS3
//*****************
#define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CL) \
clblasStatus clblas ## TYPE_CHAR ## gemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,\
size_t M, size_t N, size_t K,\
@@ -198,8 +198,7 @@ extern "C"
std::swap(offA, offB);\
std::swap(lda, ldb);\
std::swap(M, N);\
transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
transB = (transB==clblasTrans)?clblasNoTrans:clblasTrans;\
std::swap(transA, transB);\
}\
is::int_t As1 = M, As2 = K;\
is::int_t Bs1 = K, Bs2 = N;\
@@ -214,9 +213,8 @@ extern "C"
clRetainMemObject(mC);\
is::driver::Context const & context = C.context();\
/*Operation*/\
if((transA==clblasTrans) && (transB==clblasTrans)){\
if((transA==clblasTrans) && (transB==clblasTrans))\
execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
}\
else if((transA==clblasTrans) && (transB==clblasNoTrans))\
execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
else if((transA==clblasNoTrans) && (transB==clblasTrans))\
@@ -229,4 +227,6 @@ extern "C"
MAKE_GEMM(S, is::FLOAT_TYPE, cl_float)
MAKE_GEMM(D, is::DOUBLE_TYPE, cl_double)
#undef DOT
}
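
The row-major branch of the GEMM wrapper maps the call onto the column-major implementation by swapping A/B, lda/ldb and M/N; the hunk above additionally replaces the two flag flips with a single std::swap(transA, transB), so each transpose flag keeps following its matrix (flipping both flags only coincides with the correct mapping when transA != transB). A plain, self-contained C++ check of the underlying identity, (A*B)^T = B^T * A^T, for the no-transpose case (this is not the wrapper code itself):

#include <cassert>
#include <vector>

// Naive column-major GEMM: C(MxN) = A(MxK) * B(KxN), no transposes.
static void gemm_cm(int M, int N, int K,
                    const std::vector<double>& A, const std::vector<double>& B,
                    std::vector<double>& C)
{
  for (int j = 0; j < N; ++j)
    for (int i = 0; i < M; ++i) {
      double s = 0;
      for (int k = 0; k < K; ++k)
        s += A[i + k * M] * B[k + j * K];
      C[i + j * M] = s;
    }
}

int main()
{
  const int M = 2, N = 3, K = 4;
  // A (MxK) and B (KxN) stored row-major, as a clblasRowMajor caller would pass them.
  std::vector<double> A = {1, 2, 3, 4,
                           5, 6, 7, 8};             // 2x4
  std::vector<double> B = {1, 0, 2,
                           0, 1, 0,
                           3, 0, 1,
                           0, 2, 0};                // 4x3
  // Reference: C = A*B computed directly in row-major order.
  std::vector<double> Cref(M * N, 0);
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j)
      for (int k = 0; k < K; ++k)
        Cref[i * N + j] += A[i * K + k] * B[k * N + j];

  // Wrapper trick: the row-major buffers, read as column-major, are A^T and B^T.
  // Computing B^T * A^T (operands swapped, sizes NxM) yields C^T in column-major
  // storage, which is byte-for-byte C in row-major storage.
  std::vector<double> C(M * N, 0);
  gemm_cm(N, M, K, /*first operand=*/B, /*second operand=*/A, C);

  assert(C == Cref);                                // identical buffers
  return 0;
}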

View File

@@ -115,7 +115,7 @@ def main():
include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
#Source files
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
src = 'src/lib/array.cpp src/lib/wrap/clBLAS.cpp src/lib/value_scalar.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/driver/program.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/check.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/device.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/ndrange.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/backend/parse.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/stream.cpp src/lib/backend/mapped_object.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

View File

@@ -29,7 +29,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
T cij = 0;
for(int k = 0 ; k < K ; ++k)
cij += cA(i,k)*cB(k,j);
cC(i,j) = cij;
cC(i,j) = alpha*cij + beta*cC(i, j);
}
}
@@ -43,6 +43,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
#define RUN_TEST(NAME, GPU_OP)\
std::cout << "[" << prefix << "] \t" << NAME << "..." << std::flush;\
GPU_OP;\
queue.synchronize();\
ad::copy(C, buffer);\
if(diff(buffer, cCbuffer, epsilon))\
{\
@@ -57,20 +58,22 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> cons
cl_command_queue clqueue = (*queue.handle().cl)();
//Row-major
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
RUN_TEST("GEMM(ROW, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(ROW, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(ROW, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
RUN_TEST("GEMM(ROW, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
//Column-major
RUN_TEST("GEMM(COL, N, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(COL, N, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(COL, T, N)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("GEMM(COL, T, T)", BLAS<T>::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
@@ -92,9 +95,10 @@ template<typename T>
void test_impl(T epsilon, ad::driver::Context const & ctx)
{
int_t M = 412;
int_t N = 248;
int_t K = 376;
int_t M = 427;
int_t N = 248;
int_t K = 376;
int_t SUBM = 61;
int_t SUBN = 75;
@@ -120,6 +124,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
int main()
{
clblasSetup();
auto data = ad::driver::queues.contexts();
for(const auto & elem : data)
{
@@ -132,5 +137,6 @@ int main()
test_impl<double>(1e-9, elem.first);
std::cout << "---" << std::endl;
}
clblasTeardown();
return EXIT_SUCCESS;
}

View File

@@ -18,6 +18,8 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
simple_vector<T> bufy(M);
simple_vector<T> bufx(N);
T alpha = 4.2, beta = 1.8;
ad::driver::CommandQueue queue = ad::driver::queues[y.context()][0];
T yi = 0, xi = 0;
@@ -32,6 +34,7 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
ASSIGNMENT;\
}\
GPU_REDUCTION;\
queue.synchronize();\
ad::copy(RES, BUF.data());\
if(diff(CRES, BUF, epsilon))\
{\
@@ -47,24 +50,24 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
cl_command_queue clqueue = (*queue.handle().cl)();
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(x), x.start()[0], x.stride()[0], beta, CHANDLE(y), y.start()[0], y.stride()[0],
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
1, &clqueue, 0, NULL, NULL), y, bufy, cy);
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
BLAS<T>::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
1, &clqueue, 0, NULL, NULL), x, bufx, cx);
}
else
@@ -102,6 +105,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
int main()
{
clblasSetup();
auto data = ad::driver::queues.contexts();
for(const auto & elem : data)
{
@@ -114,5 +118,6 @@ int main()
test_impl<double>(1e-9, elem.first);
std::cout << "---" << std::endl;
}
clblasTeardown();
return EXIT_SUCCESS;
}

View File

@@ -17,6 +17,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
int_t N = cx.size();
ad::driver::CommandQueue queue = ad::driver::queues[ctx][0];
cl_command_queue clqueue = (*queue.handle().cl)();
ad::array scratch(N, x.dtype());
unsigned int failure_count = 0;
@@ -33,6 +34,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
CPU_REDUCTION;\
cs= ASSIGNMENT ;\
GPU_REDUCTION;\
queue.synchronize();\
tmp = ds;\
if((std::abs(cs - tmp)/std::max(cs, tmp)) > epsilon)\
{\
@@ -45,10 +47,9 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
#define PREFIX "[C]"
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
0, 1, &clqueue, 0, NULL, NULL));
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
0, 1, &clqueue, 0, NULL, NULL));
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
#undef PREFIX
#define PREFIX "[C++]"
@@ -70,11 +71,11 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
{
using isaac::_;
int_t N = 24378;
int_t SUBN = 531;
int_t N =2 ;
int_t SUBN = 2;
INIT_VECTOR(N, SUBN, 2, 4, cx, x, ctx);
INIT_VECTOR(N, SUBN, 5, 8, cy, y, ctx);
INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx);
INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx);
#define TEST_OPERATIONS(TYPE)\
test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
@@ -88,6 +89,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
int main()
{
clblasSetup();
auto data = ad::driver::queues.contexts();
for(const auto & elem : data)
{
@@ -100,5 +102,6 @@ int main()
test_impl<double>(1e-9, elem.first);
std::cout << "---" << std::endl;
}
clblasTeardown();
return EXIT_SUCCESS;
}

View File

@@ -20,7 +20,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
cl_command_queue clqueue = (*queue.handle().cl)();
int_t N = cz.size();
T aa = 3.12, bb=3.5;
T aa = 4.378, bb=3.5;
isaac::value_scalar a(aa), b(bb);
isaac::scalar da(a, ctx), db(b, ctx);
@@ -32,6 +32,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
for(int_t i = 0 ; i < N ; ++i)\
CPU_LOOP;\
GPU_EXPR;\
queue.synchronize();\
isaac::copy(z, buffer.data());\
CONVERT;\
if(diff(cz, buffer, epsilon))\
@@ -44,16 +45,18 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
}
#define PREFIX "[C]"
RUN_TEST_VECTOR_AXPY("AXPY", cy[i] = cx[i] + a*cy[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
RUN_TEST_VECTOR_AXPY("COPY", cy[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
(*y.data().handle().cl)(), y.start()[0], y.stride()[0],
RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
(*z.data().handle().cl)(), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
RUN_TEST_VECTOR_AXPY("SCAL", cx[i] = a*cx[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
#undef PREFIX
#define PREFIX "[C++]"
RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, ctx))
@@ -136,6 +139,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
int main()
{
clblasSetup();
auto data = ad::driver::queues.contexts();
for(const auto & elem : data)
{
@@ -148,5 +152,6 @@ int main()
test_impl<double>(1e-9, elem.first);
std::cout << "---" << std::endl;
}
clblasTeardown();
return EXIT_SUCCESS;
}