diff --git a/lib/array.cpp b/lib/array.cpp
index a3e399130..5a73a68c6 100644
--- a/lib/array.cpp
+++ b/lib/array.cpp
@@ -482,13 +482,17 @@ DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_LEQ_TYPE, operator <=, INT_TYPE)
 DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_EQ_TYPE, operator ==, INT_TYPE)
 DEFINE_ELEMENT_BINARY_OPERATOR(OPERATOR_ELEMENT_NEQ_TYPE, operator !=, INT_TYPE)
 
+#define DEFINE_OUTER(LTYPE, RTYPE) \
+array_expression outer(LTYPE const & x, RTYPE const & y)\
+{\
+  assert(x.nshape()==1 && y.nshape()==1);\
+  return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );\
+}
-array_expression outer(array const & x, array const & y)
-{
-  assert(x.nshape()==1 && y.nshape()==1);
-  return array_expression(x, y, op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_OUTER_PROD_TYPE), x.context(), x.dtype(), size4(max(x.shape()), max(y.shape())) );
-}
-
+DEFINE_OUTER(array, array)
+DEFINE_OUTER(array_expression, array)
+DEFINE_OUTER(array, array_expression)
+DEFINE_OUTER(array_expression, array_expression)
 
 #undef DEFINE_ELEMENT_BINARY_OPERATOR
 
 //---------------------------------------
@@ -705,6 +709,10 @@ namespace detail
     int_t N = A.shape()[1];
     array_expression::node & A_root = const_cast<array_expression::node &>(A.tree()[A.root()]);
     bool A_trans = A_root.op.type==OPERATOR_TRANS_TYPE;
+    while(A_root.lhs.type_family==COMPOSITE_OPERATOR_FAMILY){
+      A_root = A.tree()[A_root.lhs.node_index];
+      A_trans ^= A_root.op.type==OPERATOR_TRANS_TYPE;
+    }
     if(A_trans)
     {
       array_expression tmp(A, repmat(x, 1, M), op_element(OPERATOR_BINARY_TYPE_FAMILY, OPERATOR_ELEMENT_PROD_TYPE), A.context(), A.dtype(), size4(N, M));
@@ -717,12 +725,6 @@ namespace detail
   }
 
-  array_expression matvecprod(array_expression const & A, array_expression const & x)
-  {
-    return matvecprod(A, array(x));
-  }
-
-
 }
 
 array_expression reshape(array const & x, int_t shape0, int_t shape1)
@@ -735,22 +737,23 @@ array_expression reshape(array_expression const & x, int_t shape0, int_t shape1)
 #define DEFINE_DOT(LTYPE, RTYPE) \
 array_expression dot(LTYPE const & x, RTYPE const & y)\
 {\
-  if(x.nshape()==1 && y.nshape()==1)\
-  {\
-    return sum(x*y);\
+  if(x.nshape()<1 || y.nshape()<1){\
+    return x*y;\
+  }\
+  if(x.nshape()==1 && y.nshape()==1){\
+    if(x.shape()[1]==1 && y.shape()[0]==1)\
+      return outer(x, y);\
+    else if(x.shape()[0]==1 && y.shape()[1]==1)\
+      return sum(x*trans(y));\
+    else\
+      return sum(x*y);\
   }\
   else if(x.nshape()==2 && y.nshape()==1)\
-  {\
     return detail::matvecprod(x, y);\
-  }\
   else if(x.nshape()==1 && y.nshape()==2)\
-  {\
-    return detail::matvecprod(trans(y), x);\
-  }\
+    return trans(detail::matvecprod(trans(y), trans(x)));\
   else /*if(x.nshape()==2 && y.nshape()==2)*/\
-  {\
     return detail::matmatprod(x, y);\
-  }\
 }
 
 DEFINE_DOT(array, array)
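Note on the DEFINE_DOT change above: expanded by hand for the array/array overload, the generated dot() now dispatches on operand shapes as follows (illustration only; the library builds all four overloads from the macro):

    array_expression dot(array const & x, array const & y)
    {
      if(x.nshape()<1 || y.nshape()<1)        // degenerate/scalar operand
        return x*y;
      if(x.nshape()==1 && y.nshape()==1){     // vector . vector
        if(x.shape()[1]==1 && y.shape()[0]==1)
          return outer(x, y);                 // column . row -> rank-1 matrix
        else if(x.shape()[0]==1 && y.shape()[1]==1)
          return sum(x*trans(y));             // row . column -> scalar
        else
          return sum(x*y);                    // plain inner product
      }
      else if(x.nshape()==2 && y.nshape()==1) // matrix . vector
        return detail::matvecprod(x, y);
      else if(x.nshape()==1 && y.nshape()==2) // vector . matrix
        return trans(detail::matvecprod(trans(y), trans(x)));
      else                                    // matrix . matrix
        return detail::matmatprod(x, y);
    }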
diff --git a/lib/backend/parse.cpp b/lib/backend/parse.cpp
index 4db20b2bf..73e9df5a5 100644
--- a/lib/backend/parse.cpp
+++ b/lib/backend/parse.cpp
@@ -47,7 +47,8 @@ namespace detail
     bool bypass(op_element const & op)
     {
-      return op.type == OPERATOR_RESHAPE_TYPE;
+      return op.type == OPERATOR_RESHAPE_TYPE
+          ||op.type == OPERATOR_TRANS_TYPE;
     }
 
     bool is_cast(op_element const & op)
@@ -68,8 +69,7 @@ namespace detail
     bool is_node_leaf(op_element const & op)
     {
-      return op.type==OPERATOR_TRANS_TYPE
-          || op.type==OPERATOR_MATRIX_DIAG_TYPE
+      return op.type==OPERATOR_MATRIX_DIAG_TYPE
           || op.type==OPERATOR_VDIAG_TYPE
           || op.type==OPERATOR_REPEAT_TYPE
           || op.type==OPERATOR_MATRIX_ROW_TYPE
@@ -212,8 +212,6 @@ const char * evaluate(operation_node_type type)
       case OPERATOR_ELEMENT_FMIN_TYPE : return "fmin";
       case OPERATOR_ELEMENT_MAX_TYPE : return "max";
       case OPERATOR_ELEMENT_MIN_TYPE : return "min";
-      //Unary
-      case OPERATOR_TRANS_TYPE : return "trans";
 
       //Binary
       case OPERATOR_MATRIX_PRODUCT_NN_TYPE : return "prodNN";
diff --git a/lib/backend/templates/maxpy.cpp b/lib/backend/templates/maxpy.cpp
index f94942561..870b60507 100644
--- a/lib/backend/templates/maxpy.cpp
+++ b/lib/backend/templates/maxpy.cpp
@@ -82,6 +82,13 @@ std::string maxpy::generate_impl(const char * suffix, expressions_tuple const &
   stream.dec_tab();
   stream << "}" << std::endl;
 
+  stream << "if(" << GlobalIdx0(backend) << "==0 &&" << GlobalIdx1(backend) << "==0)" << std::endl;
+  stream << "{" << std::endl;
+  stream.inc_tab();
+  process(stream, LHS_NODE_TYPE, tools::make_map<std::map<std::string, std::string> >("array0", "#pointer[#start] = #namereg;"), expressions, mappings);
+  stream.dec_tab();
+  stream << "}" << std::endl;
+
   stream.dec_tab();
   stream << "}" << std::endl;
diff --git a/lib/backend/templates/mproduct.cpp b/lib/backend/templates/mproduct.cpp
index 3daab6db0..f4541f5b6 100644
--- a/lib/backend/templates/mproduct.cpp
+++ b/lib/backend/templates/mproduct.cpp
@@ -567,7 +567,6 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
                value_scalar const & alpha, value_scalar const & beta,
                driver::Program & program, const char * suffix, execution_options_type const & options)
   {
-
     if(M==0 || N==0 || K==0)
       return;
 
@@ -588,8 +587,10 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
     driver::Kernel gemm(program, gemm_name);
     driver::NDRange local(p_.local_size_0, p_.local_size_1);
+    using tools::align;
     driver::NDRange global = (strcmp(suffix,"fallback")==0)?driver::NDRange(align(align(M,p_.mS)/p_.mS, p_.local_size_0), align(align(N,p_.nS)/p_.nS, p_.local_size_1), p_.depth):driver::NDRange(M/p_.mS, N/p_.nS, p_.depth);
+
     unsigned int current_arg = 0;
     set_arguments_functor helper(binder, current_arg, gemm);
     gemm.setSizeArg(current_arg++, M);
@@ -611,9 +612,14 @@ mproduct_parameters::mproduct_parameters(unsigned int simd_width
     gemm.setSizeArg(current_arg++, B.start()[0] + B.start()[1]*B.ld()/p_.simd_width);
     gemm.setSizeArg(current_arg++, B.stride()[0]);
 
+//    std::cout << "before " << *out << std::endl;
+
     helper.set_arguments(beta.dtype(), beta.values());
     options.enqueue(program.context(), gemm, global, local);
+    options.queue(program.context()).synchronize();
+//    std::cout << "after " << *out << std::endl;
+
     if(p_.depth > 1)
     {
       unsigned int current_arg = 0;
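The fallback NDRange above rounds the launch grid so that M and N are covered by whole per-thread tiles and whole work-groups. tools::align itself is not part of this patch; a minimal sketch consistent with the call sites (an assumption, not the library's actual definition) is:

    // Hypothetical helper matching the usage align(align(M, mS)/mS, local_size_0):
    // round x up to the next multiple of step.
    static inline unsigned int align(unsigned int x, unsigned int step)
    {
      return ((x + step - 1) / step) * step;
    }

First M is padded to a multiple of the per-thread tile size mS, then the resulting thread count is padded to the work-group size, so no partial work-group is ever launched.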
diff --git a/lib/backend/templates/mreduction.cpp b/lib/backend/templates/mreduction.cpp
index b63d412e0..86c65bc55 100644
--- a/lib/backend/templates/mreduction.cpp
+++ b/lib/backend/templates/mreduction.cpp
@@ -33,6 +33,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple const &
 {
   using tools::to_string;
 
+
   std::vector<mapped_mreduction*> reductions;
   expressions_tuple::data_type::const_iterator sit;
   std::vector<mapping_type>::const_iterator mit;
@@ -114,6 +115,7 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple const &
     {
       std::string data_type = append_width("#scalartype",simd_width);
 
+
       for (const auto & e : reductions)
       {
         std::map<std::string, std::string> accessors;
@@ -130,7 +132,6 @@ std::string mreduction::generate_impl(const char * suffix, expressions_tuple const &
           e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
       }
 
-      //Update accumulators
       std::vector<std::string> str(simd_width);
       if (simd_width==1)
@@ -240,6 +241,7 @@
   stream << _size_t << " gsize1 = " << GlobalSize1(backend) <<";" << std::endl;
+  stream << _size_t << " upper_bound_1 = ( M +" << p_.local_size_1 - 1 << ")/" << p_.local_size_1 << "*" << p_.local_size_1 << ";" << std::endl;
   stream << "for(" << _size_t << " r = gid1; r < upper_bound_1; r += gsize1){" << std::endl;
   stream.inc_tab();
diff --git a/lib/model/model.cpp b/lib/model/model.cpp
index 3ab3aacbd..30a83b0da 100644
--- a/lib/model/model.cpp
+++ b/lib/model/model.cpp
@@ -256,8 +256,8 @@ std::map<std::pair<expression_type, numeric_type>, tools::shared_ptr<base> > ini
     res[std::make_pair(MATRIX_AXPY_TYPE, DTYPE)] = ptr_t(new maxpy(1,8,8,8,8,FETCH_FROM_GLOBAL_STRIDED));
     res[std::make_pair(ROW_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_rows(1, 8, 8, 4, 16, FETCH_FROM_GLOBAL_STRIDED));
     res[std::make_pair(COL_WISE_REDUCTION_TYPE, DTYPE)] = ptr_t(new mreduction_cols(1, 8, 8, 64, 8, FETCH_FROM_GLOBAL_STRIDED));
-    res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 1, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
-    res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 1, 1, 1, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
+    res[std::make_pair(MATRIX_PRODUCT_NN_TYPE, DTYPE)] = ptr_t(new mproduct_nn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
+    res[std::make_pair(MATRIX_PRODUCT_TN_TYPE, DTYPE)] = ptr_t(new mproduct_tn(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
     res[std::make_pair(MATRIX_PRODUCT_NT_TYPE, DTYPE)] = ptr_t(new mproduct_nt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
     res[std::make_pair(MATRIX_PRODUCT_TT_TYPE, DTYPE)] = ptr_t(new mproduct_tt(1, 8, 8, 8, 1, 4, 1, 4, FETCH_FROM_LOCAL, FETCH_FROM_LOCAL, 8, 8, true));
   }
diff --git a/lib/wrap/clBLAS.cpp b/lib/wrap/clBLAS.cpp
index e19a10c4b..852d5495f 100644
--- a/lib/wrap/clBLAS.cpp
+++ b/lib/wrap/clBLAS.cpp
@@ -59,7 +59,7 @@ extern "C"
         clRetainMemObject(mx); \
         is::array y(N, TYPE_ISAAC, cl::Buffer(my), offy, incy); \
         clRetainMemObject(my); \
-        execute(is::assign(y, x + alpha*y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
+        execute(is::assign(y, alpha*x + y), y.context(), numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events); \
         return clblasSuccess; \
     }
 
@@ -157,15 +157,14 @@
             std::swap(M, N);\
             transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
         }\
-        is::int_t As1 = M, As2 = N;\
-        if(transA==clblasTrans) std::swap(As1, As2);\
-        is::array A(As1, As2, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
+        is::array A(M, N, TYPE_ISAAC, cl::Buffer(mA), offA, lda);\
         clRetainMemObject(mA);\
 \
-        is::array x(N, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
+        is::int_t sx = N, sy = M;\
+        if(transA) std::swap(sx, sy);\
+        is::array x(sx, TYPE_ISAAC, cl::Buffer(mx), offx, incx);\
         clRetainMemObject(mx);\
-        \
-        is::array y(M, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
+        is::array y(sy, TYPE_ISAAC, cl::Buffer(my), offy, incy);\
         clRetainMemObject(my);\
 \
         is::driver::Context const & context = A.context();\
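Why the row-major GEMV branch can drop the old As1/As2 swap: an M-by-N row-major matrix occupies the same buffer as an N-by-M column-major matrix holding its transpose, so flipping transA and swapping M with N already canonicalizes the call to column-major. After that, only the logical lengths of x and y depend on the flipped transA, which is exactly what the new sx/sy lines express:

    // After canonicalization, op(A) is M-by-N when transA==clblasNoTrans:
    is::int_t sx = N, sy = M;      // x has N entries, y has M
    if(transA) std::swap(sx, sy);  // op(A) = A^T: x has M entries, y has N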
@@ -182,6 +181,7 @@ extern "C"
 //*****************
 //BLAS3
 //*****************
+
 #define MAKE_GEMM(TYPE_CHAR, TYPE_ISAAC, TYPE_CL) \
 clblasStatus clblas ## TYPE_CHAR ## gemm(clblasOrder order, clblasTranspose transA, clblasTranspose transB,\
                                          size_t M, size_t N, size_t K,\
@@ -198,8 +198,7 @@ extern "C"
             std::swap(offA, offB);\
             std::swap(lda, ldb);\
             std::swap(M, N);\
-            transA = (transA==clblasTrans)?clblasNoTrans:clblasTrans;\
-            transB = (transB==clblasTrans)?clblasNoTrans:clblasTrans;\
+            std::swap(transA, transB);\
         }\
         is::int_t As1 = M, As2 = K;\
         is::int_t Bs1 = K, Bs2 = N;\
@@ -214,9 +213,8 @@ extern "C"
         clRetainMemObject(mC);\
         is::driver::Context const & context = C.context();\
         /*Operation*/\
-        if((transA==clblasTrans) && (transB==clblasTrans)){\
+        if((transA==clblasTrans) && (transB==clblasTrans))\
            execute(is::assign(C, alpha*dot(A.T(), B.T()) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
-}\
        else if((transA==clblasTrans) && (transB==clblasNoTrans))\
            execute(is::assign(C, alpha*dot(A.T(), B) + beta*C), context, numCommandQueues, commandQueues, numEventsInWaitList, eventWaitList, events);\
        else if((transA==clblasNoTrans) && (transB==clblasTrans))\
@@ -229,4 +227,6 @@
 MAKE_GEMM(S, is::FLOAT_TYPE, cl_float)
 MAKE_GEMM(D, is::DOUBLE_TYPE, cl_double)
+
+#undef DOT
 
 }
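The row-major GEMM reduction relies on the identity (A*B)^T = B^T*A^T: computing C^T in column-major order is the same as computing C in row-major order, and because each operand buffer is reinterpreted as its own transpose, the transpose flags travel with the swapped buffers, hence std::swap(transA, transB) instead of flipping both flags as the old code did. A tiny self-contained check of the identity (illustrative only):

    #include <cassert>
    int main()
    {
      double A[2][2] = {{1,2},{3,4}}, B[2][2] = {{5,6},{7,8}};
      double AB[2][2] = {}, BtAt[2][2] = {};
      for(int i = 0 ; i < 2 ; ++i)
        for(int j = 0 ; j < 2 ; ++j)
          for(int k = 0 ; k < 2 ; ++k)
          {
            AB[i][j]   += A[i][k]*B[k][j];
            BtAt[i][j] += B[k][i]*A[j][k]; // (B^T A^T)(i,j) = sum_k B(k,i)*A(j,k)
          }
      for(int i = 0 ; i < 2 ; ++i)
        for(int j = 0 ; j < 2 ; ++j)
          assert(BtAt[i][j] == AB[j][i]); // B^T A^T == (A B)^T
      return 0;
    }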
diff --git a/python/setup.py b/python/setup.py
index 47751f4f9..474f1ea18 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -115,7 +115,7 @@ def main():
     include =' src/include'.split() + ['external/boost/include', os.path.join(find_module("numpy")[1], "core", "include")]
 
     #Source files
-    src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/mapped_object.cpp src/lib/backend/stream.cpp src/lib/backend/parse.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
+    src = 'src/lib/array.cpp src/lib/wrap/clBLAS.cpp src/lib/value_scalar.cpp src/lib/symbolic/preset.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/model/model.cpp src/lib/model/predictors/random_forest.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/driver/program.cpp src/lib/driver/context.cpp src/lib/driver/command_queue.cpp src/lib/driver/check.cpp src/lib/driver/buffer.cpp src/lib/driver/event.cpp src/lib/driver/device.cpp src/lib/driver/backend.cpp src/lib/driver/platform.cpp src/lib/driver/ndrange.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/backend/parse.cpp src/lib/backend/templates/reduction.cpp src/lib/backend/templates/mreduction.cpp src/lib/backend/templates/mproduct.cpp src/lib/backend/templates/maxpy.cpp src/lib/backend/templates/vaxpy.cpp src/lib/backend/templates/base.cpp src/lib/backend/stream.cpp src/lib/backend/mapped_object.cpp src/lib/backend/keywords.cpp src/lib/backend/binder.cpp '.split() + [os.path.join('src', 'wrap', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'model.cpp', 'exceptions.cpp']]
 
     boostsrc = 'external/boost/libs/'
     for s in ['numpy','python','smart_ptr','system','thread']: src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]
diff --git a/tests/linalg/mproduct.cpp b/tests/linalg/mproduct.cpp
index c1bb2fe6d..2471ace68 100644
--- a/tests/linalg/mproduct.cpp
+++ b/tests/linalg/mproduct.cpp
@@ -29,7 +29,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const
       T cij = 0;
       for(int k = 0 ; k < K ; ++k)
        cij += cA(i,k)*cB(k,j);
-      cC(i,j) = cij;
+      cC(i,j) = alpha*cij + beta*cC(i, j);
     }
   }
@@ -43,6 +43,7 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const
 #define RUN_TEST(NAME, GPU_OP)\
   std::cout << "[" << prefix << "] \t" << NAME << "..." << std::flush;\
   GPU_OP;\
+  queue.synchronize();\
   ad::copy(C, buffer);\
   if(diff(buffer, cCbuffer, epsilon))\
   {\
@@ -57,20 +58,22 @@ void test_impl(T epsilon, simple_matrix_base<T> & cC, simple_matrix_base<T> const
   cl_command_queue clqueue = (*queue.handle().cl)();
 
   //Row-major
-  RUN_TEST("GEMM(ROW, N, N)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
+  RUN_TEST("GEMM(ROW, N, N)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
                                                CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
   RUN_TEST("GEMM(ROW, N, T)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
                                                CHANDLE(A), OFF(A), LD(A), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
   RUN_TEST("GEMM(ROW, T, N)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasTrans, N, M, K, alpha, CHANDLE(B), OFF(B), LD(B),
                                                CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
-  RUN_TEST("GEMM(ROW, T, T)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasNoTrans, clblasNoTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
+  RUN_TEST("GEMM(ROW, T, T)", BLAS::F(clblasSgemm,clblasDgemm)(clblasRowMajor, clblasTrans, clblasTrans, N, M, K, alpha, CHANDLE(BT), OFF(BT), LD(BT),
                                                CHANDLE(AT), OFF(AT), LD(AT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
 
   //Column-major
   RUN_TEST("GEMM(COL, N, N)", BLAS::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A),
                                                CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
+  RUN_TEST("GEMM(COL, N, T)", BLAS::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasNoTrans, clblasTrans, M, N, K, alpha, CHANDLE(A), OFF(A), LD(A), CHANDLE(BT), OFF(BT), LD(BT), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
+  RUN_TEST("GEMM(COL, T, N)", BLAS::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasNoTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT), CHANDLE(B), OFF(B), LD(B), beta, CHANDLE(C), OFF(C), LD(C), 1, &clqueue, 0, NULL, NULL));
   RUN_TEST("GEMM(COL, T, T)", BLAS::F(clblasSgemm,clblasDgemm)(clblasColumnMajor, clblasTrans, clblasTrans, M, N, K, alpha, CHANDLE(AT), OFF(AT), LD(AT),
@@ -92,9 +95,10 @@
 template<typename T>
 void test_impl(T epsilon, ad::driver::Context const & ctx)
 {
-  int_t M = 412;
-  int_t N = 248;
-  int_t K = 376;
+  int_t M = 427;
+  int_t N = 248;
+  int_t K = 376;
+
   int_t SUBM = 61;
   int_t SUBN = 75;
@@ -120,6 +124,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
 
 int main()
 {
+  clblasSetup();
   auto data = ad::driver::queues.contexts();
   for(const auto & elem : data)
   {
@@ -132,5 +137,6 @@ int main()
     test_impl<double>(1e-9, elem.first);
     std::cout << "---" << std::endl;
   }
+  clblasTeardown();
   return EXIT_SUCCESS;
 }
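A recurring fix in these tests is synchronizing before read-back: both clBLAS and ISAAC enqueue kernels asynchronously, so RUN_TEST must drain the queue before copying C back for comparison. Schematically, the pattern the macro now follows:

    GPU_OP;              // enqueue the clBLAS / ISAAC operation
    queue.synchronize(); // block until the device has finished
    ad::copy(C, buffer); // read-back is only well-defined after the sync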
diff --git a/tests/linalg/mreduction.cpp b/tests/linalg/mreduction.cpp
index ae27d49c0..6a21a0bf1 100644
--- a/tests/linalg/mreduction.cpp
+++ b/tests/linalg/mreduction.cpp
@@ -18,6 +18,8 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matrix_base<T>
   simple_vector<T> bufy(M);
   simple_vector<T> bufx(N);
 
+  T alpha = 4.2, beta = 1.8;
+
   ad::driver::CommandQueue queue = ad::driver::queues[y.context()][0];
   T yi = 0, xi = 0;
@@ -32,6 +34,7 @@
     ASSIGNMENT;\
   }\
   GPU_REDUCTION;\
+  queue.synchronize();\
   ad::copy(RES, BUF.data());\
   if(diff(CRES, BUF, epsilon))\
   {\
@@ -47,24 +50,24 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matrix_base<T>
 
     cl_command_queue clqueue = (*queue.handle().cl)();
 
-    TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
-                   BLAS::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
-                                                     CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
+    TEST_OPERATION("GEMV(ROW, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
+                   BLAS::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
+                                                     CHANDLE(x), x.start()[0], x.stride()[0], beta, CHANDLE(y), y.start()[0], y.stride()[0],
                                                      1, &clqueue, 0, NULL, NULL), y, bufy, cy);
 
-    TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
-                   BLAS::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
-                                                     CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
+    TEST_OPERATION("GEMV(ROW, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
+                   BLAS::F(clblasSgemv, clblasDgemv)(clblasRowMajor, clblasNoTrans, N, M, alpha, CHANDLE(A), OFF(A), LD(A),
+                                                     CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
                                                      1, &clqueue, 0, NULL, NULL), x, bufx, cx);
 
-    TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = yi,
-                   BLAS::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, 1, CHANDLE(A), OFF(A), LD(A),
-                                                     CHANDLE(x), x.start()[0], x.stride()[0], 0, CHANDLE(y), y.start()[0], y.stride()[0],
+    TEST_OPERATION("GEMV(COL, NoTrans)", M, N, yi+=cA(i,j)*cx[j], cy[i] = alpha*yi + beta*cy[i],
+                   BLAS::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasNoTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
+                                                     CHANDLE(x), x.start()[0], x.stride()[0], beta, CHANDLE(y), y.start()[0], y.stride()[0],
                                                      1, &clqueue, 0, NULL, NULL), y, bufy, cy);
 
-    TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = xi,
-                   BLAS::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, N, M, 1, CHANDLE(A), OFF(A), LD(A),
-                                                     CHANDLE(y), y.start()[0], y.stride()[0], 0, CHANDLE(x), x.start()[0], x.stride()[0],
+    TEST_OPERATION("GEMV(COL, Trans)", N, M, xi+=cA(j,i)*cy[j], cx[i] = alpha*xi + beta*cx[i],
+                   BLAS::F(clblasSgemv, clblasDgemv)(clblasColumnMajor, clblasTrans, M, N, alpha, CHANDLE(A), OFF(A), LD(A),
+                                                     CHANDLE(y), y.start()[0], y.stride()[0], beta, CHANDLE(x), x.start()[0], x.stride()[0],
                                                      1, &clqueue, 0, NULL, NULL), x, bufx, cx);
   }
   else
@@ -102,6 +105,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
 
 int main()
 {
+  clblasSetup();
   auto data = ad::driver::queues.contexts();
   for(const auto & elem : data)
   {
@@ -114,5 +118,6 @@ int main()
     test_impl<double>(1e-9, elem.first);
     std::cout << "---" << std::endl;
   }
+  clblasTeardown();
   return EXIT_SUCCESS;
 }
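Spelled out, the reference computation the updated GEMV tests encode through TEST_OPERATION's arguments is the full BLAS contract y = alpha*op(A)*x + beta*y:

    for(int i = 0 ; i < M ; ++i)
    {
      T yi = 0;
      for(int j = 0 ; j < N ; ++j)
        yi += cA(i,j)*cx[j];          // row i of op(A) dotted with x
      cy[i] = alpha*yi + beta*cy[i];  // scale and accumulate into the old y
    }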
diff --git a/tests/linalg/reduction.cpp b/tests/linalg/reduction.cpp
index 30e202034..4c6d00694 100644
--- a/tests/linalg/reduction.cpp
+++ b/tests/linalg/reduction.cpp
@@ -17,6 +17,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & cy,
   ad::driver::CommandQueue queue = ad::driver::queues[x.context()][0];
 
+  isaac::array scratch(N, x.dtype(), x.context());
   T cs = 0;
   isaac::scalar ds(cs, x.context());
@@ -26,6 +27,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & cy,
     INIT;\
   }\
   GPU_REDUCTION;\
+  queue.synchronize();\
   ad::copy(RES, BUF.data());\
   if(diff(CRES, BUF, epsilon))\
   {\
@@ -45,10 +47,9 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & cy,
 
   RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS::F(clblasSdot, clblasDdot)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
                                                   (*y.data().handle().cl)(), y.start()[0], y.stride()[0],
-                                                  0, 1, &clqueue, 0, NULL, NULL));
-
+                                                  CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
   RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS::F(clblasSasum, clblasDasum)(N, (*ds.data().handle().cl)(), 0, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
-                                                  0, 1, &clqueue, 0, NULL, NULL));
+                                                  CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
 
 #undef PREFIX
 #define PREFIX "[C++]"
@@ -70,11 +71,11 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
 {
   using isaac::_;
 
-  int_t N = 24378;
-  int_t SUBN = 531;
+  int_t N = 2;
+  int_t SUBN = 2;
 
-  INIT_VECTOR(N, SUBN, 2, 4, cx, x, ctx);
-  INIT_VECTOR(N, SUBN, 5, 8, cy, y, ctx);
+  INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx);
+  INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx);
 
 #define TEST_OPERATIONS(TYPE)\
   test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
@@ -88,6 +89,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
 
 int main()
 {
+  clblasSetup();
   auto data = ad::driver::queues.contexts();
   for(const auto & elem : data)
   {
@@ -100,5 +102,6 @@ int main()
     test_impl<double>(1e-9, elem.first);
     std::cout << "---" << std::endl;
   }
+  clblasTeardown();
   return EXIT_SUCCESS;
 }
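clblasSdot and clblasSasum require a device scratch buffer (at least N elements for dot, per the clBLAS documentation); the literal 0 previously passed in that position was not a valid cl_mem. The patch only shows CHANDLE(scratch) being passed, so the declaration below is a hypothetical sketch of the allocation it implies:

    // Hypothetical: a device buffer of N elements matching x's dtype and context.
    isaac::array scratch(N, x.dtype(), x.context());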
diff --git a/tests/linalg/vaxpy.cpp b/tests/linalg/vaxpy.cpp
index 91d2955ff..ae8c7cea3 100644
--- a/tests/linalg/vaxpy.cpp
+++ b/tests/linalg/vaxpy.cpp
@@ -20,7 +20,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>
   cl_command_queue clqueue = (*queue.handle().cl)();
 
   int_t N = cz.size();
-  T aa = 3.12, bb=3.5;
+  T aa = 4.378, bb=3.5;
   isaac::value_scalar a(aa), b(bb);
   isaac::scalar da(a, ctx), db(b, ctx);
@@ -32,6 +32,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>
   for(int_t i = 0 ; i < N ; ++i)\
     CPU_LOOP;\
   GPU_EXPR;\
+  queue.synchronize();\
   isaac::copy(z, buffer.data());\
   CONVERT;\
   if(diff(cz, buffer, epsilon))\
   {\
@@ -44,16 +45,18 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>
 }
 
 #define PREFIX "[C]"
-  RUN_TEST_VECTOR_AXPY("AXPY", cy[i] = cx[i] + a*cy[i], BLAS::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
-                                                                                          (*y.data().handle().cl)(), y.start()[0], y.stride()[0],
+  RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS::F(clblasSaxpy, clblasDaxpy)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
+                                                                                          (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
                                                                                           1, &clqueue, 0, NULL, NULL));
-  RUN_TEST_VECTOR_AXPY("COPY", cy[i] = cx[i], BLAS::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
-                                                                                (*y.data().handle().cl)(), y.start()[0], y.stride()[0],
+  RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS::F(clblasScopy, clblasDcopy)(N, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
+                                                                                (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
                                                                                 1, &clqueue, 0, NULL, NULL));
-  RUN_TEST_VECTOR_AXPY("SCAL", cx[i] = a*cx[i], BLAS::F(clblasSscal, clblasDscal)(N, a, (*x.data().handle().cl)(), x.start()[0], x.stride()[0],
+  RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS::F(clblasSscal, clblasDscal)(N, a, (*z.data().handle().cl)(), z.start()[0], z.stride()[0],
                                                                                   1, &clqueue, 0, NULL, NULL));
+
+
 #undef PREFIX
 #define PREFIX "[C++]"
   RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, ctx))
@@ -136,6 +139,7 @@ void test_impl(T epsilon, ad::driver::Context const & ctx)
 
 int main()
 {
+  clblasSetup();
   auto data = ad::driver::queues.contexts();
   for(const auto & elem : data)
   {
@@ -148,5 +152,6 @@ int main()
     test_impl<double>(1e-9, elem.first);
     std::cout << "---" << std::endl;
   }
+  clblasTeardown();
   return EXIT_SUCCESS;
 }
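All four test mains now wrap their body in the clBLAS lifecycle calls, which clBLAS requires around any clblasX* invocation; schematically:

    int main()
    {
      clblasSetup();      // initialize clBLAS state before any BLAS call
      // ... run the tests on every available context/device ...
      clblasTeardown();   // release clBLAS state
      return EXIT_SUCCESS;
    }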