Code quality: bugfix in bench/test to not call clBLAS on CUDA backend

This commit is contained in:
Philippe Tillet
2015-08-26 14:12:50 -04:00
parent 9da87bee51
commit 69c11d16cc
6 changed files with 62 additions and 42 deletions

View File

@@ -145,7 +145,8 @@ void bench(sc::numeric_type dtype, std::string operation)
BENCHMARK_ISAAC(y = sc::control(x + alpha*y, sc::execution_options_type(0, &events)), 3*N*dtsize/t) BENCHMARK_ISAAC(y = sc::control(x + alpha*y, sc::execution_options_type(0, &events)), 3*N*dtsize/t)
/* clblas */ /* clblas */
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t); if(A.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t);
#endif #endif
/* BLAS */ /* BLAS */
#ifdef BENCH_CBLAS #ifdef BENCH_CBLAS
@@ -235,7 +236,8 @@ void bench(sc::numeric_type dtype, std::string operation)
#endif #endif
BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t); BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t) if(A.context().backend()==sc::driver::OPENCL)
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
#endif #endif
#ifdef BENCH_CBLAS #ifdef BENCH_CBLAS
std::vector<float> cA(M*N), cx(N), cy(M); std::vector<float> cA(M*N), cx(N), cy(M);
@@ -324,8 +326,9 @@ void bench(sc::numeric_type dtype, std::string operation)
BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t); BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
/* clblas */ /* clblas */
#ifdef BENCH_CLBLAS #ifdef BENCH_CLBLAS
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb, if(A.context().backend()==sc::driver::OPENCL)
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, &event), (double)2*M*N*K/t) BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, &event), (double)2*M*N*K/t)
#endif #endif
/* BLAS */ /* BLAS */
#ifdef BENCH_CBLAS #ifdef BENCH_CBLAS

View File

@@ -1,6 +1,8 @@
#include "isaac/driver/handle.h" #include <cassert>
#include <memory> #include <memory>
#include "isaac/driver/handle.h"
namespace isaac namespace isaac
{ {
@@ -98,21 +100,29 @@ Handle<CLType, CUType>::~Handle()
template<class CLType, class CUType> template<class CLType, class CUType>
CLType & Handle<CLType, CUType>::cl() CLType & Handle<CLType, CUType>::cl()
{ return *cl_; } {
assert(backend_==OPENCL);
return *cl_;
}
template<class CLType, class CUType> template<class CLType, class CUType>
CLType const & Handle<CLType, CUType>::cl() const CLType const & Handle<CLType, CUType>::cl() const
{ return *cl_; } {
assert(backend_==OPENCL);
return *cl_;
}
template<class CLType, class CUType> template<class CLType, class CUType>
CUType & Handle<CLType, CUType>::cu() CUType & Handle<CLType, CUType>::cu()
{ {
assert(backend_==CUDA);
return *cu_; return *cu_;
} }
template<class CLType, class CUType> template<class CLType, class CUType>
CUType const & Handle<CLType, CUType>::cu() const CUType const & Handle<CLType, CUType>::cu() const
{ {
assert(backend_==CUDA);
return *cu_; return *cu_;
} }

View File

@@ -72,7 +72,7 @@ def main():
libraries += ['gnustl_shared'] libraries += ['gnustl_shared']
#Source files #Source files
src = 'src/lib/symbolic/preset.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/io.cpp src/lib/symbolic/expression.cpp src/lib/array.cpp src/lib/value_scalar.cpp src/lib/driver/backend.cpp src/lib/driver/device.cpp src/lib/driver/kernel.cpp src/lib/driver/buffer.cpp src/lib/driver/platform.cpp src/lib/driver/check.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/dispatch.cpp src/lib/driver/program_cache.cpp src/lib/driver/context.cpp src/lib/driver/event.cpp src/lib/driver/ndrange.cpp src/lib/driver/handle.cpp src/lib/exception/unknown_datatype.cpp src/lib/exception/operation_not_supported.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/stream.cpp src/lib/kernels/parse.cpp src/lib/kernels/keywords.cpp src/lib/kernels/binder.cpp src/lib/wrap/clBLAS.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']] src = 'src/lib/exception/operation_not_supported.cpp src/lib/exception/unknown_datatype.cpp src/lib/value_scalar.cpp src/lib/driver/check.cpp src/lib/driver/ndrange.cpp src/lib/driver/platform.cpp src/lib/driver/backend.cpp src/lib/driver/program.cpp src/lib/driver/command_queue.cpp src/lib/driver/event.cpp src/lib/driver/kernel.cpp src/lib/driver/handle.cpp src/lib/driver/device.cpp src/lib/driver/program_cache.cpp src/lib/driver/buffer.cpp src/lib/driver/context.cpp src/lib/driver/dispatch.cpp src/lib/kernels/templates/axpy.cpp src/lib/kernels/templates/gemv.cpp src/lib/kernels/templates/dot.cpp src/lib/kernels/templates/base.cpp src/lib/kernels/templates/ger.cpp src/lib/kernels/templates/gemm.cpp src/lib/kernels/stream.cpp 
src/lib/kernels/keywords.cpp src/lib/kernels/mapped_object.cpp src/lib/kernels/binder.cpp src/lib/kernels/parse.cpp src/lib/wrap/clBLAS.cpp src/lib/profiles/predictors/random_forest.cpp src/lib/profiles/presets.cpp src/lib/profiles/profiles.cpp src/lib/symbolic/execute.cpp src/lib/symbolic/expression.cpp src/lib/symbolic/io.cpp src/lib/symbolic/preset.cpp src/lib/array.cpp '.split() + [os.path.join('src', 'bind', sf) for sf in ['_isaac.cpp', 'core.cpp', 'driver.cpp', 'kernels.cpp', 'exceptions.cpp']]
boostsrc = 'external/boost/libs/' boostsrc = 'external/boost/libs/'
for s in ['numpy','python','smart_ptr','system','thread']: for s in ['numpy','python','smart_ptr','system','thread']:
src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x] src = src + [x for x in recursive_glob('external/boost/libs/' + s + '/src/','.cpp') if 'win32' not in x and 'pthread' not in x]

View File

@@ -10,7 +10,7 @@ typedef isaac::int_t int_t;
template<typename T> template<typename T>
void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>& cy, simple_vector_base<T>& cz, void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T>& cy, simple_vector_base<T>& cz,
sc::array& x, sc::array& y, sc::array& z) sc::array& x, sc::array& y, sc::array& z, interface_t interf)
{ {
using namespace std; using namespace std;
@@ -18,7 +18,6 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
sc::numeric_type dtype = x.dtype(); sc::numeric_type dtype = x.dtype();
sc::driver::Context const & context = x.context(); sc::driver::Context const & context = x.context();
sc::driver::CommandQueue queue = sc::driver::backend::queues::get(context,0); sc::driver::CommandQueue queue = sc::driver::backend::queues::get(context,0);
cl_command_queue clqueue = queue.handle().cl();
int_t N = cz.size(); int_t N = cz.size();
T aa = static_cast<T>(-4.3); T aa = static_cast<T>(-4.3);
@@ -30,7 +29,7 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
#define CONVERT #define CONVERT
#define RUN_TEST_VECTOR_AXPY(NAME, CPU_LOOP, GPU_EXPR) \ #define RUN_TEST_VECTOR_AXPY(NAME, CPU_LOOP, GPU_EXPR) \
{\ {\
std::cout << PREFIX << " " << NAME "..." << std::flush;\ std::cout << NAME "..." << std::flush;\
for(int_t i = 0 ; i < N ; ++i)\ for(int_t i = 0 ; i < N ; ++i)\
CPU_LOOP;\ CPU_LOOP;\
GPU_EXPR;\ GPU_EXPR;\
@@ -46,22 +45,23 @@ void test_element_wise_vector(T epsilon, simple_vector_base<T> & cx, simple_vect
std::cout << std::endl;\ std::cout << std::endl;\
} }
if(queue.device().backend()==sc::driver::OPENCL){
#define PREFIX "[C]"
RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), x.start()[0], x.stride()[0],
CHANDLE(z), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, CHANDLE(x), x.start()[0], x.stride()[0], if(queue.device().backend()==sc::driver::OPENCL && interf==clBLAS)
CHANDLE(z), z.start()[0], z.stride()[0], {
1, &clqueue, 0, NULL, NULL)); cl_command_queue clqueue = queue.handle().cl();
RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), z.start()[0], z.stride()[0], RUN_TEST_VECTOR_AXPY("AXPY", cz[i] = a*cx[i] + cz[i], BLAS<T>::F(clblasSaxpy, clblasDaxpy)(N, a, CHANDLE(x), x.start()[0], x.stride()[0],
CHANDLE(z), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
RUN_TEST_VECTOR_AXPY("COPY", cz[i] = cx[i], BLAS<T>::F(clblasScopy, clblasDcopy)(N, CHANDLE(x), x.start()[0], x.stride()[0],
CHANDLE(z), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL)); 1, &clqueue, 0, NULL, NULL));
#undef PREFIX
RUN_TEST_VECTOR_AXPY("SCAL", cz[i] = a*cz[i], BLAS<T>::F(clblasSscal, clblasDscal)(N, a, CHANDLE(z), z.start()[0], z.stride()[0],
1, &clqueue, 0, NULL, NULL));
} }
#define PREFIX "[C++]"
RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context)) RUN_TEST_VECTOR_AXPY("z = 0", cz[i] = 0, z = zeros(N, 1, dtype, context))
RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], z = x) RUN_TEST_VECTOR_AXPY("z = x", cz[i] = cx[i], z = x)
RUN_TEST_VECTOR_AXPY("z = -x", cz[i] = -cx[i], z = -x) RUN_TEST_VECTOR_AXPY("z = -x", cz[i] = -cx[i], z = -x)
@@ -128,14 +128,16 @@ void test_impl(T epsilon, sc::driver::Context const & ctx)
INIT_VECTOR(N, SUBN, 3, 2, cz, z, ctx); INIT_VECTOR(N, SUBN, 3, 2, cz, z, ctx);
#define TEST_OPERATIONS(TYPE)\ #define TEST_OPERATIONS(TYPE, INTERF)\
test_element_wise_vector(epsilon, cx_ ## TYPE, cy_ ## TYPE, cz_ ## TYPE,\ test_element_wise_vector(epsilon, cx_ ## TYPE, cy_ ## TYPE, cz_ ## TYPE,\
x_ ## TYPE, y_ ## TYPE, z_ ## TYPE);\ x_ ## TYPE, y_ ## TYPE, z_ ## TYPE, INTERF);\
std::cout << "> standard..." << std::endl; std::cout << "> standard..." << std::endl;
TEST_OPERATIONS(full); TEST_OPERATIONS(full, clBLAS);
TEST_OPERATIONS(full, CPP);
std::cout << "> slice..." << std::endl; std::cout << "> slice..." << std::endl;
TEST_OPERATIONS(slice); TEST_OPERATIONS(slice, clBLAS);
TEST_OPERATIONS(slice, CPP);
} }
int main() int main()

View File

@@ -10,13 +10,12 @@ typedef sc::int_t int_t;
template<typename T> template<typename T>
void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & cy, void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T> & cy,
sc::array & x, sc::array & y) sc::array & x, sc::array & y, interface_t interf)
{ {
using namespace std; using namespace std;
sc::driver::Context const & ctx = x.context(); sc::driver::Context const & ctx = x.context();
int_t N = cx.size(); int_t N = cx.size();
sc::driver::CommandQueue queue = sc::driver::backend::queues::get(ctx,0); sc::driver::CommandQueue queue = sc::driver::backend::queues::get(ctx,0);
cl_command_queue clqueue = queue.handle().cl();
sc::array scratch(N, x.dtype()); sc::array scratch(N, x.dtype());
unsigned int failure_count = 0; unsigned int failure_count = 0;
@@ -28,7 +27,7 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
isaac::scalar ds(dtype, ctx); isaac::scalar ds(dtype, ctx);
#define RUN_TEST(NAME, CPU_REDUCTION, INIT, ASSIGNMENT, GPU_REDUCTION) \ #define RUN_TEST(NAME, CPU_REDUCTION, INIT, ASSIGNMENT, GPU_REDUCTION) \
cout << PREFIX << " " << NAME "..." << flush;\ cout << NAME "..." << flush;\
cs = INIT;\ cs = INIT;\
for(int_t i = 0 ; i < N ; ++i)\ for(int_t i = 0 ; i < N ; ++i)\
CPU_REDUCTION;\ CPU_REDUCTION;\
@@ -44,14 +43,18 @@ void test_reduction(T epsilon, simple_vector_base<T> & cx, simple_vector_base<T
else\ else\
cout << endl; cout << endl;
#define PREFIX "[C]"
RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, CHANDLE(ds), 0, CHANDLE(x), x.start()[0], x.stride()[0], if(ctx.backend()==sc::driver::OPENCL && interf==clBLAS)
CHANDLE(y), y.start()[0], y.stride()[0], {
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL)); cl_command_queue clqueue = queue.handle().cl();
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, CHANDLE(ds), 0, CHANDLE(x), x.start()[0], x.stride()[0],
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL)); RUN_TEST("DOT", cs+=cx[i]*cy[i], 0, cs, BLAS<T>::F(clblasSdot, clblasDdot)(N, CHANDLE(ds), 0, CHANDLE(x), x.start()[0], x.stride()[0],
#undef PREFIX CHANDLE(y), y.start()[0], y.stride()[0],
#define PREFIX "[C++]" CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
RUN_TEST("ASUM", cs+=std::fabs(cx[i]), 0, cs, BLAS<T>::F(clblasSasum, clblasDasum)(N, CHANDLE(ds), 0, CHANDLE(x), x.start()[0], x.stride()[0],
CHANDLE(scratch), 1, &clqueue, 0, NULL, NULL));
}
RUN_TEST("s = x'.y", cs+=cx[i]*cy[i], 0, cs, ds = dot(x,y)); RUN_TEST("s = x'.y", cs+=cx[i]*cy[i], 0, cs, ds = dot(x,y));
RUN_TEST("s = exp(x'.y)", cs += cx[i]*cy[i], 0, std::exp(cs), ds = exp(dot(x,y))); RUN_TEST("s = exp(x'.y)", cs += cx[i]*cy[i], 0, std::exp(cs), ds = exp(dot(x,y)));
@@ -77,14 +80,16 @@ void test_impl(T epsilon, sc::driver::Context const & ctx)
INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx); INIT_VECTOR(N, SUBN, 0, 1, cx, x, ctx);
INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx); INIT_VECTOR(N, SUBN, 0, 1, cy, y, ctx);
#define TEST_OPERATIONS(TYPE)\ #define TEST_OPERATIONS(TYPE, ITF)\
test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\ test_reduction(epsilon, cx_ ## TYPE, cy_ ## TYPE,\
x_ ## TYPE, y_ ## TYPE);\ x_ ## TYPE, y_ ## TYPE, ITF);\
std::cout << "> standard..." << std::endl; std::cout << "> standard..." << std::endl;
TEST_OPERATIONS(full); TEST_OPERATIONS(full, clBLAS);
TEST_OPERATIONS(full, CPP);
std::cout << "> slice..." << std::endl; std::cout << "> slice..." << std::endl;
TEST_OPERATIONS(slice); TEST_OPERATIONS(slice, clBLAS);
TEST_OPERATIONS(slice, CPP);
} }

View File

@@ -47,7 +47,7 @@ void test_row_wise_reduction(T epsilon, simple_vector_base<T> & cy, simple_matri
std::cout << std::endl; std::cout << std::endl;
if(interf==clBLAS) if(y.context().backend()==sc::driver::OPENCL && interf==clBLAS)
{ {
cl_command_queue clqueue = queue.handle().cl(); cl_command_queue clqueue = queue.handle().cl();