Now using events to time autotuning
bench/blas.cpp
@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
  while(total_time*1e-9 < 1e-1){\
    cl::Event event;\
    OP;\
    queue.flush();\
    queue.finish();\
    times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
    total_time+=times.back();\
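For context on the change above: the elapsed time now comes from the OpenCL event's profiling counters (device-side, in nanoseconds) rather than from host timers. A minimal standalone sketch of the same measurement, assuming a command queue created with CL_QUEUE_PROFILING_ENABLE; the helper name time_kernel_ns is illustrative and not part of the repository.

#include <CL/cl.hpp>

// Device-side time of one enqueue, from submission to completion, in
// nanoseconds -- the same two counters the BENCHMARK_OPENCL macro queries.
double time_kernel_ns(cl::CommandQueue & queue, cl::Kernel & kernel, cl::NDRange global)
{
  cl::Event event;
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, global, cl::NullRange, NULL, &event);
  queue.flush();
  queue.finish();  // profiling info is only valid once the command has completed
  cl_ulong submit = event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();
  cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return static_cast<double>(end - submit);
}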
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
std::cout << " " << PERF << std::flush;\
}

/*---------*/
/*--BLAS1--*/
/*---------*/
std::cout << "#AXPY" << std::endl;
for(auto N : BLAS1_N)
{
std::cout << N;
ad::array x(N, dtype), y(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
/* ATIDLAS */
y = x + y; queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
/* clAmdBlas */
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;

// std::cout << "#DOT" << std::endl;
// for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
// /*---------*/
// /*--BLAS1--*/
// /*---------*/
// std::cout << "#AXPY" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// int_t N = *it;
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// /* ATIDLAS */
// y = x + y; queue.flush(); queue.finish();
// BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
// BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
//#endif
// /* CuBLAS */
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

// /*---------*/
// /*--BLAS2--*/
// /*---------*/
// //T-layout
// std::cout << "#GEMV-T" << std::endl;
// for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
// for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
// {
// int_t M = *Mit;
// int_t N = *Nit;
// std::cout << M << "," << N;
// /* ATIDLAS */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
// CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
// /* clAmdBlas */
// #ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cA(N*M), cx(N), cy(M);
// ad::copy(x, cx);
// ad::copy(y, cy);
// ad::copy(A, cA);
// CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// std::cout << "#DOT" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// s = dot(x,y); queue.flush(); queue.finish();
// BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
//#endif
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// T result;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;

/*---------*/
/*--BLAS2--*/
/*---------*/
//T-layout
std::cout << "#GEMV-T" << std::endl;
for(int_t N: std::vector<int>{64})
for(int_t M: create_full_range(128, 10000, 64))
{
std::cout << M << "," << N;
/* ATIDLAS */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
y = dot(trans(A),x); queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx);
ad::copy(y, cy);
ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA);
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;

//// /*---------*/
//// /*--BLAS3--*/
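A note on the PERF expressions used above: t is the event time in nanoseconds, so dividing the bytes moved by t yields GB/s directly. A small sketch of that arithmetic (the function names are illustrative, not part of the repository):

#include <cstddef>

// AXPY (y = x + y) reads x and y and writes y: 3*N elements of dtsize bytes.
double axpy_gb_per_s(std::size_t N, std::size_t dtsize, double t_ns)
{ return 3.0 * N * dtsize / t_ns; }  // bytes per nanosecond == GB/s

// GEMV-T (y = A^T x) reads A (M*N elements) and x (N) and writes y (M).
double gemv_gb_per_s(std::size_t M, std::size_t N, std::size_t dtsize, double t_ns)
{ return (static_cast<double>(M) * N + M + N) * dtsize / t_ns; }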
@@ -34,9 +34,11 @@ public:

//General constructor
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());

array(array_expression const & proxy);
array(array const &);

template<class T>
array(controller<T> const &);
//Getters
numeric_type dtype() const;
size4 shape() const;
@@ -284,6 +284,10 @@ private:
compilation_options_type compilation_options_;
};

template<class TYPE>
controller<TYPE> control(TYPE const & x, execution_options_type const& execution_options = execution_options_type(),
dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type())
{ return controller<TYPE>(x, execution_options, dispatcher_options, compilation_options); }

class expressions_tuple
{
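A sketch of how the control() helper added in the hunk above can replace the explicit controller<atidlas::array_expression> construction used in bench/blas.cpp; it assumes the ad namespace alias and the cl_ext::queues map from that file, and profiled_axpy is an illustrative name.

void profiled_axpy(ad::array & x, ad::array & y)
{
  cl::Event event;
  // Bundle the expression with execution options: dispatch on queue 0 and
  // have the runtime fill `event` for profiling.
  y = ad::control(x + y, ad::execution_options_type(0, &event));
  cl::CommandQueue & queue = ad::cl_ext::queues[y.context()][0];
  queue.flush();
  queue.finish();
  cl_ulong elapsed_ns = event.getProfilingInfo<CL_PROFILING_COMMAND_END>()
                      - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>();
  (void) elapsed_ns;  // same quantity the BENCHMARK_OPENCL macro accumulates
}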
@@ -83,23 +83,22 @@ array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const
ld_(ld), context_(context), data_(data)
{ }

array::array(array_expression const & proxy) :
dtype_(proxy.dtype()),
shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = proxy;
}
array::array(array_expression const & proxy) : array(control(proxy)){}
array::array(array const & other) : array(control(other)){}

array::array(array const & other) :
dtype_(other.dtype()),
shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
template<class TYPE>
array::array(controller<TYPE> const & other) :
dtype_(other.x().dtype()),
shape_(other.x().shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.x().context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = other;
}


template array::array(controller<array> const&);
template array::array(controller<array_expression> const&);

/*--- Getters ---*/
numeric_type array::dtype() const
{ return dtype_; }
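Two patterns in the hunk above may be worth spelling out: the rewritten array constructors delegate to the new controller<TYPE> constructor via control(), and the two `template array::array(controller<...> const&);` lines are explicit instantiations, needed because the template constructor is defined in the .cpp file. A minimal, self-contained illustration of the explicit-instantiation pattern (the holder type is purely illustrative):

// In a header, only the declaration is visible:
struct holder
{
  template<class T> holder(T const & v);
  double stored;
};

// In the .cpp file, the template is defined out of line...
template<class T>
holder::holder(T const & v) : stored(static_cast<double>(v)) {}

// ...and the specializations other translation units may link against are
// explicitly instantiated here.
template holder::holder(int const &);
template holder::holder(double const &);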
@@ -213,6 +213,7 @@ class ArgumentsHandler:
        self.blas3_size = map(int, self.blas3_size)

if __name__ == "__main__":
    atd.state.queue_properties = atd.queue_properties_type.CL_QUEUE_PROFILING_ENABLE

    platforms = atd.get_platforms()
    devices = [d for platform in platforms for d in platform.get_devices()]
@@ -222,13 +222,9 @@ def benchmark(template, symbolic):
    atd.synchronize(symbolic.context)
    current_time = 0
    timings = []
    while current_time < 1e-1:
        time_before = time.time()
        x = atd.array(symbolic)
        atd.synchronize(symbolic.context)
        timings.append(time.time() - time_before)
        current_time = current_time + timings[-1]
    return np.median(timings)
    x, event, cache = atd.flush(symbolic)
    atd.synchronize(symbolic.context)
    return 1e-9*(event.end - event.start)


def sanitize_string(string, keep_chars = ['_']):
@@ -166,6 +166,16 @@ namespace detail
return res;
}

template<class T>
std::vector<T> to_vector(bp::list const & list)
{
std::size_t len = bp::len(list);
std::vector<T> res; res.reserve(len);
for(int i = 0 ; i < len ; ++i)
res.push_back(boost::python::extract<T>(list[i]));
return res;
}

bp::list nv_compute_capability(cl::Device const & device)
{
bp::list res;
@@ -288,6 +298,10 @@ namespace detail
wrap_command_queue_info(cl::CommandQueue const & x)
{ return x.getInfo<INFO>(NULL); }

template<cl_int INFO>
typename cl::detail::param_traits<cl::detail::cl_profiling_info, INFO>::param_type
wrap_profiling_info(cl::Event const & x)
{ return x.getProfilingInfo<INFO>(NULL); }

std::string to_string(cl_device_type type)
{
@@ -301,8 +315,20 @@ namespace detail
boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
{ return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }

bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
{
cl::Event event;
atd::operation_cache cache;
std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &event, &cache, &cdependencies),
atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));

return bp::make_tuple(*parray, event, cache);
}
}

struct state_type{ };
state_type state;

void export_cl()
{
@@ -362,9 +388,32 @@ void export_cl()
.add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
;

bp::class_<cl::Event>("event")
#define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_profiling_info<NAME>)
WRAP("start", CL_PROFILING_COMMAND_START)
WRAP("submit", CL_PROFILING_COMMAND_SUBMIT)
WRAP("end", CL_PROFILING_COMMAND_END)
;

bp::class_<atd::operation_cache>("operation_cache", bp::no_init)
.def("enqueue", &atd::operation_cache::enqueue)
;

bp::def("synchronize", &atd::cl_ext::synchronize);
bp::def("get_platforms", &detail::get_platforms);

bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));

bp::enum_<cl_command_queue_properties>("queue_properties_type")
.value("CL_QUEUE_PROFILING_ENABLE", CL_QUEUE_PROFILING_ENABLE)
.value("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
;

bp::class_<state_type>("state_type")
.def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
;

bp::scope().attr("state") = bp::object(bp::ptr(&state));
}

namespace detail
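For completeness, a sketch of the C++-side call that the new flush() binding above performs, stripped of the Boost.Python wrapping; the names follow the diff (atd aliasing the atidlas namespace) and profiled_flush is an illustrative name. The real binding also returns the event and the operation_cache, which is how the Python autotuner can compute 1e-9*(event.end - event.start).

atd::array profiled_flush(atd::array_expression const & expression, unsigned int queue_id)
{
  cl::Event event;                       // filled with the profiling event of the dispatch
  atd::operation_cache cache;            // the dispatched operation, exposed to Python as operation_cache
  std::vector<cl::Event> dependencies;   // events that must complete before this expression runs
  return atd::array(atd::control(expression,
                                 atd::execution_options_type(queue_id, &event, &cache, &dependencies),
                                 atd::dispatcher_options_type(-1),
                                 atd::compilation_options_type("", false)));
}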