Now using events to time autotuning

This commit is contained in:
Philippe Tillet
2015-02-06 22:11:03 -05:00
parent 385f007c0b
commit b768e913c9
7 changed files with 163 additions and 95 deletions

View File

@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
while(total_time*1e-9 < 1e-1){\ while(total_time*1e-9 < 1e-1){\
cl::Event event;\ cl::Event event;\
OP;\ OP;\
queue.flush();\
queue.finish();\ queue.finish();\
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\ times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
total_time+=times.back();\ total_time+=times.back();\
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
std::cout << " " << PERF << std::flush;\ std::cout << " " << PERF << std::flush;\
} }
/*---------*/ // /*---------*/
/*--BLAS1--*/ // /*--BLAS1--*/
/*---------*/ // /*---------*/
std::cout << "#AXPY" << std::endl; // std::cout << "#AXPY" << std::endl;
for(auto N : BLAS1_N) // for(int_t N : create_log_range(1e3, 2e7, 50, 64))
{
std::cout << N;
ad::array x(N, dtype), y(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
/* ATIDLAS */
y = x + y; queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
/* clAmdBlas */
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
// std::cout << "#DOT" << std::endl;
// for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
// { // {
// int_t N = *it;
// std::cout << N; // std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype); // ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype); // cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// ad::scalar s(dtype); // /* ATIDLAS */
// CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize)); // y = x + y; queue.flush(); queue.finish();
// BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
// /* clAmdBlas */ // /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS //#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize)) // BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
//#endif //#endif
// /* BLAS */ // /* BLAS */
//#ifdef BENCH_CBLAS //#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N); // std::vector<float> cx(N), cy(N);
// ad::copy(x, cx); // ad::copy(x, cx);
// ad::copy(y, cy); // ad::copy(y, cy);
// CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize)); // BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
//#endif
// /* CuBLAS */
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif //#endif
// std::cout << std::endl; // std::cout << std::endl;
// } // }
// std::cout << "\n\n" << std::flush; // std::cout << "\n\n" << std::flush;
// /*---------*/ // std::cout << "#DOT" << std::endl;
// /*--BLAS2--*/ // for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// /*---------*/ // {
// //T-layout // std::cout << N;
// std::cout << "#GEMV-T" << std::endl; // /* ATIDLAS */
// for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit) // ad::array x(N, dtype), y(N, dtype);
// for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit) // cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// { // ad::array scratch(N, dtype);
// int_t M = *Mit; // ad::scalar s(dtype);
// int_t N = *Nit; // s = dot(x,y); queue.flush(); queue.finish();
// std::cout << M << "," << N; // BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
// /* ATIDLAS */ // /* clAmdBlas */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype); //#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize)); // BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
// /* clAmdBlas */ //#endif
// #ifdef BENCH_CLAMDBLAS // /* BLAS */
// CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize)) //#ifdef BENCH_CBLAS
// #endif // std::vector<float> cx(N), cy(N);
// /* BLAS */ // ad::copy(x, cx);
// #ifdef BENCH_CBLAS // ad::copy(y, cy);
// std::vector<float> cA(N*M), cx(N), cy(M); // BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
// ad::copy(x, cx); //#endif
// ad::copy(y, cy); //#ifdef BENCH_CUBLAS
// ad::copy(A, cA); // T *cux, *cuy;
// CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize)); // T result;
// #endif // cudaMalloc((void**) &cux, N * sizeof(T));
// std::cout << std::endl; // cudaMalloc((void**) &cuy, N * sizeof(T));
// } // BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
// std::cout << "\n\n" << std::flush; // cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
/*---------*/
/*--BLAS2--*/
/*---------*/
//T-layout
std::cout << "#GEMV-T" << std::endl;
for(int_t N: std::vector<int>{64})
for(int_t M: create_full_range(128, 10000, 64))
{
std::cout << M << "," << N;
/* ATIDLAS */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
y = dot(trans(A),x); queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx);
ad::copy(y, cy);
ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA);
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
//// /*---------*/ //// /*---------*/
//// /*--BLAS3--*/ //// /*--BLAS3--*/

View File

@@ -34,9 +34,11 @@ public:
//General constructor //General constructor
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context()); array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
array(array_expression const & proxy); array(array_expression const & proxy);
array(array const &); array(array const &);
template<class T>
array(controller<T> const &);
//Getters //Getters
numeric_type dtype() const; numeric_type dtype() const;
size4 shape() const; size4 shape() const;

View File

@@ -284,6 +284,10 @@ private:
compilation_options_type compilation_options_; compilation_options_type compilation_options_;
}; };
template<class TYPE>
controller<TYPE> control(TYPE const & x, execution_options_type const& execution_options = execution_options_type(),
dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type())
{ return controller<TYPE>(x, execution_options, dispatcher_options, compilation_options); }
class expressions_tuple class expressions_tuple
{ {

View File

@@ -83,23 +83,22 @@ array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const
ld_(ld), context_(context), data_(data) ld_(ld), context_(context), data_(data)
{ } { }
array::array(array_expression const & proxy) : array::array(array_expression const & proxy) : array(control(proxy)){}
dtype_(proxy.dtype()), array::array(array const & other) : array(control(other)){}
shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = proxy;
}
array::array(array const & other) : template<class TYPE>
dtype_(other.dtype()), array::array(controller<TYPE> const & other) :
shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1), dtype_(other.x().dtype()),
context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize()) shape_(other.x().shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.x().context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{ {
*this = other; *this = other;
} }
template array::array(controller<array> const&);
template array::array(controller<array_expression> const&);
/*--- Getters ---*/ /*--- Getters ---*/
numeric_type array::dtype() const numeric_type array::dtype() const
{ return dtype_; } { return dtype_; }

View File

@@ -213,6 +213,7 @@ class ArgumentsHandler:
self.blas3_size = map(int, self.blas3_size) self.blas3_size = map(int, self.blas3_size)
if __name__ == "__main__": if __name__ == "__main__":
atd.state.queue_properties = atd.queue_properties_type.CL_QUEUE_PROFILING_ENABLE
platforms = atd.get_platforms() platforms = atd.get_platforms()
devices = [d for platform in platforms for d in platform.get_devices()] devices = [d for platform in platforms for d in platform.get_devices()]

View File

@@ -222,13 +222,9 @@ def benchmark(template, symbolic):
atd.synchronize(symbolic.context) atd.synchronize(symbolic.context)
current_time = 0 current_time = 0
timings = [] timings = []
while current_time < 1e-1: x, event, cache = atd.flush(symbolic)
time_before = time.time() atd.synchronize(symbolic.context)
x = atd.array(symbolic) return 1e-9*(event.end - event.start)
atd.synchronize(symbolic.context)
timings.append(time.time() - time_before)
current_time = current_time + timings[-1]
return np.median(timings)
def sanitize_string(string, keep_chars = ['_']): def sanitize_string(string, keep_chars = ['_']):

View File

@@ -166,6 +166,16 @@ namespace detail
return res; return res;
} }
template<class T>
std::vector<T> to_vector(bp::list const & list)
{
std::size_t len = bp::len(list);
std::vector<T> res; res.reserve(len);
for(int i = 0 ; i < len ; ++i)
res.push_back(boost::python::extract<T>(list[i]));
return res;
}
bp::list nv_compute_capability(cl::Device const & device) bp::list nv_compute_capability(cl::Device const & device)
{ {
bp::list res; bp::list res;
@@ -288,6 +298,10 @@ namespace detail
wrap_command_queue_info(cl::CommandQueue const & x) wrap_command_queue_info(cl::CommandQueue const & x)
{ return x.getInfo<INFO>(NULL); } { return x.getInfo<INFO>(NULL); }
template<cl_int INFO>
typename cl::detail::param_traits<cl::detail::cl_profiling_info, INFO>::param_type
wrap_profiling_info(cl::Event const & x)
{ return x.getProfilingInfo<INFO>(NULL); }
std::string to_string(cl_device_type type) std::string to_string(cl_device_type type)
{ {
@@ -301,8 +315,20 @@ namespace detail
boost::shared_ptr<cl::Context> make_context(cl::Device const & dev) boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
{ return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); } { return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }
bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
{
cl::Event event;
atd::operation_cache cache;
std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &event, &cache, &cdependencies),
atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));
return bp::make_tuple(*parray, event, cache);
}
} }
struct state_type{ };
state_type state;
void export_cl() void export_cl()
{ {
@@ -362,9 +388,32 @@ void export_cl()
.add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>())); .add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
; ;
bp::class_<cl::Event>("event")
#define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_profiling_info<NAME>)
WRAP("start", CL_PROFILING_COMMAND_START)
WRAP("submit", CL_PROFILING_COMMAND_SUBMIT)
WRAP("end", CL_PROFILING_COMMAND_END)
;
bp::class_<atd::operation_cache>("operation_cache", bp::no_init)
.def("enqueue", &atd::operation_cache::enqueue)
;
bp::def("synchronize", &atd::cl_ext::synchronize); bp::def("synchronize", &atd::cl_ext::synchronize);
bp::def("get_platforms", &detail::get_platforms); bp::def("get_platforms", &detail::get_platforms);
bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
bp::enum_<cl_command_queue_properties>("queue_properties_type")
.value("CL_QUEUE_PROFILING_ENABLE", CL_QUEUE_PROFILING_ENABLE)
.value("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
;
bp::class_<state_type>("state_type")
.def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
;
bp::scope().attr("state") = bp::object(bp::ptr(&state));
} }
namespace detail namespace detail