Now using events to time autotuning

This commit is contained in:
Philippe Tillet
2015-02-06 22:11:03 -05:00
parent 385f007c0b
commit b768e913c9
7 changed files with 163 additions and 95 deletions

View File

@@ -30,7 +30,6 @@ void bench(ad::numeric_type dtype)
while(total_time*1e-9 < 1e-1){\
cl::Event event;\
OP;\
queue.flush();\
queue.finish();\
times.push_back(event.getProfilingInfo<CL_PROFILING_COMMAND_END>() - event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>());\
total_time+=times.back();\
@@ -70,96 +69,114 @@ void bench(ad::numeric_type dtype)
std::cout << " " << PERF << std::flush;\
}
/*---------*/
/*--BLAS1--*/
/*---------*/
std::cout << "#AXPY" << std::endl;
for(auto N : BLAS1_N)
{
std::cout << N;
ad::array x(N, dtype), y(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
/* ATIDLAS */
y = x + y; queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
/* clAmdBlas */
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
#endif
/* BLAS */
#ifdef BENCH_CBLAS
std::vector<float> cx(N), cy(N);
ad::copy(x, cx);
ad::copy(y, cy);
BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
#endif
/* CuBLAS */
#ifdef BENCH_CUBLAS
T *cux, *cuy;
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, N * sizeof(T));
BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
// std::cout << "#DOT" << std::endl;
// for(std::vector<int_t>::const_iterator it = BLAS1_N.begin() ; it != BLAS1_N.end() ; ++it)
// /*---------*/
// /*--BLAS1--*/
// /*---------*/
// std::cout << "#AXPY" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// int_t N = *it;
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// CL_BENCHMARK(s = dot(x,y), bandwidth(2*N, tres, dtsize));
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// /* ATIDLAS */
// y = x + y; queue.flush(); queue.finish();
// BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(x + y, ad::execution_options_type(0, &event)), 3*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &ad::cl_ext::get_queue(x.context(), 0)(), 0, NULL, NULL), bandwidth(2*N, tres, dtsize))
// BENCHMARK_OPENCL(clAmdBlasSaxpy(N, 1, x.data()(), 0, 1, y.data()(), 0, 1, 1, &queue(), 0, NULL, &event()), 3*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// CPU_BENCHMARK(cblas_sdot(N, cx.data(), 1, cy.data(), 1), bandwidth(2*N, tres, dtsize));
// BENCHMARK_HOST(cblas_saxpy(N, 1, cx.data(), 1, cy.data(), 1), 3*N*dtsize/t);
//#endif
// /* CuBLAS */
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSaxpy(N, 2, cux, 1, cuy, 1), 3*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// /*---------*/
// /*--BLAS2--*/
// /*---------*/
// //T-layout
// std::cout << "#GEMV-T" << std::endl;
// for(std::vector<int>::const_iterator Mit = BLAS2_M.begin() ; Mit != BLAS2_M.end() ; ++Mit)
// for(std::vector<int_t>::const_iterator Nit = BLAS2_N.begin() ; Nit != BLAS2_N.end() ; ++Nit)
// {
// int_t M = *Mit;
// int_t N = *Nit;
// std::cout << M << "," << N;
// /* ATIDLAS */
// ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
// CL_BENCHMARK(y = dot(trans(A),x), bandwidth(M*N + M + N, tres, dtsize));
// /* clAmdBlas */
// #ifdef BENCH_CLAMDBLAS
// CL_BENCHMARK(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &ad::cl_ext::get_queue(x.context(), 0)(),0, NULL, NULL), bandwidth(M*N + M + N, tres, dtsize))
// #endif
// /* BLAS */
// #ifdef BENCH_CBLAS
// std::vector<float> cA(N*M), cx(N), cy(M);
// ad::copy(x, cx);
// ad::copy(y, cy);
// ad::copy(A, cA);
// CPU_BENCHMARK(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), bandwidth(M*N + M + N, tres, dtsize));
// #endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
// std::cout << "#DOT" << std::endl;
// for(int_t N : create_log_range(1e3, 2e7, 50, 64))
// {
// std::cout << N;
// /* ATIDLAS */
// ad::array x(N, dtype), y(N, dtype);
// cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
// ad::array scratch(N, dtype);
// ad::scalar s(dtype);
// s = dot(x,y); queue.flush(); queue.finish();
// BENCHMARK_OPENCL(s = ad::controller<atidlas::array_expression>(dot(x,y), ad::execution_options_type(0, &event)), 2*N*dtsize/t)
// /* clAmdBlas */
//#ifdef BENCH_CLAMDBLAS
// BENCHMARK_OPENCL(clAmdBlasSdot(N, s.data()(), 0, x.data()(), 0, 1, y.data()(), 0, 1, scratch.data()(), 1, &queue(), 0, NULL, &event()), 2*N*dtsize/t)
//#endif
// /* BLAS */
//#ifdef BENCH_CBLAS
// std::vector<float> cx(N), cy(N);
// ad::copy(x, cx);
// ad::copy(y, cy);
// BENCHMARK_HOST(cblas_sdot(N, cx.data(), 1, cy.data(), 1), 2*N*dtsize/t);
//#endif
//#ifdef BENCH_CUBLAS
// T *cux, *cuy;
// T result;
// cudaMalloc((void**) &cux, N * sizeof(T));
// cudaMalloc((void**) &cuy, N * sizeof(T));
// BENCHMARK_CUDA(cublasSdot(N, cux, 1, cuy, 1, &result), 2*N*dtsize/t)
// cudaFree(cux);
// cudaFree(cuy);
//#endif
// std::cout << std::endl;
// }
// std::cout << "\n\n" << std::flush;
/*---------*/
/*--BLAS2--*/
/*---------*/
//T-layout
std::cout << "#GEMV-T" << std::endl;
for(int_t N: std::vector<int>{64})
for(int_t M: create_full_range(128, 10000, 64))
{
std::cout << M << "," << N;
/* ATIDLAS */
ad::array A(N, M, dtype), y(M, dtype), x(N, dtype);
cl::CommandQueue & queue = ad::cl_ext::queues[x.context()][0];
y = dot(trans(A),x); queue.flush(); queue.finish();
BENCHMARK_OPENCL(y = ad::controller<atidlas::array_expression>(dot(trans(A),x), ad::execution_options_type(0, &event)),(M*N + M + N)*dtsize/t);
#ifdef BENCH_CLAMDBLAS
BENCHMARK_OPENCL(clAmdBlasSgemv(clAmdBlasColumnMajor, clAmdBlasTrans, N, M, 1, A.data()(), A.ld(), x.data()(), 0, 1, 0, y.data()(), 0, 1, 1, &queue(),0, NULL, &event()), (M*N + M + N)*dtsize/t)
#endif
#ifdef BENCH_CBLAS
std::vector<float> cA(N*M), cx(N), cy(M);
ad::copy(x, cx);
ad::copy(y, cy);
ad::copy(A, cA);
BENCHMARK_HOST(cblas_sgemv(CblasColMajor, CblasTrans, N, M, 1, cA.data(), N, cx.data(), 1, 0, cy.data(), 1), (M*N + M + N)*dtsize/t);
#endif
#ifdef BENCH_CUBLAS
T *cuA, *cux, *cuy;
cudaMalloc((void**) &cuA, N * M * sizeof(T));
cudaMalloc((void**) &cux, N * sizeof(T));
cudaMalloc((void**) &cuy, M * sizeof(T));
BENCHMARK_CUDA(cublasSgemv(cublasTrans, N, M, 1, cuA, N, cux, 1, 0, cuy, 1), (M*N + M + N)*dtsize/t)
cudaFree(cuA);
cudaFree(cux);
cudaFree(cuy);
#endif
std::cout << std::endl;
}
std::cout << "\n\n" << std::flush;
//// /*---------*/
//// /*--BLAS3--*/

View File

@@ -34,9 +34,11 @@ public:
//General constructor
array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const & s2, int_t ld, cl::Context context = cl_ext::default_context());
array(array_expression const & proxy);
array(array const &);
template<class T>
array(controller<T> const &);
//Getters
numeric_type dtype() const;
size4 shape() const;

View File

@@ -284,6 +284,10 @@ private:
compilation_options_type compilation_options_;
};
template<class TYPE>
controller<TYPE> control(TYPE const & x, execution_options_type const& execution_options = execution_options_type(),
dispatcher_options_type const & dispatcher_options = dispatcher_options_type(), compilation_options_type const & compilation_options = compilation_options_type())
{ return controller<TYPE>(x, execution_options, dispatcher_options, compilation_options); }
class expressions_tuple
{

View File

@@ -83,23 +83,22 @@ array::array(numeric_type dtype, cl::Buffer data, slice const & s1, slice const
ld_(ld), context_(context), data_(data)
{ }
array::array(array_expression const & proxy) :
dtype_(proxy.dtype()),
shape_(proxy.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(proxy.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = proxy;
}
array::array(array_expression const & proxy) : array(control(proxy)){}
array::array(array const & other) : array(control(other)){}
array::array(array const & other) :
dtype_(other.dtype()),
shape_(other.shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
template<class TYPE>
array::array(controller<TYPE> const & other) :
dtype_(other.x().dtype()),
shape_(other.x().shape()), start_(0,0), stride_(1, 1), ld_(shape_._1),
context_(other.x().context()), data_(context_, CL_MEM_READ_WRITE, size_of(dtype_)*dsize())
{
*this = other;
}
template array::array(controller<array> const&);
template array::array(controller<array_expression> const&);
/*--- Getters ---*/
numeric_type array::dtype() const
{ return dtype_; }

View File

@@ -213,7 +213,8 @@ class ArgumentsHandler:
self.blas3_size = map(int, self.blas3_size)
if __name__ == "__main__":
atd.state.queue_properties = atd.queue_properties_type.CL_QUEUE_PROFILING_ENABLE
platforms = atd.get_platforms()
devices = [d for platform in platforms for d in platform.get_devices()]

View File

@@ -222,13 +222,9 @@ def benchmark(template, symbolic):
atd.synchronize(symbolic.context)
current_time = 0
timings = []
while current_time < 1e-1:
time_before = time.time()
x = atd.array(symbolic)
atd.synchronize(symbolic.context)
timings.append(time.time() - time_before)
current_time = current_time + timings[-1]
return np.median(timings)
x, event, cache = atd.flush(symbolic)
atd.synchronize(symbolic.context)
return 1e-9*(event.end - event.start)
def sanitize_string(string, keep_chars = ['_']):

View File

@@ -166,6 +166,16 @@ namespace detail
return res;
}
template<class T>
std::vector<T> to_vector(bp::list const & list)
{
std::size_t len = bp::len(list);
std::vector<T> res; res.reserve(len);
for(int i = 0 ; i < len ; ++i)
res.push_back(boost::python::extract<T>(list[i]));
return res;
}
bp::list nv_compute_capability(cl::Device const & device)
{
bp::list res;
@@ -288,6 +298,10 @@ namespace detail
wrap_command_queue_info(cl::CommandQueue const & x)
{ return x.getInfo<INFO>(NULL); }
template<cl_int INFO>
typename cl::detail::param_traits<cl::detail::cl_profiling_info, INFO>::param_type
wrap_profiling_info(cl::Event const & x)
{ return x.getProfilingInfo<INFO>(NULL); }
std::string to_string(cl_device_type type)
{
@@ -301,8 +315,20 @@ namespace detail
boost::shared_ptr<cl::Context> make_context(cl::Device const & dev)
{ return boost::shared_ptr<cl::Context>(new cl::Context(std::vector<cl::Device>(1, dev))); }
bp::tuple flush(atd::array_expression const & expression, unsigned int queue_id, bp::list dependencies, int label, std::string const & program_name, bool force_recompile)
{
cl::Event event;
atd::operation_cache cache;
std::vector<cl::Event> cdependencies = to_vector<cl::Event>(dependencies);
boost::shared_ptr<atd::array> parray(new atd::array(atd::control(expression, atd::execution_options_type(queue_id, &event, &cache, &cdependencies),
atd::dispatcher_options_type(label), atd::compilation_options_type(program_name, force_recompile))));
return bp::make_tuple(*parray, event, cache);
}
}
struct state_type{ };
state_type state;
void export_cl()
{
@@ -362,9 +388,32 @@ void export_cl()
.add_property("models", bp::make_function(&atd::get_model_map, bp::return_internal_reference<>()));
;
bp::class_<cl::Event>("event")
#define WRAP(PYNAME, NAME) .add_property(PYNAME, &detail::wrap_profiling_info<NAME>)
WRAP("start", CL_PROFILING_COMMAND_START)
WRAP("submit", CL_PROFILING_COMMAND_SUBMIT)
WRAP("end", CL_PROFILING_COMMAND_END)
;
bp::class_<atd::operation_cache>("operation_cache", bp::no_init)
.def("enqueue", &atd::operation_cache::enqueue)
;
bp::def("synchronize", &atd::cl_ext::synchronize);
bp::def("get_platforms", &detail::get_platforms);
bp::def("flush", &detail::flush, (bp::arg("expression"), bp::arg("queue_id") = 0, bp::arg("dependencies")=bp::list(), bp::arg("label")=-1, bp::arg("program_name")="", bp::arg("recompile") = false));
bp::enum_<cl_command_queue_properties>("queue_properties_type")
.value("CL_QUEUE_PROFILING_ENABLE", CL_QUEUE_PROFILING_ENABLE)
.value("CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE", CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE)
;
bp::class_<state_type>("state_type")
.def_readwrite("queue_properties",&atd::cl_ext::queue_properties)
;
bp::scope().attr("state") = bp::object(bp::ptr(&state));
}
namespace detail