Bench: now using host timer rather than events
This commit is contained in:
@@ -21,16 +21,12 @@
|
|||||||
namespace sc = isaac;
|
namespace sc = isaac;
|
||||||
typedef sc::int_t int_t;
|
typedef sc::int_t int_t;
|
||||||
|
|
||||||
static long time_event(long sum, sc::driver::Event const & e)
|
|
||||||
{
|
|
||||||
return sum + e.elapsed_time();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
void bench(sc::numeric_type dtype, std::string operation)
|
void bench(sc::numeric_type dtype, std::string operation)
|
||||||
{
|
{
|
||||||
|
|
||||||
|
Timer tmr;
|
||||||
|
|
||||||
//
|
//
|
||||||
// MACROS FOR BENCHMARKING
|
// MACROS FOR BENCHMARKING
|
||||||
//
|
//
|
||||||
@@ -39,16 +35,18 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
|
|
||||||
#define BENCHMARK_ISAAC(OP, PERF) \
|
#define BENCHMARK_ISAAC(OP, PERF) \
|
||||||
{\
|
{\
|
||||||
std::vector<double> times;\
|
std::vector<long> times;\
|
||||||
double total_time = 0;\
|
double total_time = 0;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
|
||||||
std::list<sc::driver::Event> events;\
|
|
||||||
OP;\
|
OP;\
|
||||||
queue.synchronize();\
|
queue.synchronize();\
|
||||||
times.push_back((double)std::accumulate(events.begin(), events.end(), 0, &time_event));\
|
while(total_time*1e-9 < 1e-1){\
|
||||||
|
tmr.start();\
|
||||||
|
OP;\
|
||||||
|
queue.synchronize();\
|
||||||
|
times.push_back(tmr.get().count());\
|
||||||
total_time+=times.back();\
|
total_time+=times.back();\
|
||||||
}\
|
}\
|
||||||
double t = mean(times);\
|
double t = min(times);\
|
||||||
std::cout << " " << (int)(PERF) << std::flush;\
|
std::cout << " " << (int)(PERF) << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -56,30 +54,32 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
{\
|
{\
|
||||||
std::vector<long> times;\
|
std::vector<long> times;\
|
||||||
double total_time = 0;\
|
double total_time = 0;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
|
||||||
cl_event event;\
|
|
||||||
OP;\
|
OP;\
|
||||||
queue.synchronize();\
|
queue.synchronize();\
|
||||||
times.push_back(sc::driver::Event(event).elapsed_time());\
|
while(total_time*1e-9 < 1e-1){\
|
||||||
|
tmr.start();\
|
||||||
|
OP;\
|
||||||
|
queue.synchronize();\
|
||||||
|
times.push_back(tmr.get().count());\
|
||||||
total_time+=times.back();\
|
total_time+=times.back();\
|
||||||
}\
|
}\
|
||||||
double t = mean(times);\
|
double t = min(times);\
|
||||||
std::cout << " " << (int)(PERF) << std::flush;\
|
std::cout << " " << (int)(PERF) << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
#define BENCHMARK_HOST(OP, PERF) \
|
#define BENCHMARK_HOST(OP, PERF) \
|
||||||
{\
|
{\
|
||||||
Timer tmr;\
|
|
||||||
long total_time = 0;\
|
long total_time = 0;\
|
||||||
std::vector<long> times;\
|
std::vector<long> times;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
OP;\
|
||||||
|
while(total_time*1e-9 < 1e-1){\
|
||||||
tmr.start();\
|
tmr.start();\
|
||||||
OP;\
|
OP;\
|
||||||
long time = tmr.get().count();\
|
long time = tmr.get().count();\
|
||||||
times.push_back(time);\
|
times.push_back(time);\
|
||||||
total_time += time;\
|
total_time += time;\
|
||||||
}\
|
}\
|
||||||
double t = mean(times);\
|
double t = min(times);\
|
||||||
std::cout << " " << (int)(PERF) << std::flush;\
|
std::cout << " " << (int)(PERF) << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -91,7 +91,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
cudaEvent_t start, stop;\
|
cudaEvent_t start, stop;\
|
||||||
cudaEventCreate(&start);\
|
cudaEventCreate(&start);\
|
||||||
cudaEventCreate(&stop);\
|
cudaEventCreate(&stop);\
|
||||||
while(total_time*1e-3 < 1e-2){\
|
while(total_time*1e-3 < 1e-1){\
|
||||||
cudaEventRecord(start,0);\
|
cudaEventRecord(start,0);\
|
||||||
OP;\
|
OP;\
|
||||||
cudaEventRecord(stop,0);\
|
cudaEventRecord(stop,0);\
|
||||||
@@ -100,7 +100,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
times.push_back(time*1e6);\
|
times.push_back(time*1e6);\
|
||||||
total_time+=time;\
|
total_time+=time;\
|
||||||
}\
|
}\
|
||||||
double t = mean(times);\
|
double t = min(times);\
|
||||||
std::cout << " " << (int)(PERF) << std::flush;\
|
std::cout << " " << (int)(PERF) << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,13 +138,12 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
std::cout << N;
|
std::cout << N;
|
||||||
sc::array x(N, dtype), y(N, dtype);
|
sc::array x(N, dtype), y(N, dtype);
|
||||||
/* ISAAC */
|
/* ISAAC */
|
||||||
std::list<sc::driver::Event> events;
|
BENCHMARK_ISAAC(y = sc::execution_handler(x + alpha*y), 3*N*dtsize/t)
|
||||||
BENCHMARK_ISAAC(y = sc::execution_handler(x + alpha*y, sc::execution_options_type(0, &events)), 3*N*dtsize/t)
|
BENCHMARK_ISAAC(y = sc::execution_handler(x + alpha*y, sc::execution_options_type(), sc::dispatcher_options_type(true)), 3*N*dtsize/t)
|
||||||
BENCHMARK_ISAAC(y = sc::execution_handler(x + alpha*y, sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), 3*N*dtsize/t)
|
|
||||||
/* clblas */
|
/* clblas */
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
if(x.context().backend()==sc::driver::OPENCL)
|
if(x.context().backend()==sc::driver::OPENCL)
|
||||||
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, &event), 3*N*dtsize/t);
|
BENCHMARK_CLBLAS(clblasSaxpy(N, alpha, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue), 0, NULL, NULL), 3*N*dtsize/t);
|
||||||
#endif
|
#endif
|
||||||
/* BLAS */
|
/* BLAS */
|
||||||
#ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
@@ -171,11 +170,11 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
sc::array scratch(N, dtype);
|
sc::array scratch(N, dtype);
|
||||||
sc::scalar s(dtype);
|
sc::scalar s(dtype);
|
||||||
s = dot(x,y); queue.synchronize();
|
s = dot(x,y); queue.synchronize();
|
||||||
BENCHMARK_ISAAC(s = sc::execution_handler(dot(x,y), sc::execution_options_type(0, &events)), 2*N*dtsize/t)
|
BENCHMARK_ISAAC(s = sc::execution_handler(dot(x,y)), 2*N*dtsize/t)
|
||||||
/* clblas */
|
/* clblas */
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
if(x.context().backend()==sc::driver::OPENCL)
|
if(x.context().backend()==sc::driver::OPENCL)
|
||||||
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, &event), 2*N*dtsize/t)
|
BENCHMARK_CLBLAS(clblasSdot(N, CL_HANDLE(s.data()), 0, CL_HANDLE(x.data()), 0, 1, CL_HANDLE(y.data()), 0, 1, CL_HANDLE(scratch.data()), 1, &CL_HANDLE(queue), 0, NULL, NULL), 2*N*dtsize/t)
|
||||||
#endif
|
#endif
|
||||||
/* BLAS */
|
/* BLAS */
|
||||||
#ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
@@ -234,11 +233,11 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
#ifdef HAS_A_BLAS
|
#ifdef HAS_A_BLAS
|
||||||
int_t lda = A.stride()[1];
|
int_t lda = A.stride()[1];
|
||||||
#endif
|
#endif
|
||||||
BENCHMARK_ISAAC(y = sc::execution_handler(AT?dot(A.T,x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
BENCHMARK_ISAAC(y = sc::execution_handler(AT?dot(A.T,x):dot(A,x)),(M*N + M + N)*dtsize/t);
|
||||||
BENCHMARK_ISAAC(y = sc::execution_handler(AT?dot(A.T,x):dot(A,x), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
BENCHMARK_ISAAC(y = sc::execution_handler(AT?dot(A.T,x):dot(A,x), sc::execution_options_type(), sc::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
if(y.context().backend()==sc::driver::OPENCL)
|
if(y.context().backend()==sc::driver::OPENCL)
|
||||||
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
|
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, NULL), (M*N + M + N)*dtsize/t)
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
std::vector<float> cA(M*N), cx(N), cy(M);
|
std::vector<float> cA(M*N), cx(N), cy(M);
|
||||||
@@ -313,13 +312,13 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
#ifdef HAS_A_BLAS
|
#ifdef HAS_A_BLAS
|
||||||
int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
|
int_t lda = A.stride()[1], ldb = B.stride()[1], ldc = C.stride()[1];
|
||||||
#endif
|
#endif
|
||||||
BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
|
BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B))), (double)2*M*N*K/t);
|
||||||
// BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
// BENCHMARK_ISAAC(C = sc::execution_handler(AT?(BT?dot(A.T,B.T):dot(A.T,B)):(BT?dot(A,B.T):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
||||||
/* clblas */
|
/* clblas */
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
if(C.context().backend()==sc::driver::OPENCL)
|
if(C.context().backend()==sc::driver::OPENCL)
|
||||||
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
BENCHMARK_CLBLAS(clblasSgemm(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, BT?clblasTrans:clblasNoTrans, M, N, K, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(B.data()), 0, ldb,
|
||||||
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, &event), (double)2*M*N*K/t)
|
0, CL_HANDLE(C.data()), 0, ldc, 1, &CL_HANDLE(queue),0, NULL, NULL), (double)2*M*N*K/t)
|
||||||
#endif
|
#endif
|
||||||
/* BLAS */
|
/* BLAS */
|
||||||
#ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
|
@@ -2,6 +2,7 @@
|
|||||||
#define ISAAC_BENCH_COMMON_HPP_
|
#define ISAAC_BENCH_COMMON_HPP_
|
||||||
|
|
||||||
#include <chrono>
|
#include <chrono>
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
template<std::size_t> struct int_{};
|
template<std::size_t> struct int_{};
|
||||||
|
|
||||||
@@ -61,6 +62,12 @@ T median(std::vector<T> x)
|
|||||||
return x[size / 2];
|
return x[size / 2];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<class T>
|
||||||
|
T min(std::vector<T> x)
|
||||||
|
{
|
||||||
|
return *std::min_element(x.begin(), x.end());
|
||||||
|
}
|
||||||
|
|
||||||
template<class T>
|
template<class T>
|
||||||
T mean(std::vector<T> x)
|
T mean(std::vector<T> x)
|
||||||
{
|
{
|
||||||
|
Reference in New Issue
Block a user