GEMV: bugfix with CUDA
This commit is contained in:
@@ -41,7 +41,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
{\
|
{\
|
||||||
std::vector<double> times;\
|
std::vector<double> times;\
|
||||||
double total_time = 0;\
|
double total_time = 0;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
while(total_time*1e-9 < 1e-1){\
|
||||||
std::list<sc::driver::Event> events;\
|
std::list<sc::driver::Event> events;\
|
||||||
OP;\
|
OP;\
|
||||||
queue.synchronize();\
|
queue.synchronize();\
|
||||||
@@ -56,7 +56,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
{\
|
{\
|
||||||
std::vector<long> times;\
|
std::vector<long> times;\
|
||||||
double total_time = 0;\
|
double total_time = 0;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
while(total_time*1e-9 < 1e-1){\
|
||||||
cl_event event;\
|
cl_event event;\
|
||||||
OP;\
|
OP;\
|
||||||
queue.synchronize();\
|
queue.synchronize();\
|
||||||
@@ -72,7 +72,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
Timer tmr;\
|
Timer tmr;\
|
||||||
long total_time = 0;\
|
long total_time = 0;\
|
||||||
std::vector<long> times;\
|
std::vector<long> times;\
|
||||||
while(total_time*1e-9 < 1e-2){\
|
while(total_time*1e-9 < 1e-1){\
|
||||||
tmr.start();\
|
tmr.start();\
|
||||||
OP;\
|
OP;\
|
||||||
long time = tmr.get().count();\
|
long time = tmr.get().count();\
|
||||||
@@ -93,7 +93,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
cudaEventCreate(&stop);\
|
cudaEventCreate(&stop);\
|
||||||
OP;\
|
OP;\
|
||||||
cudaThreadSynchronize();\
|
cudaThreadSynchronize();\
|
||||||
while(total_time*1e-3 < 1e-2){\
|
while(total_time*1e-3 < 1e-1){\
|
||||||
cudaEventRecord(start,0);\
|
cudaEventRecord(start,0);\
|
||||||
OP;\
|
OP;\
|
||||||
cudaEventRecord(stop,0);\
|
cudaEventRecord(stop,0);\
|
||||||
@@ -103,7 +103,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
total_time+=time;\
|
total_time+=time;\
|
||||||
}\
|
}\
|
||||||
double t = mean(times);\
|
double t = mean(times);\
|
||||||
std::cout << "\t" << (int)(PERF) << std::flush;\
|
std::cout << " " << (int)(PERF) << std::flush;\
|
||||||
}
|
}
|
||||||
|
|
||||||
unsigned int dtsize = sc::size_of(dtype);
|
unsigned int dtsize = sc::size_of(dtype);
|
||||||
@@ -111,16 +111,17 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
|
std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
|
||||||
sc::array flush((int)1e6, sc::FLOAT_TYPE);
|
sc::array flush((int)1e6, sc::FLOAT_TYPE);
|
||||||
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
|
||||||
std::cout << "N";
|
std::cout << "\"N\"";
|
||||||
std::cout << "\tISAAC";
|
std::cout << " \"ISAAC (Pred impl.)\"";
|
||||||
|
std::cout << " \"ISAAC (Best impl.)\"";
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
std::cout << "\tclBLAS";
|
std::cout << " \"clBLAS\"";
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CBLAS
|
#ifdef BENCH_CBLAS
|
||||||
std::cout << "\tBLAS";
|
std::cout << " \"BLAS\"";
|
||||||
#endif
|
#endif
|
||||||
#ifdef BENCH_CUBLAS
|
#ifdef BENCH_CUBLAS
|
||||||
std::cout << "\tcuBLAS";
|
std::cout << " \"cuBLAS\"";
|
||||||
#endif
|
#endif
|
||||||
std::cout << std::endl;
|
std::cout << std::endl;
|
||||||
//
|
//
|
||||||
@@ -194,21 +195,23 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
if(operation.substr(0, 4)=="gemv")
|
if(operation.substr(0, 4)=="gemv")
|
||||||
{
|
{
|
||||||
std::vector<std::tuple<char,int_t, int_t> > MNs;
|
std::vector<std::tuple<char,int_t, int_t> > MNs;
|
||||||
MNs.push_back(std::make_tuple('N',896,896));
|
//Linear System
|
||||||
MNs.push_back(std::make_tuple('N',3072,3072));
|
MNs.push_back(std::make_tuple('N',153,153));
|
||||||
//AlexNet
|
MNs.push_back(std::make_tuple('N',1024,1024));
|
||||||
MNs.push_back(std::make_tuple('N',1000,256));
|
MNs.push_back(std::make_tuple('N',2867,2867));
|
||||||
MNs.push_back(std::make_tuple('N',4096,256));
|
|
||||||
|
|
||||||
MNs.push_back(std::make_tuple('T',169,256));
|
//Normalization
|
||||||
MNs.push_back(std::make_tuple('T',169,384));
|
MNs.push_back(std::make_tuple('N', 32, 60000));
|
||||||
MNs.push_back(std::make_tuple('T',729,256));
|
MNs.push_back(std::make_tuple('N', 256, 60000));
|
||||||
MNs.push_back(std::make_tuple('T',3025,96));
|
|
||||||
|
//Householder
|
||||||
|
MNs.push_back(std::make_tuple('N', 100, 60000));
|
||||||
|
MNs.push_back(std::make_tuple('N', 90, 60000));
|
||||||
|
MNs.push_back(std::make_tuple('N', 50, 60000));
|
||||||
|
|
||||||
/*---------*/
|
/*---------*/
|
||||||
/*--BLAS2--*/
|
/*--BLAS2--*/
|
||||||
/*---------*/
|
/*---------*/
|
||||||
//T-layout
|
|
||||||
for(std::tuple<char, int_t, int_t> MN: MNs)
|
for(std::tuple<char, int_t, int_t> MN: MNs)
|
||||||
{
|
{
|
||||||
bool AT = std::get<0>(MN) == 'T';
|
bool AT = std::get<0>(MN) == 'T';
|
||||||
@@ -224,6 +227,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
int_t lda = A.ld();
|
int_t lda = A.ld();
|
||||||
#endif
|
#endif
|
||||||
BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
|
||||||
|
BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
if(y.context().backend()==sc::driver::OPENCL)
|
if(y.context().backend()==sc::driver::OPENCL)
|
||||||
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
|
BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
|
||||||
@@ -274,14 +278,14 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
// MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128));
|
// MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128));
|
||||||
// MNKs.push_back(std::make_tuple("Conv. Gradient-2 [LeNet-2]",'N','T',64,500,50));
|
// MNKs.push_back(std::make_tuple("Conv. Gradient-2 [LeNet-2]",'N','T',64,500,50));
|
||||||
|
|
||||||
//Covariance (e.g., ICA, 10minutes/1khz)
|
//Covariance (e.g., ICA, 10minutes/100Hz)
|
||||||
MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,600000));
|
MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,60000));
|
||||||
MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,600000));
|
MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,60000));
|
||||||
|
|
||||||
// //Bi-diagonalization
|
// //Bi-diagonalization
|
||||||
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 1]",'N','T',4096,4096,32));
|
MNKs.push_back(std::make_tuple("Householder [Iteration 1]",'N','T',4096,4096,32));
|
||||||
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 10]",'N','T',3456,3456,32));
|
MNKs.push_back(std::make_tuple("Householder [Iteration 10]",'N','T',3456,3456,32));
|
||||||
MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 50]",'N','T',896,896,32));
|
MNKs.push_back(std::make_tuple("Householder [Iteration 50]",'N','T',896,896,32));
|
||||||
|
|
||||||
/*---------*/
|
/*---------*/
|
||||||
/*--BLAS3--*/
|
/*--BLAS3--*/
|
||||||
@@ -305,6 +309,7 @@ void bench(sc::numeric_type dtype, std::string operation)
|
|||||||
#ifdef HAS_A_BLAS
|
#ifdef HAS_A_BLAS
|
||||||
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
|
||||||
#endif
|
#endif
|
||||||
|
BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
|
||||||
BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
|
||||||
/* clblas */
|
/* clblas */
|
||||||
#ifdef BENCH_CLBLAS
|
#ifdef BENCH_CLBLAS
|
||||||
|
@@ -105,7 +105,7 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
|
|||||||
for (const auto & e : dots){
|
for (const auto & e : dots){
|
||||||
std::string data_type = append_width("#scalartype",col_simd_width);
|
std::string data_type = append_width("#scalartype",col_simd_width);
|
||||||
|
|
||||||
stream << e->process(data_type + " #name_acc = " + neutral_element((e)->root_op(), backend, "#scalartype") + ";") << std::endl;
|
stream << e->process(data_type + " #name_acc = " + InitPrefix(backend, data_type).get() + "(" + neutral_element((e)->root_op(), backend, "#scalartype") + ");") << std::endl;
|
||||||
}
|
}
|
||||||
|
|
||||||
stream << "if (r < M)" << std::endl;
|
stream << "if (r < M)" << std::endl;
|
||||||
@@ -122,13 +122,13 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
|
|||||||
if(dot_type_==REDUCE_COLUMNS)
|
if(dot_type_==REDUCE_COLUMNS)
|
||||||
{
|
{
|
||||||
std::string data_type = append_width("#scalartype",row_simd_width);
|
std::string data_type = append_width("#scalartype",row_simd_width);
|
||||||
accessors["array2"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "c*#stride", "#pointer + r*#ld", backend)+";";
|
accessors["array2"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "c*#stride", "#pointer + r*#ld", backend,false)+";";
|
||||||
accessors["repeat"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ", backend)+";";
|
accessors["repeat"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ", backend,false)+";";
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
std::string data_type = append_width("#scalartype",col_simd_width);
|
std::string data_type = append_width("#scalartype",col_simd_width);
|
||||||
accessors["array2"] = data_type + " #namereg = " + vload(col_simd_width, "#scalartype", "0", "#pointer + r*#stride + c*#ld", backend) + ";";
|
accessors["array2"] = data_type + " #namereg = " + vload(col_simd_width, "#scalartype", "0", "#pointer + r*#stride + c*#ld", backend,false) + ";";
|
||||||
accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};";
|
accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};";
|
||||||
}
|
}
|
||||||
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
|
||||||
@@ -206,8 +206,8 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
|
|||||||
if(col_simd_width > 1)
|
if(col_simd_width > 1)
|
||||||
stream << "if(M - r > " << col_simd_width << "){" << std::endl;
|
stream << "if(M - r > " << col_simd_width << "){" << std::endl;
|
||||||
if (e->is_index_dot())
|
if (e->is_index_dot())
|
||||||
stream << e->process(vstore(col_simd_width,"uint", "#name_buf_value[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp_value + r + M*" + GroupIdx0(backend).get(),backend)) << ";" << std::endl;
|
stream << e->process(vstore(col_simd_width,"uint", "#name_buf_value[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp_value + r + M*" + GroupIdx0(backend).get(),backend, false)) << ";" << std::endl;
|
||||||
stream << e->process(vstore(col_simd_width,"#scalartype", "#name_buf[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp + r + M*" + GroupIdx0(backend).get(),backend)) << ";" << std::endl;
|
stream << e->process(vstore(col_simd_width,"#scalartype", "#name_buf[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp + r + M*" + GroupIdx0(backend).get(),backend, false)) << ";" << std::endl;
|
||||||
if(col_simd_width > 1)
|
if(col_simd_width > 1)
|
||||||
{
|
{
|
||||||
stream << "}" << std::endl;
|
stream << "}" << std::endl;
|
||||||
|
@@ -39,7 +39,7 @@ inline std::string append_width(std::string const & str, unsigned int width)
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline std::string vstore(unsigned int simd_width, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, driver::backend_type backend)
|
inline std::string vstore(unsigned int simd_width, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, driver::backend_type backend, bool aligned = true)
|
||||||
{
|
{
|
||||||
std::string vdtype = append_width(dtype,simd_width);
|
std::string vdtype = append_width(dtype,simd_width);
|
||||||
if (simd_width==1)
|
if (simd_width==1)
|
||||||
@@ -49,7 +49,15 @@ inline std::string vstore(unsigned int simd_width, std::string const & dtype, st
|
|||||||
switch(backend)
|
switch(backend)
|
||||||
{
|
{
|
||||||
case driver::CUDA:
|
case driver::CUDA:
|
||||||
return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
|
if(aligned)
|
||||||
|
return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
std::string res;
|
||||||
|
for(unsigned int s = 0 ; s < simd_width ; ++s)
|
||||||
|
res += (s>0?";(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + "] = " + access_vector_type(value, s);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
case driver::OPENCL:
|
case driver::OPENCL:
|
||||||
return append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
|
return append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
|
||||||
default:
|
default:
|
||||||
@@ -75,7 +83,7 @@ inline std::string vload(unsigned int simd_width, std::string const & dtype, std
|
|||||||
{
|
{
|
||||||
std::string res = "make_" + vdtype + "(";
|
std::string res = "make_" + vdtype + "(";
|
||||||
for(unsigned int s = 0 ; s < simd_width ; ++s)
|
for(unsigned int s = 0 ; s < simd_width ; ++s)
|
||||||
res += ((s>0)?",(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + "]";
|
res += ((s>0)?",(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + "]";
|
||||||
res += ")";
|
res += ")";
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
@@ -31,12 +31,12 @@ def train(X, Y, profiles):
|
|||||||
Y = Y[p,:]
|
Y = Y[p,:]
|
||||||
|
|
||||||
#Train the.profile
|
#Train the.profile
|
||||||
cut = int(.9*M)
|
cut = int(.5*M)
|
||||||
XTr, YTr = X[:cut,:], Y[:cut,:]
|
XTr, YTr = X[:cut,:], Y[:cut,:]
|
||||||
XCv, YCv = X[cut:,:], Y[cut:,:]
|
XCv, YCv = X[cut:,:], Y[cut:,:]
|
||||||
|
|
||||||
nrmses = {}
|
nrmses = {}
|
||||||
for N in range(1,min(M+1,10)):
|
for N in range(1,min(M+1,20)):
|
||||||
for depth in range(1,min(M+1,20)):
|
for depth in range(1,min(M+1,20)):
|
||||||
clf = RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
|
clf = RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
|
||||||
t = np.argmax(clf.predict(XCv), axis = 1)
|
t = np.argmax(clf.predict(XCv), axis = 1)
|
||||||
|
@@ -93,7 +93,7 @@ class Tuner:
|
|||||||
(1200,128,729),
|
(1200,128,729),
|
||||||
(363,96,3025)]
|
(363,96,3025)]
|
||||||
elif level=='full':
|
elif level=='full':
|
||||||
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15))
|
sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
|
||||||
|
|
||||||
#Remove duplicates and or too small/big tuples
|
#Remove duplicates and or too small/big tuples
|
||||||
sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1]
|
sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1]
|
||||||
|
Reference in New Issue
Block a user