diff --git a/bench/blas.cpp b/bench/blas.cpp
index 21093308a..af516c2fd 100644
--- a/bench/blas.cpp
+++ b/bench/blas.cpp
@@ -41,7 +41,7 @@ void bench(sc::numeric_type dtype, std::string operation)
 {\
   std::vector<double> times;\
   double total_time = 0;\
-  while(total_time*1e-9 < 1e-2){\
+  while(total_time*1e-9 < 1e-1){\
     std::list<sc::driver::Event> events;\
     OP;\
     queue.synchronize();\
@@ -56,7 +56,7 @@ void bench(sc::numeric_type dtype, std::string operation)
 {\
   std::vector<double> times;\
   double total_time = 0;\
-  while(total_time*1e-9 < 1e-2){\
+  while(total_time*1e-9 < 1e-1){\
     cl_event event;\
     OP;\
     queue.synchronize();\
@@ -72,7 +72,7 @@ void bench(sc::numeric_type dtype, std::string operation)
   Timer tmr;\
   long total_time = 0;\
   std::vector<long> times;\
-  while(total_time*1e-9 < 1e-2){\
+  while(total_time*1e-9 < 1e-1){\
     tmr.start();\
     OP;\
     long time = tmr.get().count();\
@@ -93,7 +93,7 @@ void bench(sc::numeric_type dtype, std::string operation)
   cudaEventCreate(&stop);\
   OP;\
   cudaThreadSynchronize();\
-  while(total_time*1e-3 < 1e-2){\
+  while(total_time*1e-3 < 1e-1){\
     cudaEventRecord(start,0);\
     OP;\
     cudaEventRecord(stop,0);\
@@ -103,7 +103,7 @@ void bench(sc::numeric_type dtype, std::string operation)
     total_time+=time;\
   }\
   double t = mean(times);\
-  std::cout << "\t" << (int)(PERF) << std::flush;\
+  std::cout << " " << (int)(PERF) << std::flush;\
 }
   unsigned int dtsize = sc::size_of(dtype);
@@ -111,16 +111,17 @@ void bench(sc::numeric_type dtype, std::string operation)
   std::map<std::string, std::string> metric{ {"axpy", "GB/s"}, {"dot", "GB/s"}, {"gemv", "GB/s"}, {"gemm", "GFLOPS"}};
   sc::array flush((int)1e6, sc::FLOAT_TYPE);
   std::cout << "#" << operation << " (" << metric[operation] << ")" << std::endl;
-  std::cout << "N";
-  std::cout << "\tISAAC";
+  std::cout << "\"N\"";
+  std::cout << " \"ISAAC (Pred impl.)\"";
+  std::cout << " \"ISAAC (Best impl.)\"";
 #ifdef BENCH_CLBLAS
-  std::cout << "\tclBLAS";
+  std::cout << " \"clBLAS\"";
 #endif
 #ifdef BENCH_CBLAS
-  std::cout << "\tBLAS";
+  std::cout << " \"BLAS\"";
 #endif
 #ifdef BENCH_CUBLAS
-  std::cout << "\tcuBLAS";
+  std::cout << " \"cuBLAS\"";
 #endif
   std::cout << std::endl;
   //
@@ -194,21 +195,23 @@ void bench(sc::numeric_type dtype, std::string operation)
   if(operation.substr(0, 4)=="gemv")
   {
     std::vector<std::tuple<char, int_t, int_t> > MNs;
-    MNs.push_back(std::make_tuple('N',896,896));
-    MNs.push_back(std::make_tuple('N',3072,3072));
-    //AlexNet
-    MNs.push_back(std::make_tuple('N',1000,256));
-    MNs.push_back(std::make_tuple('N',4096,256));
+    //Linear System
+    MNs.push_back(std::make_tuple('N',153,153));
+    MNs.push_back(std::make_tuple('N',1024,1024));
+    MNs.push_back(std::make_tuple('N',2867,2867));
 
-    MNs.push_back(std::make_tuple('T',169,256));
-    MNs.push_back(std::make_tuple('T',169,384));
-    MNs.push_back(std::make_tuple('T',729,256));
-    MNs.push_back(std::make_tuple('T',3025,96));
+    //Normalization
+    MNs.push_back(std::make_tuple('N', 32, 60000));
+    MNs.push_back(std::make_tuple('N', 256, 60000));
+
+    //Householder
+    MNs.push_back(std::make_tuple('N', 100, 60000));
+    MNs.push_back(std::make_tuple('N', 90, 60000));
+    MNs.push_back(std::make_tuple('N', 50, 60000));
 
     /*---------*/
     /*--BLAS2--*/
     /*---------*/
-    //T-layout
     for(std::tuple<char, int_t, int_t> MN: MNs)
     {
       bool AT = std::get<0>(MN) == 'T';
@@ -224,6 +227,7 @@ void bench(sc::numeric_type dtype, std::string operation)
       int_t lda = A.ld();
 #endif
       BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events)),(M*N + M + N)*dtsize/t);
+      BENCHMARK_ISAAC(y = sc::control(AT?dot(A.T(),x):dot(A,x), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)),(M*N + M + N)*dtsize/t);
 #ifdef BENCH_CLBLAS
       if(y.context().backend()==sc::driver::OPENCL)
         BENCHMARK_CLBLAS(clblasSgemv(clblasColumnMajor, AT?clblasTrans:clblasNoTrans, As1, As2, 1, CL_HANDLE(A.data()), 0, lda, CL_HANDLE(x.data()), 0, 1, 0, CL_HANDLE(y.data()), 0, 1, 1, &CL_HANDLE(queue),0, NULL, &event), (M*N + M + N)*dtsize/t)
@@ -274,14 +278,14 @@ void bench(sc::numeric_type dtype, std::string operation)
 //    MNKs.push_back(std::make_tuple("Convolution Gradient-2 [AlexNet-2]",'N','T',729,1200,128));
 //    MNKs.push_back(std::make_tuple("Conv. Gradient-2 [LeNet-2]",'N','T',64,500,50));
 
-    //Covariance (e.g., ICA, 10minutes/1khz)
-    MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,600000));
-    MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,600000));
+    //Covariance (e.g., ICA, 10minutes/100Hz)
+    MNKs.push_back(std::make_tuple("ICA [32 channels]",'N','T',32,32,60000));
+    MNKs.push_back(std::make_tuple("ICA [256 channels]",'N','T',256,256,60000));
 //
     //Bi-diagonalization
-    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 1]",'N','T',4096,4096,32));
-    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 10]",'N','T',3456,3456,32));
-    MNKs.push_back(std::make_tuple("Bidiagonalization [Iteration 50]",'N','T',896,896,32));
+    MNKs.push_back(std::make_tuple("Householder [Iteration 1]",'N','T',4096,4096,32));
+    MNKs.push_back(std::make_tuple("Householder [Iteration 10]",'N','T',3456,3456,32));
+    MNKs.push_back(std::make_tuple("Householder [Iteration 50]",'N','T',896,896,32));
 
     /*---------*/
     /*--BLAS3--*/
@@ -305,6 +309,7 @@ void bench(sc::numeric_type dtype, std::string operation)
 #ifdef HAS_A_BLAS
       int_t lda = A.ld(), ldb = B.ld(), ldc = C.ld();
 #endif
+      BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(false)), (double)2*M*N*K/t);
       BENCHMARK_ISAAC(C = sc::control(AT?(BT?dot(A.T(),B.T()):dot(A.T(),B)):(BT?dot(A,B.T()):dot(A,B)), sc::execution_options_type(0, &events), sc::dispatcher_options_type(true)), (double)2*M*N*K/t);
       /* clblas */
 #ifdef BENCH_CLBLAS
diff --git a/lib/kernels/templates/gemv.cpp b/lib/kernels/templates/gemv.cpp
index 9930725ed..ee85d6b74 100644
--- a/lib/kernels/templates/gemv.cpp
+++ b/lib/kernels/templates/gemv.cpp
@@ -105,7 +105,7 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
 
     for (const auto & e : dots){
       std::string data_type = append_width("#scalartype",col_simd_width);
-      stream << e->process(data_type + " #name_acc = " + neutral_element((e)->root_op(), backend, "#scalartype") + ";") << std::endl;
+      stream << e->process(data_type + " #name_acc = " + InitPrefix(backend, data_type).get() + "(" + neutral_element((e)->root_op(), backend, "#scalartype") + ");") << std::endl;
     }
 
     stream << "if (r < M)" << std::endl;
@@ -122,13 +122,13 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
         if(dot_type_==REDUCE_COLUMNS)
         {
           std::string data_type = append_width("#scalartype",row_simd_width);
-          accessors["array2"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "c*#stride", "#pointer + r*#ld", backend)+";";
-          accessors["repeat"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ", backend)+";";
+          accessors["array2"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "c*#stride", "#pointer + r*#ld", backend,false)+";";
+          accessors["repeat"] = data_type + " #namereg = " + vload(row_simd_width, "#scalartype", "(c%#tuplearg0)*#stride", "#pointer + (r%#tuplearg1)*#stride ", backend,false)+";";
         }
         else
         {
           std::string data_type = append_width("#scalartype",col_simd_width);
-          accessors["array2"] = data_type + " #namereg = " + vload(col_simd_width, "#scalartype", "0", "#pointer + r*#stride + c*#ld", backend) + ";";
+          accessors["array2"] = data_type + " #namereg = " + vload(col_simd_width, "#scalartype", "0", "#pointer + r*#stride + c*#ld", backend,false) + ";";
           accessors["repeat"] = "#scalartype #namereg = $VALUE{(r%#tuplearg0)*#stride, (c%#tuplearg1)*#stride};";
         }
         e->process_recursive(stream, PARENT_NODE_TYPE, accessors);
@@ -206,8 +206,8 @@ std::string gemv::generate_impl(std::string const & suffix, expressions_tuple co
         if(col_simd_width > 1)
           stream << "if(M - r > " << col_simd_width << "){" << std::endl;
         if (e->is_index_dot())
-          stream << e->process(vstore(col_simd_width,"uint", "#name_buf_value[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp_value + r + M*" + GroupIdx0(backend).get(),backend)) << ";" << std::endl;
-        stream << e->process(vstore(col_simd_width,"#scalartype", "#name_buf[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp + r + M*" + GroupIdx0(backend).get(),backend)) << ";" << std::endl;
+          stream << e->process(vstore(col_simd_width,"uint", "#name_buf_value[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp_value + r + M*" + GroupIdx0(backend).get(),backend, false)) << ";" << std::endl;
+        stream << e->process(vstore(col_simd_width,"#scalartype", "#name_buf[lidy*" + local_size_0_ld_str + "]", "0", "#name_temp + r + M*" + GroupIdx0(backend).get(),backend, false)) << ";" << std::endl;
         if(col_simd_width > 1)
         {
           stream << "}" << std::endl;
diff --git a/lib/kernels/templates/tools/vector_types.hpp b/lib/kernels/templates/tools/vector_types.hpp
index bb01273b7..b79d3ee46 100644
--- a/lib/kernels/templates/tools/vector_types.hpp
+++ b/lib/kernels/templates/tools/vector_types.hpp
@@ -39,7 +39,7 @@ inline std::string append_width(std::string const & str, unsigned int width)
 }
 
 
-inline std::string vstore(unsigned int simd_width, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, driver::backend_type backend)
+inline std::string vstore(unsigned int simd_width, std::string const & dtype, std::string const & value, std::string const & offset, std::string const & ptr, driver::backend_type backend, bool aligned = true)
 {
   std::string vdtype = append_width(dtype,simd_width);
   if (simd_width==1)
@@ -49,7 +49,15 @@ inline std::string vstore(unsigned int simd_width, std::string const & dtype, st
   switch(backend)
   {
     case driver::CUDA:
-      return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
+      if(aligned)
+        return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
+      else
+      {
+        std::string res;
+        for(unsigned int s = 0 ; s < simd_width ; ++s)
+          res += (s>0?";(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + "] = " + access_vector_type(value, s);
+        return res;
+      }
     case driver::OPENCL:
       return append_width("vstore", simd_width) + "(" + value + ", " + offset + ", " + ptr + ")";
     default:
@@ -75,7 +83,7 @@ inline std::string vload(unsigned int simd_width, std::string const & dtype, std
     {
       std::string res = "make_" + vdtype + "(";
       for(unsigned int s = 0 ; s < simd_width ; ++s)
-        res += ((s>0)?",(":"(") + ptr + ")[" + offset + " + " + tools::to_string(s) + "]";
+        res += ((s>0)?",(":"(") + ptr + ")[" + offset + "*" + tools::to_string(simd_width) + " + " + tools::to_string(s) + "]";
       res += ")";
       return res;
     }
diff --git a/tune/android/tune/model.py b/tune/android/tune/model.py
index 47da5b05a..2828fc713 100644
--- a/tune/android/tune/model.py
+++ b/tune/android/tune/model.py
@@ -31,12 +31,12 @@ def train(X, Y, profiles):
     Y = Y[p,:]
 
     #Train the profile
-    cut = int(.9*M)
+    cut = int(.5*M)
     XTr, YTr = X[:cut,:], Y[:cut,:]
     XCv, YCv = X[cut:,:], Y[cut:,:]
 
     nrmses = {}
-    for N in range(1,min(M+1,10)):
+    for N in range(1,min(M+1,20)):
         for depth in range(1,min(M+1,20)):
             clf = RandomForestRegressor(N, max_depth=depth).fit(XTr, YTr)
             t = np.argmax(clf.predict(XCv), axis = 1)
diff --git a/tune/android/tune/tune.py b/tune/android/tune/tune.py
index 2f850da68..c38a2f4fe 100644
--- a/tune/android/tune/tune.py
+++ b/tune/android/tune/tune.py
@@ -93,7 +93,7 @@ class Tuner:
                      (1200,128,729),
                      (363,96,3025)]
         elif level=='full':
-            sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 15))
+            sizes = product(pow2range(5, 12), pow2range(5, 12), pow2range(5, 17))
 
         #Remove duplicates and or too small/big tuples
         sizes = [x for x in sizes if 1e-4 <= tools.memory_footprint(operation, x) <= 2e-1]
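
Note on the benchmarking changes: all four BENCHMARK_* macros keep the same
structure, and only the stopping threshold moves from 1e-2 to 1e-1, so each
measurement now accumulates at least 100 ms of execution time before the mean
is reported (the CUDA macro compares total_time*1e-3 because
cudaEventElapsedTime returns milliseconds). A minimal sketch of that shared
policy, with std::chrono standing in for the backend-specific timers and
bench_mean_ns/op as illustrative names, not part of the patch:

#include <chrono>
#include <numeric>
#include <vector>

// Run op until at least 0.1 s of samples have accumulated, then return the
// mean duration in nanoseconds. Sketch only; the real macros time with
// driver events (ISAAC), cl_event (clBLAS), Timer (CBLAS) or cudaEvent (cuBLAS).
template<class F>
double bench_mean_ns(F op)
{
  std::vector<double> times;
  double total_time = 0; // nanoseconds
  while(total_time*1e-9 < 1e-1)
  {
    auto start = std::chrono::high_resolution_clock::now();
    op();
    double time = std::chrono::duration<double, std::nano>(
                      std::chrono::high_resolution_clock::now() - start).count();
    times.push_back(time);
    total_time += time;
  }
  return std::accumulate(times.begin(), times.end(), 0.) / times.size();
}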
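
Note on the new ISAAC columns: each routine is now benchmarked twice, and the
boolean passed to sc::dispatcher_options_type is what separates the
"ISAAC (Pred impl.)" row (false, the default) from the "ISAAC (Best impl.)"
row (true). That reading is inferred from the new column headers, not stated
in the patch. The call pattern, shown here as a fragment (isaac headers and
the A/x/y array setup are omitted, since they are not part of this diff):

// events collects the driver-level events used to time the operation
std::list<sc::driver::Event> events;

// "ISAAC (Pred impl.)": run the implementation the tuned model predicts
y = sc::control(dot(A, x),
                sc::execution_options_type(0, &events),
                sc::dispatcher_options_type(false));

// "ISAAC (Best impl.)": let the dispatcher select the best implementation
y = sc::control(dot(A, x),
                sc::execution_options_type(0, &events),
                sc::dispatcher_options_type(true));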
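
Note on the aligned flag: on the CUDA backend the aligned path emits one
vectorized access through a reinterpret_cast<float4*>-style pointer, which is
only valid when the address is aligned to the full vector (16 bytes for
float4). The gemv templates access memory at offsets that depend on the
runtime row index r, which cannot be guaranteed aligned, so they now pass
aligned=false and get one scalar access per lane. The vload change
additionally scales the offset by simd_width, so the scalar expansion reads
the same elements as OpenCL's vloadN(offset, p), which loads from
p[offset*N + s]. A standalone reproduction of the patched CUDA vstore branch,
where access_vector_type (not shown in this diff) is assumed to select a lane
as .x/.y/.z/.w:

#include <iostream>
#include <string>

// Assumption: lane s of a CUDA built-in vector, e.g. "acc.y" for s == 1.
std::string access_vector_type(std::string const & value, unsigned int s)
{
  static const char * lane[] = {".x", ".y", ".z", ".w"};
  return value + lane[s];
}

// Mirror of the CUDA branch of vstore in vector_types.hpp after this patch.
std::string vstore_cuda(unsigned int simd_width, std::string const & vdtype,
                        std::string const & value, std::string const & offset,
                        std::string const & ptr, bool aligned)
{
  if(aligned) // one vector-wide store; ptr must be simd_width*sizeof(T)-aligned
    return "reinterpret_cast<" + vdtype + "*>(" + ptr + ")[" + offset + "] = " + value;
  std::string res; // unaligned fallback: simd_width scalar stores, one per lane
  for(unsigned int s = 0 ; s < simd_width ; ++s)
    res += (s>0?";(":"(") + ptr + ")[" + offset + " + " + std::to_string(s) + "] = " + access_vector_type(value, s);
  return res;
}

int main()
{
  std::cout << vstore_cuda(4, "float4", "acc", "0", "out + r", true) << "\n";
  // -> reinterpret_cast<float4*>(out + r)[0] = acc
  std::cout << vstore_cuda(4, "float4", "acc", "0", "out + r", false) << "\n";
  // -> (out + r)[0 + 0] = acc.x;(out + r)[0 + 1] = acc.y; ... per lane
}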