#include #include "isaac/driver/backend.h" #include "isaac/driver/cublas.h" #include "isaac/driver/context.h" #include "isaac/driver/buffer.h" #include "isaac/driver/stream.h" #include "isaac/tools/bench.hpp" #include "isaac/api.h" namespace sc = isaac; namespace drv = sc::driver; using sc::param_t; using std::make_tuple; double geometric_mean(std::vector const&data){ double logsum = std::accumulate(data.begin(), data.end(), (double)0, [](double acc, double x){ return acc + std::log(x);}); return std::exp(logsum/data.size()); } void print_results_header(std::vector sections){ std::cout << color_stream(ITALIC) << color_stream(BOLD) ; std::copy(sections.begin(), sections.end(), std::ostream_iterator(std::cout, "\t")); std::cout << "ISAAC\tcuDNN"; std::cout << color_stream(RESET) << std::endl; } void print_results(std::vector const & times, std::vector const & prefix, std::function fn){ std::copy(prefix.begin(), prefix.end(), std::ostream_iterator(std::cout, "\t")); std::vector perf; std::transform(times.begin(), times.end(), std::back_inserter(perf), fn); auto fastest = perf; std::sort(fastest.begin(), fastest.end(), std::greater()); for(auto x: perf){ if(x/fastest[1] >= 1.05) std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET); else std::cout << x; std::cout << "\t"; } std::cout << std::endl; } int main(){ std::cout << std::fixed << std::setprecision(2); auto ctx = drv::backend::contexts::get_default(); drv::Stream stream(ctx); sc::DType dtype = sc::FLOAT_TYPE; int32_t dtsize = sc::size_of(dtype); drv::Device const & device = drv::backend::contexts::get_default().device(); { typedef std::tuple conv_tuple; std::vector shapes; //Cluster 1 for(size_t N: std::vector{4, 8, 16, 32}) shapes.push_back(std::make_tuple(700, 161, 1, N, 32, 5, 20, 0, 0, 2, 2)); //Cluster 2 for(size_t N: std::vector{4, 8, 16, 32}) shapes.push_back(std::make_tuple(341, 79, 32, N, 32, 5, 10, 0, 0, 2, 2)); //Cluster 3 shapes.push_back(std::make_tuple(480, 48, 1, 16, 16, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(240, 24, 16, 16, 32, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(120, 12, 32, 16, 64, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(60, 6, 64, 16, 128, 3, 3, 1, 1, 1, 1)); //Cluster 4 shapes.push_back(std::make_tuple(108, 108, 3, 8, 64, 3, 3, 1, 1, 2, 2)); shapes.push_back(std::make_tuple(54, 54, 64, 8, 64, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(27, 27, 128, 8, 128, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(14, 14, 128, 8, 256, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(7, 7, 256, 8, 512, 3, 3, 1, 1, 1, 1)); //Cluster 5-6 for(size_t N: std::vector{8, 16}){ shapes.push_back(std::make_tuple(224, 224, 3, N, 64, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(112, 112, 64, N, 128, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(56, 56, 128, N, 256, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(28, 28, 256, N, 512, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(14, 14, 512, N, 512, 3, 3, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(7, 7, 512, N, 512, 3, 3, 1, 1, 1, 1)); } //Cluster 7 shapes.push_back(std::make_tuple(224, 224, 3, 16, 64, 7, 7, 3, 3, 2, 2)); shapes.push_back(std::make_tuple(28, 28, 192, 16, 32, 5, 5, 2, 2, 1, 1)); shapes.push_back(std::make_tuple(28, 28, 192, 16, 64, 1, 1, 0, 0, 1, 1)); shapes.push_back(std::make_tuple(14, 14, 512, 16, 48, 5, 5, 2, 2, 1, 1)); shapes.push_back(std::make_tuple(14, 14, 512, 16, 192, 1, 1, 0, 0, 1, 1)); shapes.push_back(std::make_tuple(7, 7, 832, 16, 256, 1, 1, 0, 0, 1, 1)); shapes.push_back(std::make_tuple(7, 7, 832, 16, 128, 5, 5, 2, 2, 1, 1)); param_t W, H, P, Q, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w; std::cout << "======================================================================" << std::endl; std::cout << "FCONV" << std::endl; std::cout << "======================================================================" << std::endl; print_results_header({"N", "K", "P", "Q", "C", "R", "S"}); std::vector speedup; for(auto shape: shapes){ std::tie(W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w) = shape; P = (H - R + 1 + 2*pad_h)/stride_h; Q = (W - S + 1 + 2*pad_w)/stride_w; sc::scalar alpha(1., dtype); sc::scalar beta(0., dtype); drv::Buffer O(ctx, N*K*P*Q*dtsize); drv::Buffer I(ctx, C*H*W*N*dtsize); drv::Buffer F(ctx, K*C*R*S*dtsize); std::vector times; times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); speedup.push_back(times[1]/times[0]); print_results(times, {str(N), str(K), str(P), str(Q), str(C), str(R), str(S)}, [&](double tsec){ return sc::templates::Conv::tflops(P,Q,K,N,C,R,S,tsec);}); } std::cout << "======================================================================" << std::endl; std::cout << "Speedup: " << geometric_mean(speedup) << std::endl; std::cout << std::endl; } //GEMM { typedef std::tuple gemm_tuple; std::vector shapes; // LinPack for(param_t N: std::vector{512, 1024, 2048}) shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N)); // DeepBench [Forward] for(param_t M: std::vector{1760}) for(param_t N: std::vector{8, 16, 32, 64, 128}) shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_N, M, N, M)); // DeepBench [Backward] for(param_t M: std::vector{1760}) for(param_t N: std::vector{8, 16, 32, 64, 128}) shapes.push_back(std::make_tuple(sc::ISAAC_OP_T, sc::ISAAC_OP_N, M, N, M)); // PCA/ICA for(param_t N: std::vector{16, 64, 256}) for(param_t K: std::vector{64000}) shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K)); // LaPACK for(param_t N: std::vector{1024, 2048, 4096}) for(param_t K: std::vector{32}) shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K)); sc::IsaacOperation_t AT, BT; param_t M, N, K; std::cout << "======================================================================" << std::endl; std::cout << "GEMM:" << std::endl; std::cout << "======================================================================" << std::endl; print_results_header({"AT", "BT", "M", "N", "K"}); std::vector speedup; for(auto shape: shapes){ std::tie(AT, BT, M, N, K) = shape; sc::scalar alpha(1., dtype); sc::scalar beta(0., dtype); size_t ldc = M; size_t lda = (AT==sc::ISAAC_OP_N)?M:K; size_t ldb = (BT==sc::ISAAC_OP_N)?K:N; char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N'; char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N'; drv::Buffer C(ctx, M*N*dtsize); drv::Buffer A(ctx, M*K*dtsize); drv::Buffer B(ctx, K*N*dtsize); std::vector times; times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device)); times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, ctx, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device)); speedup.push_back(times[1]/times[0]); print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, [&](double tsec){ return sc::templates::GEMM::tflops(M, N, K, tsec);}); } std::cout << "======================================================================" << std::endl; std::cout << "Speedup: " << geometric_mean(speedup) << std::endl; } }