#include "opts.hpp" #include "isaac/scalar.h" #include "isaac/api.h" #include "isaac/driver/cublas.h" #include "isaac/driver/backend.h" #include "isaac/driver/context.h" #include "isaac/driver/stream.h" #include "isaac/runtime/predict.h" #include "isaac/templates/gemm.h" #include "isaac/templates/error.hpp" #include "isaac/tools/bench.hpp" namespace sc = isaac; namespace drv = sc::driver; using sc::param_t; enum Code { RESET = 0, BOLD = 1, ITALIC = 3, FG_RED = 31, FG_GREEN = 32, FG_YELLOW = 33, FG_BLUE = 34, FG_MAGENTA = 35, FG_CYAN = 36, FG_LIGHT_GRAY = 37, FG_DARK_GRAY = 90, FG_LIGHT_RED = 91, FG_LIGHT_GREEN = 92, FG_LIGHT_YELLOW = 93, FG_LIGHT_BLUE = 94, FG_LIGHT_MAGENTA = 95, FG_LIGHT_CYAN = 96, FG_WHITE = 97 }; class color_stream { Code code; public: color_stream(Code pCode) : code(pCode) {} friend std::ostream& operator<<(std::ostream& os, const color_stream& mod) { return os << "\033[" << mod.code << "m"; } }; /* Helpers for benchmarking */ typedef std::tuple gemm_params_t; typedef std::tuple conv_params_t; typedef std::tuple pool_params_t; struct TestBench{ // GEMM static std::vector gemm(sc::DType dtype){ std::vector shapes; // LinPack for(param_t N: std::vector{512, 1024, 2048}) shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N)); // DeepBench for(sc::IsaacOperation_t AT: std::vector{sc::ISAAC_OP_N, sc::ISAAC_OP_T}) for(param_t M: std::vector{1760}) for(param_t N: std::vector{16, 32, 64, 128}) shapes.push_back(std::make_tuple(dtype, AT, sc::ISAAC_OP_N, M, N, M)); // PCA/ICA for(param_t N: std::vector{16, 64, 256}) for(param_t K: std::vector{64000}) shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K)); // LaPACK for(param_t N: std::vector{1024, 2048, 4096}) for(param_t K: std::vector{32}) shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K)); return shapes; } // CONV static std::vector conv(sc::DType dtype){ // Vector of (dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) std::vector shapes; // DeepSpeech for(size_t N: std::vector{8}) shapes.push_back(std::make_tuple(dtype, 1, 700, 161, 1, N, 32, 1, 5, 20, 0, 0, 0, 1, 1, 1)); for(size_t N: std::vector{8}) shapes.push_back(std::make_tuple(dtype, 1, 341, 79, 32, N, 32, 1, 5, 10, 0, 0, 0, 1, 1, 1)); // OCR shapes.push_back(std::make_tuple(dtype, 1, 480, 48, 1, 16, 16, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 240, 24, 16, 16, 32, 1, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 120, 12, 32, 16, 64, 1, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 60, 6, 64, 16, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1)); // Face Recognition shapes.push_back(std::make_tuple(dtype, 1, 108, 108, 3, 8, 64, 1, 3, 3, 0, 1, 1, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 54, 54, 64, 8, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 27, 27, 128, 8, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 128, 8, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 256, 8, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1)); // Vision for(size_t N: std::vector{8}){ shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, N, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 128, N, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 256, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1)); } shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, 16, 64, 1, 7, 7, 0, 3, 3, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 32, 1, 5, 5, 0, 2, 2, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 48, 1, 5, 5, 0, 2, 2, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 192, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 128, 1, 5, 5, 0, 2, 2, 1, 1, 1)); // Speaker ID shapes.push_back(std::make_tuple(dtype, 1, 350, 80, 64, 16, 128, 1, 5, 5, 0, 1, 1, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 175, 40, 128, 16, 256, 1, 5, 5, 0, 1, 1, 1, 2, 2)); // ResNET for(size_t N: std::vector{8}){ shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 64, N, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 128, 1, 1, 1, 0, 0, 0, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 128, N, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 128, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 256, 1, 1, 1, 0, 0, 0, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 256, N, 1024, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 1024, 1, 1, 1, 0, 0, 0, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 0, 0, 1, 2, 2)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 2048, 1, 1, 1, 0, 1, 1, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 1, 1, 1, 2, 2)); } // 3D-Unet shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 24, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 72, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 216, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 648, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 648, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 432, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 216, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 144, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 72, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 48, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 3 , 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1)); return shapes; } // POOL static std::vector pool(sc::DType dtype){ std::vector shapes; // 3D-Unet shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 4, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 4, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 4, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 4, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 4, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 4, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 4, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 4, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 4, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 4, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 4, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 4, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 4, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 4, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 4, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 4, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1)); shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 4, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1)); return shapes; } }; /* Metrics for benchmarking */ struct Metric{ virtual std::function cmp() const = 0; virtual double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const = 0; virtual double gemm(param_t M, param_t N, param_t K, double tsec) const = 0; virtual double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t, param_t, param_t, double tsec) const = 0; }; class FLOPS: public Metric{ public: FLOPS(double scale): scale_(scale){} std::function cmp() const { return std::greater(); } double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const { return sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec) * 1e12 / scale_; } double gemm(param_t M, param_t N, param_t K, double tsec) const { return sc::templates::GEMM::tflops(M, N, K, tsec) * 1e12 / scale_; } double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double tsec) const { return sc::templates::Pool::tflops(P, Q, M, K, N, T, R, S, tsec) * 1e12 / scale_;} private: double scale_; }; class Time: public Metric{ public: Time(double scale): scale_(scale){} std::function cmp() const { return std::less(); } double conv(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; } double gemm(param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; } double pool(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; } private: double scale_; }; void print_results_header(std::vector sections){ std::cout << color_stream(ITALIC) << color_stream(BOLD) ; std::copy(sections.begin(), sections.end(), std::ostream_iterator(std::cout, "\t")); std::cout << color_stream(RESET) << std::endl; } void print_results(std::vector const & times, std::vector const & prefix, std::function cmp, std::function fn){ std::copy(prefix.begin(), prefix.end(), std::ostream_iterator(std::cout, "\t")); std::vector perf; std::transform(times.begin(), times.end(), std::back_inserter(perf), fn); auto fastest = perf; std::sort(fastest.begin(), fastest.end(), cmp); for(auto x: perf){ if(x == fastest[0] && x / fastest[1] > 1.05) std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET); else std::cout << x; std::cout << "\t"; } std::cout << std::endl; } void benchmark_gemm(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream, sc::DType dtype, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, size_t M, size_t N, size_t K, sc::templates::Generator* generator){ size_t ldc = M; size_t lda = (AT==sc::ISAAC_OP_N)?M:K; size_t ldb = (BT==sc::ISAAC_OP_N)?K:N; size_t dtsize = sc::size_of(dtype); sc::scalar alpha(1., dtype); sc::scalar beta(0., dtype); char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N'; char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N'; sc::driver::Buffer C(ctx, M*N*dtsize); sc::driver::Buffer A(ctx, M*K*dtsize); sc::driver::Buffer B(ctx, K*N*dtsize); std::vector times; times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C, (sc::templates::GEMM*)generator); }, [&](){ stream.synchronize(); }, device)); if(sc::driver::dispatch::cublasinit()){ cublasGemmAlgo_t fastest; sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, &fastest); times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, NULL, fastest); }, [&](){ stream.synchronize(); }, device)); //times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device)); } print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, metric.cmp(), [&](double tsec){ return metric.gemm(M, N, K, tsec);}); } void benchmark_conv(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream, sc::DType dtype, size_t D, size_t H, size_t W, size_t C, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w, sc::templates::Generator* generator){ param_t M, P, Q; sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, M, P, Q); size_t dtsize = sc::size_of(dtype); sc::scalar alpha(1., dtype); sc::scalar beta(0., dtype); sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize); sc::driver::Buffer I(ctx, C*D*H*W*N*dtsize); sc::driver::Buffer F(ctx, K*C*T*R*S*dtsize); std::vector times; times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, M, P, Q, C, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O, (sc::templates::Conv*)generator); }, [&](){ stream.synchronize(); }, device)); if(sc::driver::dispatch::cudnninit()) times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, stream, D, H, W, N, K, M, P, Q, C, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(C), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.conv(M, P, Q, K, N, C, T, R, S, tsec);}); } void benchmark_pool(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream, sc::DType dtype, size_t D, size_t H, size_t W, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w, sc::templates::Generator* generator){ param_t M, P, Q; sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, M, P, Q); size_t dtsize = sc::size_of(dtype); sc::scalar alpha(1., dtype); sc::scalar beta(0., dtype); sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize); sc::driver::Buffer I(ctx, K*D*H*W*N*dtsize); std::vector times; times.push_back(bench([&](){ sc::POOL(device, stream, dtype, K, M, P, Q, N, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, I, O, (sc::templates::Pool*)generator); }, [&](){ stream.synchronize(); }, device)); if(sc::driver::dispatch::cudnninit()) times.push_back(bench([&](){ sc::driver::cudnnPool(dtype, stream, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, beta, O); }, [&](){ stream.synchronize(); }, device)); print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.pool(M, P, Q, K, N, T, R, S, tsec);}); } /* ------------------------------- */ void loop_nest(std::vector const & ranges, std::function const &)> const & f){ size_t D = ranges.size(); std::vector values(D, 0); // Start with innermost loop size_t i = D - 1; while(true){ //Execute function f(values); //Increment counters while(values[i]++ == ranges[i] - 1){ if(i == 0) return; values[i--] = 0; } i = D - 1; } } template void loop_nest(std::vector> const & iterates, std::function)> const & f){ //Ranges to iterate over std::vector ranges; for(auto const & x: iterates) ranges.push_back(x.size()); //Proxy function auto proxy = [&](std::vector const & idx){ std::vector x(iterates.size()); for(size_t i = 0; i < x.size(); ++i) x[i] = iterates[i][idx[i]]; f(x); }; //Iterate loop_nest(ranges, proxy); } void search_conv(int32_t W, int32_t H, int32_t C, int32_t N, int32_t K, int32_t R, int32_t S, int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, sc::DType dtype){ auto ctx = drv::backend::contexts::get_default(); size_t dtsize = sc::size_of(dtype); size_t D = 1, T = 1, stride_d = 1, pad_d = 0; size_t P = (H - R + 1 + 2*pad_h + stride_h - 1)/stride_h; size_t Q = (W - S + 1 + 2*pad_w + stride_w - 1)/stride_w; size_t M = (D - T + 1 + 2*pad_d + stride_d - 1)/stride_d; //Setup drv::Buffer O(ctx, K*P*Q*M*N*dtsize); drv::Buffer I(ctx, C*H*W*D*N*dtsize); drv::Buffer F(ctx, C*R*S*T*K*dtsize); drv::Stream stream(ctx); sc::scalar alpha(1., dtype), beta(1., dtype); //Exhaustive search std::vector r1 = {1}; std::vector rv = {4}; std::vector rr = {1, 2, 4, 8}; std::vector rl = {4, 8, 16, 32}; std::vector rs = {4, 8, 16}; double best; loop_nest({rv, rl, rl, rs, rs, rl, rl, r1, rr, rr}, [&](std::vector const & x){ sc::templates::Conv generator(dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]); //Compile try{ std::string src = generator.dump(ctx.device(), "conv"); drv::Module program(ctx, src); drv::Kernel kernel(program, "conv"); double tsec = bench([&](){ generator.enqueue(kernel, stream, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, ctx.device()); double tflops = sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec); best = std::max(tflops, best); std::cout << "//"; std::copy(x.begin(), x.end(), std::ostream_iterator(std::cout, " ")); std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl; }catch(isaac::templates::invalid_parameters const &){ return; }catch(drv::exception::cuda::launch_out_of_resources const &){ return; } }); std::cout << "ISAAC: " << best << std::endl; } void search_gemm(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){ auto ctx = drv::backend::contexts::get_default(); size_t dtsize = sc::size_of(dtype); // Setup size_t ldc = M; size_t lda = (AT==sc::ISAAC_OP_N)?M:K; size_t ldb = (BT==sc::ISAAC_OP_N)?K:N; int32_t offc = 0, offa = 0, offb = 0; drv::Buffer C(ctx, M*N*dtsize); drv::Buffer A(ctx, M*K*dtsize); drv::Buffer B(ctx, K*N*dtsize); drv::Stream stream(ctx); sc::scalar alpha(1., dtype), beta(0., dtype); // Exhaustive search std::vector r1 = {1}; std::vector rv = {4}; std::vector rr = {1, 2, 4, 8}; std::vector rl = {4, 8, 16, 32}; std::vector rs = {4, 8, 16}; double best = 0; loop_nest({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, rr, rr}, [&](std::vector const & x){ isaac::templates::GEMM generator(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]); // Compile try{ std::string src = generator.dump(ctx.device(), "gemm"); drv::Module program(ctx, src); drv::Kernel kernel(program, "gemm"); double time = bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, ctx.device()); double tflops = 2*1e-3*M*N*K/time; best = std::max(tflops, best); std::cout << "//"; std::copy(x.begin(), x.end(), std::ostream_iterator(std::cout, " ")); std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl; }catch(isaac::templates::invalid_parameters const &){ return; }catch(drv::exception::cuda::launch_out_of_resources const &){ return; } }); std::cout << "ISAAC: " << best << std::endl; } /* Helpers for dumping source code */ void dump_source(sc::driver::Device const & device, sc::templates::Generator& generator, opts::Options* options, std::string const & name){ if(options->get("format") == "ptx") std::cout << generator.dump(device, name) << std::endl; else{ auto x = generator.tuning_params(); std::cout << "Tuning parameters: " << std::flush; for(size_t i = 0; i < x.size(); ++i) std::cout << ((i>0)?", ":"") << x[i] << std::flush; std::cout << std::endl; } } /* Application code */ int main(int argc, char* argv[]){ opts::Application program("isaac-tools", "Command-line interface for ISAAC"); // Options opts::Options* options = program.options(); options->add("device", "Device to run on", 0); options->add("dtype", "Data-type to use for computations", "float32", {{"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}}); options->add("name", "Name to give to the generated kernel", "kernel"); options->add_group("search", "Exhaustively search for best tuning parameters"); opts::Options* dump = options->add_group("dump", "Dump source-code generated by ISAAC"); dump->add("format", "Format to generate", "ptx", {"ptx", "params"}); dump->add("target", "Target GPU (sm_xx)", {"sm_50", "sm_52", "sm_60", "sm_61", "sm_70"}); opts::Options* bench = options->add_group("bench", "Benchmark source code generated by ISAAC"); bench->add("suite", "Benchmarking suite to run", "custom", {"custom", "deepbench"}); bench->add>("metric", "performance metric for the results", "tflops", {{"tflops", std::make_shared(1e12)}, {"ms", std::make_shared