216 lines
9.6 KiB
C++
216 lines
9.6 KiB
C++
#include <tuple>
|
|
#include "isaac/driver/backend.h"
|
|
#include "isaac/driver/cublas.h"
|
|
#include "isaac/driver/context.h"
|
|
#include "isaac/driver/buffer.h"
|
|
#include "isaac/driver/stream.h"
|
|
#include "isaac/tools/bench.hpp"
|
|
#include "isaac/api.h"
|
|
|
|
namespace sc = isaac;
|
|
namespace drv = sc::driver;
|
|
using sc::param_t;
|
|
using std::make_tuple;
|
|
|
|
double geometric_mean(std::vector<double> const&data){
|
|
double logsum = std::accumulate(data.begin(), data.end(),
|
|
(double)0, [](double acc, double x){ return acc + std::log(x);});
|
|
return std::exp(logsum/data.size());
|
|
}
|
|
|
|
void handle_misusage(){
|
|
std::cerr << "Usage : blas-bench [--dtype {float16, float32, float64}]" << std::endl;
|
|
std::cerr << "--dtype: data-type to benchmark (default = float32)" << std::endl;
|
|
std::cerr << "--help: display this message" << std::endl;
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
std::string getopt(std::vector<std::string> const & args,
|
|
std::string const & key,
|
|
std::vector<std::string> const & set = {},
|
|
std::string dft = "")
|
|
{
|
|
auto it = std::find(args.begin(), args.end(), key);
|
|
if(it==args.end()){
|
|
if(dft.empty())
|
|
handle_misusage();
|
|
return dft;
|
|
}
|
|
auto next = it + 1;
|
|
if(next==args.end() || next->compare(0, 2, "--")==0)
|
|
handle_misusage();
|
|
if(set.size() && std::find(set.begin(), set.end(), *next)==set.end())
|
|
handle_misusage();
|
|
return *next;
|
|
}
|
|
|
|
void print_results_header(std::vector<std::string> sections){
|
|
std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
|
|
std::copy(sections.begin(), sections.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
|
|
std::cout << "ISAAC\tcuDNN";
|
|
std::cout << color_stream(RESET) << std::endl;
|
|
}
|
|
|
|
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<double(double)> fn){
|
|
std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
|
|
std::vector<double> perf;
|
|
std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
|
|
auto fastest = perf;
|
|
std::sort(fastest.begin(), fastest.end(), std::greater<double>());
|
|
for(auto x: perf){
|
|
if(x/fastest[1] >= 1.05)
|
|
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
|
|
else
|
|
std::cout << x;
|
|
std::cout << "\t";
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
|
|
int main(int argc, char* argv[])
|
|
{
|
|
std::vector<std::string> args(argv, argv + argc);
|
|
std::cout << std::fixed << std::setprecision(2);
|
|
|
|
//Get dtype
|
|
static std::map<std::string, sc::DType> sc_dtype = {{"float16", sc::HALF_TYPE}, {"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}};
|
|
sc::DType dtype = sc_dtype[getopt(args, "--dtype", {"float16", "float32", "float64"}, "float32")];
|
|
int32_t dtsize = sc::size_of(dtype);
|
|
|
|
//Get device
|
|
auto ctx = drv::backend::contexts::get_default();
|
|
drv::Device const & device = drv::backend::contexts::get_default().device();
|
|
drv::Stream stream(ctx);
|
|
|
|
//Benchmark convolution
|
|
{
|
|
typedef std::tuple<param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_tuple;
|
|
std::vector<conv_tuple> shapes;
|
|
//Cluster 1
|
|
for(size_t N: std::vector<size_t>{4, 8, 16, 32})
|
|
shapes.push_back(std::make_tuple(700, 161, 1, N, 32, 5, 20, 0, 0, 2, 2));
|
|
//Cluster 2
|
|
for(size_t N: std::vector<size_t>{4, 8, 16, 32})
|
|
shapes.push_back(std::make_tuple(341, 79, 32, N, 32, 5, 10, 0, 0, 2, 2));
|
|
//Cluster 3
|
|
shapes.push_back(std::make_tuple(480, 48, 1, 16, 16, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(240, 24, 16, 16, 32, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(120, 12, 32, 16, 64, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(60, 6, 64, 16, 128, 3, 3, 1, 1, 1, 1));
|
|
//Cluster 4
|
|
shapes.push_back(std::make_tuple(108, 108, 3, 8, 64, 3, 3, 1, 1, 2, 2));
|
|
shapes.push_back(std::make_tuple(54, 54, 64, 8, 64, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(27, 27, 128, 8, 128, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(14, 14, 128, 8, 256, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(7, 7, 256, 8, 512, 3, 3, 1, 1, 1, 1));
|
|
//Cluster 5-6
|
|
for(size_t N: std::vector<size_t>{8, 16}){
|
|
shapes.push_back(std::make_tuple(224, 224, 3, N, 64, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(112, 112, 64, N, 128, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(56, 56, 128, N, 256, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(28, 28, 256, N, 512, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(14, 14, 512, N, 512, 3, 3, 1, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(7, 7, 512, N, 512, 3, 3, 1, 1, 1, 1));
|
|
}
|
|
//Cluster 7
|
|
shapes.push_back(std::make_tuple(224, 224, 3, 16, 64, 7, 7, 3, 3, 2, 2));
|
|
shapes.push_back(std::make_tuple(28, 28, 192, 16, 32, 5, 5, 2, 2, 1, 1));
|
|
shapes.push_back(std::make_tuple(28, 28, 192, 16, 64, 1, 1, 0, 0, 1, 1));
|
|
shapes.push_back(std::make_tuple(14, 14, 512, 16, 48, 5, 5, 2, 2, 1, 1));
|
|
shapes.push_back(std::make_tuple(14, 14, 512, 16, 192, 1, 1, 0, 0, 1, 1));
|
|
shapes.push_back(std::make_tuple(7, 7, 832, 16, 256, 1, 1, 0, 0, 1, 1));
|
|
shapes.push_back(std::make_tuple(7, 7, 832, 16, 128, 5, 5, 2, 2, 1, 1));
|
|
|
|
param_t W, H, P, Q, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w;
|
|
std::cout << "======================================================================" << std::endl;
|
|
std::cout << "FCONV" << std::endl;
|
|
std::cout << "======================================================================" << std::endl;
|
|
print_results_header({"N", "K", "P", "Q", "C", "R", "S"});
|
|
std::vector<double> speedup;
|
|
for(auto shape: shapes){
|
|
std::tie(W, H, C, N, K, R, S, pad_h, pad_w, stride_h, stride_w) = shape;
|
|
P = (H - R + 1 + 2*pad_h)/stride_h;
|
|
Q = (W - S + 1 + 2*pad_w)/stride_w;
|
|
|
|
sc::scalar alpha(1., dtype);
|
|
sc::scalar beta(0., dtype);
|
|
|
|
drv::Buffer O(ctx, N*K*P*Q*dtsize);
|
|
drv::Buffer I(ctx, C*H*W*N*dtsize);
|
|
drv::Buffer F(ctx, K*C*R*S*dtsize);
|
|
|
|
std::vector<double> times;
|
|
// times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
|
|
times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
|
|
speedup.push_back(times[1]/times[0]);
|
|
print_results(times, {str(N), str(K), str(P), str(Q), str(C), str(R), str(S)}, [&](double tsec){ return sc::templates::Conv::tflops(P,Q,K,N,C,R,S,tsec);});
|
|
}
|
|
std::cout << "======================================================================" << std::endl;
|
|
std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
//Benchmark GEMM
|
|
{
|
|
typedef std::tuple<sc::IsaacOperation_t, sc::IsaacOperation_t, param_t, param_t, param_t> gemm_tuple;
|
|
std::vector<gemm_tuple> shapes;
|
|
|
|
// LinPack
|
|
for(param_t N: std::vector<param_t>{512, 1024, 2048})
|
|
shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
|
|
|
|
// DeepBench [Forward]
|
|
for(param_t M: std::vector<param_t>{1760})
|
|
for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
|
|
shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_N, M, N, M));
|
|
|
|
// DeepBench [Backward]
|
|
for(param_t M: std::vector<param_t>{1760})
|
|
for(param_t N: std::vector<param_t>{8, 16, 32, 64, 128})
|
|
shapes.push_back(std::make_tuple(sc::ISAAC_OP_T, sc::ISAAC_OP_N, M, N, M));
|
|
|
|
// PCA/ICA
|
|
for(param_t N: std::vector<param_t>{16, 64, 256})
|
|
for(param_t K: std::vector<param_t>{64000})
|
|
shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
|
|
|
|
// LaPACK
|
|
for(param_t N: std::vector<param_t>{1024, 2048, 4096})
|
|
for(param_t K: std::vector<param_t>{32})
|
|
shapes.push_back(std::make_tuple(sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
|
|
|
|
sc::IsaacOperation_t AT, BT;
|
|
param_t M, N, K;
|
|
std::cout << "======================================================================" << std::endl;
|
|
std::cout << "GEMM:" << std::endl;
|
|
std::cout << "======================================================================" << std::endl;
|
|
print_results_header({"AT", "BT", "M", "N", "K"});
|
|
std::vector<double> speedup;
|
|
for(auto shape: shapes){
|
|
std::tie(AT, BT, M, N, K) = shape;
|
|
sc::scalar alpha(1., dtype);
|
|
sc::scalar beta(0., dtype);
|
|
|
|
size_t ldc = M;
|
|
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
|
|
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
|
|
|
|
char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
|
|
char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
|
|
|
|
drv::Buffer C(ctx, M*N*dtsize);
|
|
drv::Buffer A(ctx, M*K*dtsize);
|
|
drv::Buffer B(ctx, K*N*dtsize);
|
|
|
|
std::vector<double> times;
|
|
times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device));
|
|
times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
|
|
speedup.push_back(times[1]/times[0]);
|
|
print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, [&](double tsec){ return sc::templates::GEMM::tflops(M, N, K, tsec);});
|
|
}
|
|
std::cout << "======================================================================" << std::endl;
|
|
std::cout << "Speedup: " << geometric_mean(speedup) << std::endl;
|
|
}
|
|
}
|