650 lines
34 KiB
C++
Executable File
650 lines
34 KiB
C++
Executable File
#include "opts.hpp"
|
|
#include "isaac/scalar.h"
|
|
#include "isaac/api.h"
|
|
#include "isaac/driver/cublas.h"
|
|
#include "isaac/driver/backend.h"
|
|
#include "isaac/driver/context.h"
|
|
#include "isaac/driver/stream.h"
|
|
#include "isaac/runtime/predict.h"
|
|
#include "isaac/templates/gemm.h"
|
|
#include "isaac/templates/error.hpp"
|
|
#include "isaac/tools/bench.hpp"
|
|
|
|
namespace sc = isaac;
|
|
namespace drv = sc::driver;
|
|
using sc::param_t;
|
|
|
|
enum Code {
|
|
RESET = 0,
|
|
BOLD = 1,
|
|
ITALIC = 3,
|
|
FG_RED = 31,
|
|
FG_GREEN = 32,
|
|
FG_YELLOW = 33,
|
|
FG_BLUE = 34,
|
|
FG_MAGENTA = 35,
|
|
FG_CYAN = 36,
|
|
FG_LIGHT_GRAY = 37,
|
|
FG_DARK_GRAY = 90,
|
|
FG_LIGHT_RED = 91,
|
|
FG_LIGHT_GREEN = 92,
|
|
FG_LIGHT_YELLOW = 93,
|
|
FG_LIGHT_BLUE = 94,
|
|
FG_LIGHT_MAGENTA = 95,
|
|
FG_LIGHT_CYAN = 96,
|
|
FG_WHITE = 97
|
|
};
|
|
|
|
class color_stream {
|
|
Code code;
|
|
public:
|
|
color_stream(Code pCode) : code(pCode) {}
|
|
friend std::ostream&
|
|
operator<<(std::ostream& os, const color_stream& mod) {
|
|
return os << "\033[" << mod.code << "m";
|
|
}
|
|
};
|
|
|
|
/* Helpers for benchmarking */
|
|
typedef std::tuple<sc::DType, sc::IsaacOperation_t, sc::IsaacOperation_t, sc::param_t, sc::param_t, sc::param_t> gemm_params_t;
|
|
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_params_t;
|
|
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> pool_params_t;
|
|
|
|
struct SC17{
|
|
// GEMM
|
|
static std::vector<gemm_params_t> gemm(sc::DType dtype){
|
|
std::vector<gemm_params_t> shapes;
|
|
// LinPack
|
|
for(param_t N: std::vector<param_t>{512, 1024, 2048})
|
|
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
|
|
// DeepBench
|
|
for(sc::IsaacOperation_t AT: std::vector<sc::IsaacOperation_t>{sc::ISAAC_OP_N, sc::ISAAC_OP_T})
|
|
for(param_t M: std::vector<param_t>{1760})
|
|
for(param_t N: std::vector<param_t>{16, 32, 64, 128})
|
|
shapes.push_back(std::make_tuple(dtype, AT, sc::ISAAC_OP_N, M, N, M));
|
|
// PCA/ICA
|
|
for(param_t N: std::vector<param_t>{16, 64, 256})
|
|
for(param_t K: std::vector<param_t>{64000})
|
|
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
|
|
// LaPACK
|
|
for(param_t N: std::vector<param_t>{1024, 2048, 4096})
|
|
for(param_t K: std::vector<param_t>{32})
|
|
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
|
|
return shapes;
|
|
}
|
|
|
|
// CONV
|
|
static std::vector<conv_params_t> conv(sc::DType dtype){
|
|
// Vector of (dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)
|
|
std::vector<conv_params_t> shapes;
|
|
// // DeepSpeech
|
|
// for(size_t N: std::vector<size_t>{8})
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 700, 161, 1, N, 32, 1, 5, 20, 0, 0, 0, 1, 1, 1));
|
|
// for(size_t N: std::vector<size_t>{8})
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 341, 79, 32, N, 32, 1, 5, 10, 0, 0, 0, 1, 1, 1));
|
|
|
|
// // OCR
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 480, 48, 1, 16, 16, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 240, 24, 16, 16, 32, 1, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 120, 12, 32, 16, 64, 1, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 60, 6, 64, 16, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
|
|
// // Face Recognition
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 108, 108, 3, 8, 64, 1, 3, 3, 0, 1, 1, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 54, 54, 64, 8, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 27, 27, 128, 8, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 128, 8, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 256, 8, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
|
|
// // Vision
|
|
// for(size_t N: std::vector<size_t>{8}){
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, N, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 128, N, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 256, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// }
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, 16, 64, 1, 7, 7, 0, 3, 3, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 32, 1, 5, 5, 0, 2, 2, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 48, 1, 5, 5, 0, 2, 2, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 192, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 128, 1, 5, 5, 0, 2, 2, 1, 1, 1));
|
|
|
|
// // Speaker ID
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 350, 80, 64, 16, 128, 1, 5, 5, 0, 1, 1, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 175, 40, 128, 16, 256, 1, 5, 5, 0, 1, 1, 1, 2, 2));
|
|
|
|
// // ResNET
|
|
// for(size_t N: std::vector<size_t>{8}){
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 64, N, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 128, 1, 1, 1, 0, 0, 0, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 128, N, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 128, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 256, 1, 1, 1, 0, 0, 0, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 256, N, 1024, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 1024, 1, 1, 1, 0, 0, 0, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 0, 0, 1, 2, 2));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 2048, 1, 1, 1, 0, 1, 1, 1, 1, 1));
|
|
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 1, 1, 1, 2, 2));
|
|
// }
|
|
|
|
// 3D-Unet
|
|
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 24, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 72, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 216, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 648, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 648, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 432, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 216, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 144, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 72, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 48, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 4 , 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
return shapes;
|
|
}
|
|
|
|
// POOL
|
|
static std::vector<pool_params_t> pool(sc::DType dtype){
|
|
std::vector<pool_params_t> shapes;
|
|
|
|
// 3D-Unet
|
|
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
|
|
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
|
|
|
|
return shapes;
|
|
}
|
|
|
|
};
|
|
/* Metrics for benchmarking */
|
|
struct Metric{
|
|
virtual std::function<bool(double, double)> cmp() const = 0;
|
|
virtual double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const = 0;
|
|
virtual double gemm(param_t M, param_t N, param_t K, double tsec) const = 0;
|
|
virtual double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t, param_t, param_t, double tsec) const = 0;
|
|
};
|
|
|
|
class FLOPS: public Metric{
|
|
public:
|
|
FLOPS(double scale): scale_(scale){}
|
|
std::function<bool(double, double)> cmp() const { return std::greater<double>(); }
|
|
double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const
|
|
{ return sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec) * 1e12 / scale_; }
|
|
double gemm(param_t M, param_t N, param_t K, double tsec) const
|
|
{ return sc::templates::GEMM::tflops(M, N, K, tsec) * 1e12 / scale_; }
|
|
double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double tsec) const
|
|
{ return sc::templates::Pool::tflops(P, Q, M, K, N, T, R, S, tsec) * 1e12 / scale_;}
|
|
|
|
private:
|
|
double scale_;
|
|
};
|
|
|
|
class Time: public Metric{
|
|
public:
|
|
Time(double scale): scale_(scale){}
|
|
std::function<bool(double, double)> cmp() const { return std::less<double>(); }
|
|
double conv(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
|
|
double gemm(param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
|
|
double pool(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
|
|
|
|
private:
|
|
double scale_;
|
|
};
|
|
|
|
void print_results_header(std::vector<std::string> sections){
|
|
std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
|
|
std::copy(sections.begin(), sections.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
|
|
std::cout << color_stream(RESET) << std::endl;
|
|
}
|
|
|
|
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<bool(double, double)> cmp, std::function<double(double)> fn){
|
|
std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
|
|
std::vector<double> perf;
|
|
std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
|
|
auto fastest = perf;
|
|
std::sort(fastest.begin(), fastest.end(), cmp);
|
|
|
|
for(auto x: perf){
|
|
if(x == fastest[0] && x / fastest[1] > 1.05)
|
|
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
|
|
else
|
|
std::cout << x;
|
|
std::cout << "\t";
|
|
}
|
|
std::cout << std::endl;
|
|
}
|
|
|
|
void benchmark_gemm(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
|
|
sc::DType dtype, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, size_t M, size_t N, size_t K,
|
|
sc::templates::Generator* generator){
|
|
size_t ldc = M;
|
|
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
|
|
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
|
|
|
|
size_t dtsize = sc::size_of(dtype);
|
|
sc::scalar alpha(1., dtype);
|
|
sc::scalar beta(0., dtype);
|
|
char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
|
|
char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
|
|
|
|
sc::driver::Buffer C(ctx, M*N*dtsize);
|
|
sc::driver::Buffer A(ctx, M*K*dtsize);
|
|
sc::driver::Buffer B(ctx, K*N*dtsize);
|
|
|
|
std::vector<double> times;
|
|
times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C, 1., 1., 1., NULL, (sc::templates::GEMM*)generator, 10); }, [&](){ stream.synchronize(); }, device));
|
|
if(sc::driver::dispatch::cublasinit()){
|
|
cublasGemmAlgo_t fastest;
|
|
sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, &fastest);
|
|
times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, NULL, fastest); }, [&](){ stream.synchronize(); }, device));
|
|
//times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
|
|
}
|
|
print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, metric.cmp(), [&](double tsec){ return metric.gemm(M, N, K, tsec);});
|
|
}
|
|
|
|
void benchmark_conv(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
|
|
sc::DType in_dtype, sc::DType out_dtype, size_t D, size_t H, size_t W, size_t C, size_t N, size_t K, size_t T, size_t R, size_t S,
|
|
size_t pad_d, size_t pad_h, size_t pad_w,
|
|
size_t stride_d, size_t stride_h, size_t stride_w,
|
|
size_t upsample_d, size_t upsample_h, size_t upsample_w,
|
|
sc::templates::Generator* generator){
|
|
|
|
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
|
|
param_t M, P, Q;
|
|
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
|
|
sc::ActivationType activation = sc::Linear;
|
|
|
|
size_t vect_c = (in_dtype==sc::INT8X4_TYPE)?4:1;
|
|
size_t vect_k = (out_dtype==sc::INT8X4_TYPE)?4:1;
|
|
|
|
sc::DType ab_dtype = (out_dtype==sc::INT8X4_TYPE)?sc::FLOAT_TYPE:out_dtype;
|
|
sc::scalar alpha(1., ab_dtype);
|
|
sc::scalar beta(0., ab_dtype);
|
|
|
|
sc::driver::Buffer O(ctx, N*K/vect_k*M*P*Q*sc::size_of(out_dtype));
|
|
sc::driver::Buffer I(ctx, C/vect_c*D*H*W*N*sc::size_of(in_dtype));
|
|
sc::driver::Buffer F(ctx, K*C/vect_c*T*R*S*sc::size_of(in_dtype));
|
|
|
|
std::vector<double> times;
|
|
times.push_back(bench([&](){ sc::CONV(device, stream, in_dtype, out_dtype, N, K, M, P, Q, C, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, I, F, &O, 1, NULL, activation, 0., 1., 1., {1.}, 1., sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, NULL, (sc::templates::Conv*)generator, 10); }, [&](){ stream.synchronize(); }, device));
|
|
// if(sc::driver::dispatch::cudnninit())
|
|
// times.push_back(bench([&](){ sc::driver::cudnnConv(out_dtype, stream, D, H, W, N, K, M, P, Q, C, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
|
|
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(C), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.conv(M, P, Q, K, N, C, T, R, S, tsec);});
|
|
}
|
|
|
|
void benchmark_pool(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
|
|
sc::DType dtype, size_t D, size_t H, size_t W, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w,
|
|
sc::templates::Generator* generator){
|
|
|
|
param_t M, P, Q;
|
|
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
|
|
|
|
size_t dtsize = sc::size_of(dtype);
|
|
sc::scalar alpha(1., dtype);
|
|
sc::scalar beta(0., dtype);
|
|
|
|
sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize);
|
|
sc::driver::Buffer I(ctx, K*D*H*W*N*dtsize);
|
|
|
|
std::vector<double> times;
|
|
times.push_back(bench([&](){ sc::POOL(device, stream, dtype, dtype, sc::MaxPool, K, M, P, Q, N, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, I, O, 1., 1., (sc::templates::Pool*)generator); }, [&](){ stream.synchronize(); }, device));
|
|
if(sc::driver::dispatch::cudnninit())
|
|
times.push_back(bench([&](){ sc::driver::cudnnPool(dtype, stream, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, beta, O); }, [&](){ stream.synchronize(); }, device));
|
|
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.pool(M, P, Q, K, N, T, R, S, tsec);});
|
|
}
|
|
|
|
|
|
/* ------------------------------- */
|
|
|
|
void loop_nest(std::vector<size_t> const & ranges, std::function<void(std::vector<size_t> const &)> const & f){
|
|
size_t D = ranges.size();
|
|
std::vector<size_t> values(D, 0);
|
|
// Start with innermost loop
|
|
size_t i = D - 1;
|
|
while(true){
|
|
//Execute function
|
|
f(values);
|
|
//Increment counters
|
|
while(values[i]++ == ranges[i] - 1){
|
|
if(i == 0)
|
|
return;
|
|
values[i--] = 0;
|
|
}
|
|
i = D - 1;
|
|
}
|
|
}
|
|
|
|
template<class T>
|
|
void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f){
|
|
//Ranges to iterate over
|
|
std::vector<size_t> ranges;
|
|
for(auto const & x: iterates)
|
|
ranges.push_back(x.size());
|
|
//Proxy function
|
|
auto proxy = [&](std::vector<size_t> const & idx){
|
|
std::vector<T> x(iterates.size());
|
|
for(size_t i = 0; i < x.size(); ++i)
|
|
x[i] = iterates[i][idx[i]];
|
|
f(x);
|
|
};
|
|
//Iterate
|
|
loop_nest(ranges, proxy);
|
|
}
|
|
|
|
|
|
void search_conv(int32_t D, int32_t H, int32_t W,
|
|
int32_t C, int32_t N, int32_t K,
|
|
int32_t T, int32_t R, int32_t S,
|
|
int32_t pad_d, int32_t pad_h, int32_t pad_w,
|
|
int32_t stride_d, int32_t stride_h, int32_t stride_w,
|
|
int32_t upsample_d, int32_t upsample_h, int32_t upsample_w,
|
|
sc::ActivationType activation, sc::DType in_dtype, sc::DType out_dtype)
|
|
{
|
|
auto ctx = drv::backend::contexts::get_default();
|
|
|
|
size_t P = (H - R + 1 + 2*pad_h + stride_h - 1)/stride_h;
|
|
size_t Q = (W - S + 1 + 2*pad_w + stride_w - 1)/stride_w;
|
|
size_t M = (D - T + 1 + 2*pad_d + stride_d - 1)/stride_d;
|
|
|
|
size_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
|
|
//Setup
|
|
drv::Buffer O(ctx, K*P*Q*M*N*sc::size_of(out_dtype));
|
|
drv::Buffer I(ctx, C*H*W*D*N*sc::size_of(in_dtype));
|
|
drv::Buffer F(ctx, C*R*S*T*K*sc::size_of(in_dtype));
|
|
drv::Stream stream(ctx);
|
|
|
|
//Exhaustive search
|
|
std::vector<sc::param_t> r1 = {1};
|
|
std::vector<sc::param_t> rv = {4};
|
|
std::vector<sc::param_t> rr = {1, 2, 4, 8};
|
|
std::vector<sc::param_t> rl = {4, 8, 16, 32};
|
|
std::vector<sc::param_t> rs = {4, 8, 16};
|
|
double best;
|
|
loop_nest<sc::param_t>({rv, rl, rl, rs, rs, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
|
|
sc::templates::Conv generator(in_dtype, out_dtype,
|
|
C, D, H, W, N, K, M, P, Q, T, R, S,
|
|
pad_d, pad_h, pad_w,
|
|
stride_d, stride_h, stride_w,
|
|
upsample_d, upsample_h, upsample_w,
|
|
activation, 1,
|
|
sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1,
|
|
x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]);
|
|
//Compile
|
|
try{
|
|
std::string src = generator.dump(ctx.device(), "conv");
|
|
drv::Module program(ctx, src);
|
|
drv::Kernel kernel(program, "conv");
|
|
double tsec = bench([&](){ generator.enqueue(kernel, stream, I, F, &O); }, [&](){ stream.synchronize(); }, ctx.device());
|
|
double tflops = sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec);
|
|
best = std::max(tflops, best);
|
|
std::cout << "//";
|
|
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
|
|
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
|
|
}catch(isaac::templates::invalid_parameters const &){
|
|
return;
|
|
}catch(drv::exception::cuda::launch_out_of_resources const &){
|
|
return;
|
|
}
|
|
});
|
|
std::cout << "ISAAC: " << best << std::endl;
|
|
}
|
|
|
|
void search_gemm(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
|
|
auto ctx = drv::backend::contexts::get_default();
|
|
size_t dtsize = sc::size_of(dtype);
|
|
|
|
// Setup
|
|
size_t ldc = M;
|
|
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
|
|
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
|
|
int32_t offc = 0, offa = 0, offb = 0;
|
|
drv::Buffer C(ctx, M*N*dtsize);
|
|
drv::Buffer A(ctx, M*K*dtsize);
|
|
drv::Buffer B(ctx, K*N*dtsize);
|
|
drv::Stream stream(ctx);
|
|
sc::scalar alpha(1., dtype), beta(0., dtype);
|
|
|
|
// Exhaustive search
|
|
std::vector<sc::param_t> r1 = {1};
|
|
std::vector<sc::param_t> rv = {4};
|
|
std::vector<sc::param_t> rr = {1, 2, 4, 8};
|
|
std::vector<sc::param_t> rl = {4, 8, 16, 32};
|
|
std::vector<sc::param_t> rs = {4, 8, 16};
|
|
double best = 0;
|
|
|
|
loop_nest<sc::param_t>({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
|
|
isaac::templates::GEMM generator(dtype, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
|
|
// Compile
|
|
try{
|
|
std::string src = generator.dump(ctx.device(), "gemm");
|
|
drv::Module program(ctx, src);
|
|
drv::Kernel kernel(program, "gemm");
|
|
double time = bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, ctx.device());
|
|
double tflops = 2*1e-3*M*N*K/time;
|
|
best = std::max(tflops, best);
|
|
std::cout << "//";
|
|
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
|
|
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
|
|
}catch(isaac::templates::invalid_parameters const &){
|
|
return;
|
|
}catch(drv::exception::cuda::launch_out_of_resources const &){
|
|
return;
|
|
}
|
|
});
|
|
std::cout << "ISAAC: " << best << std::endl;
|
|
}
|
|
|
|
/* Helpers for dumping source code */
|
|
void dump_source(sc::driver::Device const & device, sc::templates::Generator& generator, opts::Options* options, std::string const & name){
|
|
if(options->get<std::string>("format") == "ptx")
|
|
std::cout << generator.dump(device, name) << std::endl;
|
|
else{
|
|
auto x = generator.tuning_params();
|
|
std::cout << "Tuning parameters: " << std::flush;
|
|
for(size_t i = 0; i < x.size(); ++i)
|
|
std::cout << ((i>0)?", ":"") << x[i] << std::flush;
|
|
std::cout << std::endl;
|
|
}
|
|
}
|
|
|
|
/* Application code */
|
|
int main(int argc, char* argv[]){
|
|
opts::Application program("isaac-tools", "Command-line interface for ISAAC");
|
|
// Options
|
|
opts::Options* options = program.options();
|
|
options->add<size_t>("device", "Device to run on", 0);
|
|
options->add<sc::DType>("dtype", "Data-type to use for computations", "float32", {{"int8x4", sc::INT8X4_TYPE}, {"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}});
|
|
options->add<std::string>("name", "Name to give to the generated kernel", "kernel");
|
|
options->add_group("search", "Exhaustively search for best tuning parameters");
|
|
opts::Options* dump = options->add_group("dump", "Dump source-code generated by ISAAC");
|
|
dump->add("format", "Format to generate", "ptx", {"ptx", "params"});
|
|
dump->add("target", "Target GPU (sm_xx)", {"sm_50", "sm_52", "sm_60", "sm_61", "sm_70"});
|
|
opts::Options* bench = options->add_group("bench", "Benchmark source code generated by ISAAC");
|
|
bench->add("suite", "Benchmarking suite to run", "custom", {"custom", "deepbench"});
|
|
bench->add<std::shared_ptr<Metric>>("metric", "performance metric for the results", "tflops", {{"tflops", std::make_shared<FLOPS>(1e12)}, {"ms", std::make_shared<Time>(1e-3)}, {"us", std::make_shared<Time>(1e-6)}});
|
|
// Constraints
|
|
options->add_constraint(opts::OneOf({"bench", "dump", "search"}));
|
|
options->add_constraint(opts::OneOf({"gemm", "conv", "pool"}));
|
|
// GEMM
|
|
opts::Options* gemm = options->add_group("gemm", "Use matrix-multiplication");
|
|
gemm->add("layout", "Transposition layout for A and B", "NT", {"NN", "NT", "TN", "TT"});
|
|
gemm->add<std::vector<size_t>>("shape", "Matrix shapes (M,N,K)", {2048, 2048, 2048}, opts::SizeConstraint(3));
|
|
gemm->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(14));
|
|
// CONV
|
|
opts::Options* conv = options->add_group("conv", "Use convolutions");
|
|
conv->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 512, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(15));
|
|
conv->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(9));
|
|
// POOL
|
|
opts::Options* pool = options->add_group("pool", "Use pooling");
|
|
pool->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(14));
|
|
pool->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(4));
|
|
program.parse(argc, argv);
|
|
|
|
if(options->has("bench"))
|
|
std::cout << std::fixed << std::setprecision(2);
|
|
//Device
|
|
sc::driver::Device device = sc::driver::backend::devices()[options->get<size_t>("device")];
|
|
if(options->has("dump") && dump->has("target")){
|
|
std::string target = dump->get<std::string>("target");
|
|
char major = target[3];
|
|
char minor = target[4];
|
|
device.interpret_as(std::make_pair((size_t)std::atoi(&major), (size_t)std::atoi(&minor)));
|
|
}
|
|
static sc::driver::Context context(device);
|
|
sc::driver::Stream stream(context);
|
|
// Data-Type
|
|
sc::DType dtype = options->get<sc::DType>("dtype");
|
|
// Kernel name
|
|
std::string name = options->get<std::string>("name");
|
|
|
|
/* Get optimized kernel generator */
|
|
std::unique_ptr<sc::templates::Generator> generator;
|
|
|
|
// GEMM
|
|
if(options->has("gemm")){
|
|
std::string layout = gemm->get<std::string>("layout");
|
|
sc::IsaacOperation_t AT = layout[0]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
|
|
sc::IsaacOperation_t BT = layout[1]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
|
|
auto shape = gemm->get<std::vector<size_t>>("shape");
|
|
size_t M = shape[0], N = shape[1], K = shape[2];
|
|
//Get Source
|
|
size_t ldc = M;
|
|
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
|
|
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
|
|
if(options->has("search")){
|
|
search_gemm(M, N, K, AT, BT, dtype);
|
|
}
|
|
if(gemm->has("kernel")){
|
|
auto x = gemm->get<std::vector<size_t>>("kernel");
|
|
generator.reset(new sc::templates::GEMM(dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]));
|
|
}
|
|
else{
|
|
sc::runtime::GEMMProfile* profile = (sc::runtime::GEMMProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::GEMM}).get();
|
|
generator.reset(new sc::templates::GEMM(profile->predict(stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc)));
|
|
}
|
|
if(options->has("dump"))
|
|
dump_source(device, *generator, dump, name);
|
|
if(options->has("bench")){
|
|
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
|
|
print_results_header({"AT", "BT", "M", "N", "K", "ISAAC", "cuBLAS"});
|
|
std::vector<gemm_params_t> shapes;
|
|
//User provided shapes
|
|
if(bench->get<std::string>("suite")=="custom")
|
|
shapes = {std::make_tuple(dtype, AT, BT, M, N, K)};
|
|
|
|
//SC17 paper shapes
|
|
if(bench->get<std::string>("suite")=="deepbench")
|
|
shapes = SC17::gemm(dtype);
|
|
|
|
//Print results
|
|
for(auto x: shapes){
|
|
std::tie(dtype, AT, BT, M, N, K) = x;
|
|
benchmark_gemm(*metric, context, device, stream, dtype, AT, BT, M, N, K, gemm->has("kernel")?generator.get():NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
// CONV
|
|
if(options->has("conv")){
|
|
sc::DType in_dtype = dtype;
|
|
sc::DType out_dtype = dtype;
|
|
auto x = conv->get<std::vector<size_t>>("shape");
|
|
param_t D = x[0], H = x[1], W = x[2], C = x[3], N = x[4], K = x[5], T = x[6], R = x[7], S = x[8], pad_d = x[9], pad_h = x[10], pad_w = x[11], stride_d = x[12], stride_h = x[13], stride_w = x[14];
|
|
param_t M, P, Q;
|
|
param_t upsample_d = 1, upsample_h = 1, upsample_w = 1;
|
|
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
|
|
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
|
|
sc::ActivationType activation = sc::Linear;
|
|
if(options->has("search"))
|
|
search_conv(D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, in_dtype, out_dtype);
|
|
if(conv->has("kernel")){
|
|
auto x = conv->get<std::vector<size_t>>("kernel");
|
|
generator.reset(new sc::templates::Conv(in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]));
|
|
}
|
|
else{
|
|
sc::runtime::ConvProfile* profile = (sc::runtime::ConvProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::CONV}).get();
|
|
generator.reset(new sc::templates::Conv(profile->predict(stream, in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1)));
|
|
}
|
|
if(options->has("dump"))
|
|
dump_source(device, *generator, dump, name);
|
|
if(options->has("bench")){
|
|
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
|
|
print_results_header({"N", "K", "M", "P", "Q", "C", "T", "R", "S", "ISAAC", "cuDNN"});
|
|
std::vector<conv_params_t> shapes;
|
|
//User provided shapes
|
|
if(bench->get<std::string>("suite")=="custom")
|
|
shapes = {std::make_tuple(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
|
|
//SuperComputing17 shapes
|
|
if(bench->get<std::string>("suite")=="deepbench")
|
|
shapes = SC17::conv(dtype);
|
|
//Print results
|
|
for(auto x: shapes){
|
|
std::tie(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
|
|
benchmark_conv(*metric, context, device, stream, in_dtype, out_dtype, D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, conv->has("kernel")?generator.get():NULL);
|
|
}
|
|
}
|
|
}
|
|
|
|
// POOL
|
|
if(options->has("pool")){
|
|
auto x = pool->get<std::vector<size_t>>("shape");
|
|
param_t D = x[0], W = x[1], H = x[2], N = x[3], K = x[4], T = x[5], R = x[6], S = x[7], pad_d = x[8], pad_h = x[9], pad_w = x[10], stride_d = x[11], stride_h = x[12], stride_w = x[13];
|
|
param_t M, P, Q;
|
|
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
|
|
|
|
if(pool->has("kernel")){
|
|
auto x = pool->get<std::vector<size_t>>("kernel");
|
|
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, x[0], x[1], x[2], x[3]));
|
|
}
|
|
else{
|
|
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w));
|
|
}
|
|
if(options->has("dump"))
|
|
dump_source(device, *generator, dump, name);
|
|
if(options->has("bench")){
|
|
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
|
|
print_results_header({"N", "K", "M", "P", "Q", "T", "R", "S", "ISAAC", "cuDNN"});
|
|
std::vector<pool_params_t> shapes;
|
|
//User provided shapes
|
|
if(bench->get<std::string>("suite")=="custom")
|
|
shapes = {std::make_tuple(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
|
|
//SuperComputing17 shapes
|
|
if(bench->get<std::string>("suite")=="deepbench")
|
|
shapes = SC17::pool(dtype);
|
|
//Print results
|
|
for(auto x: shapes){
|
|
std::tie(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
|
|
benchmark_pool(*metric, context, device, stream, dtype, D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, pool->has("kernel")?generator.get():NULL);
|
|
}
|
|
}
|
|
}
|
|
}
|