Files
triton/examples/isaac-tools.cpp
2018-12-21 22:02:27 -05:00

650 lines
34 KiB
C++
Executable File

#include "opts.hpp"
#include "isaac/scalar.h"
#include "isaac/api.h"
#include "isaac/driver/cublas.h"
#include "isaac/driver/backend.h"
#include "isaac/driver/context.h"
#include "isaac/driver/stream.h"
#include "isaac/runtime/predict.h"
#include "isaac/templates/gemm.h"
#include "isaac/templates/error.hpp"
#include "isaac/tools/bench.hpp"
namespace sc = isaac;
namespace drv = sc::driver;
using sc::param_t;
enum Code {
RESET = 0,
BOLD = 1,
ITALIC = 3,
FG_RED = 31,
FG_GREEN = 32,
FG_YELLOW = 33,
FG_BLUE = 34,
FG_MAGENTA = 35,
FG_CYAN = 36,
FG_LIGHT_GRAY = 37,
FG_DARK_GRAY = 90,
FG_LIGHT_RED = 91,
FG_LIGHT_GREEN = 92,
FG_LIGHT_YELLOW = 93,
FG_LIGHT_BLUE = 94,
FG_LIGHT_MAGENTA = 95,
FG_LIGHT_CYAN = 96,
FG_WHITE = 97
};
class color_stream {
Code code;
public:
color_stream(Code pCode) : code(pCode) {}
friend std::ostream&
operator<<(std::ostream& os, const color_stream& mod) {
return os << "\033[" << mod.code << "m";
}
};
/* Helpers for benchmarking */
typedef std::tuple<sc::DType, sc::IsaacOperation_t, sc::IsaacOperation_t, sc::param_t, sc::param_t, sc::param_t> gemm_params_t;
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> conv_params_t;
typedef std::tuple<sc::DType, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t> pool_params_t;
struct SC17{
// GEMM
static std::vector<gemm_params_t> gemm(sc::DType dtype){
std::vector<gemm_params_t> shapes;
// LinPack
for(param_t N: std::vector<param_t>{512, 1024, 2048})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, N));
// DeepBench
for(sc::IsaacOperation_t AT: std::vector<sc::IsaacOperation_t>{sc::ISAAC_OP_N, sc::ISAAC_OP_T})
for(param_t M: std::vector<param_t>{1760})
for(param_t N: std::vector<param_t>{16, 32, 64, 128})
shapes.push_back(std::make_tuple(dtype, AT, sc::ISAAC_OP_N, M, N, M));
// PCA/ICA
for(param_t N: std::vector<param_t>{16, 64, 256})
for(param_t K: std::vector<param_t>{64000})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
// LaPACK
for(param_t N: std::vector<param_t>{1024, 2048, 4096})
for(param_t K: std::vector<param_t>{32})
shapes.push_back(std::make_tuple(dtype, sc::ISAAC_OP_N, sc::ISAAC_OP_T, N, N, K));
return shapes;
}
// CONV
static std::vector<conv_params_t> conv(sc::DType dtype){
// Vector of (dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)
std::vector<conv_params_t> shapes;
// // DeepSpeech
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 700, 161, 1, N, 32, 1, 5, 20, 0, 0, 0, 1, 1, 1));
// for(size_t N: std::vector<size_t>{8})
// shapes.push_back(std::make_tuple(dtype, 1, 341, 79, 32, N, 32, 1, 5, 10, 0, 0, 0, 1, 1, 1));
// // OCR
// shapes.push_back(std::make_tuple(dtype, 1, 480, 48, 1, 16, 16, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 240, 24, 16, 16, 32, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 120, 12, 32, 16, 64, 1, 3, 3, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 60, 6, 64, 16, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Face Recognition
// shapes.push_back(std::make_tuple(dtype, 1, 108, 108, 3, 8, 64, 1, 3, 3, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 54, 54, 64, 8, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 27, 27, 128, 8, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 128, 8, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 256, 8, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// // Vision
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, N, 64, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 128, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 128, N, 256, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 256, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// }
// shapes.push_back(std::make_tuple(dtype, 1, 224, 224, 3, 16, 64, 1, 7, 7, 0, 3, 3, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 32, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 192, 16, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 48, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 512, 16, 192, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 832, 16, 128, 1, 5, 5, 0, 2, 2, 1, 1, 1));
// // Speaker ID
// shapes.push_back(std::make_tuple(dtype, 1, 350, 80, 64, 16, 128, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 175, 40, 128, 16, 256, 1, 5, 5, 0, 1, 1, 1, 2, 2));
// // ResNET
// for(size_t N: std::vector<size_t>{8}){
// shapes.push_back(std::make_tuple(dtype, 1, 112, 112, 64, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 64, N, 256, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 64, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 56, 56, 256, N, 128, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 128, N, 512, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 128, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 256, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 256, N, 1024, 1, 1, 1, 0, 0, 0, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 28, 28, 512, N, 1024, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 0, 0, 1, 2, 2));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 512, 1, 3, 3, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 7, 7, 512, N, 2048, 1, 1, 1, 0, 1, 1, 1, 1, 1));
// shapes.push_back(std::make_tuple(dtype, 1, 14, 14, 1024, N, 2048, 1, 1, 1, 0, 1, 1, 1, 2, 2));
// }
// 3D-Unet
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 4, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 24, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 72, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 216, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 648, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 648, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 432, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 216, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 216, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 144, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 72, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 72, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 48, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 24, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 4 , 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
// POOL
static std::vector<pool_params_t> pool(sc::DType dtype){
std::vector<pool_params_t> shapes;
// 3D-Unet
shapes.push_back(std::make_tuple(dtype, 31, 204, 204, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 29, 202, 202, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 27, 100, 100, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 25, 98, 98, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 23, 48, 48, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 21, 46, 46, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 19, 22, 22, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 17, 20, 20, 1, 648, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 15, 36, 36, 1, 432, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 13, 36, 36, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 34, 34, 1, 216, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 144, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 11, 64, 64, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 9 , 62, 62, 1, 72, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 7 , 120, 120, 1, 48, 1, 1, 1, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 5 , 120, 120, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 3 , 118, 118, 1, 24, 3, 3, 3, 0, 0, 0, 1, 1, 1));
shapes.push_back(std::make_tuple(dtype, 1 , 116, 116, 1, 24, 1, 1, 1, 0, 0, 0, 1, 1, 1));
return shapes;
}
};
/* Metrics for benchmarking */
struct Metric{
virtual std::function<bool(double, double)> cmp() const = 0;
virtual double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const = 0;
virtual double gemm(param_t M, param_t N, param_t K, double tsec) const = 0;
virtual double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t, param_t, param_t, double tsec) const = 0;
};
class FLOPS: public Metric{
public:
FLOPS(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::greater<double>(); }
double conv(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t C, param_t R, param_t S, param_t T, double tsec) const
{ return sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec) * 1e12 / scale_; }
double gemm(param_t M, param_t N, param_t K, double tsec) const
{ return sc::templates::GEMM::tflops(M, N, K, tsec) * 1e12 / scale_; }
double pool(param_t P, param_t Q, param_t M, param_t K, param_t N, param_t T, param_t R, param_t S, double tsec) const
{ return sc::templates::Pool::tflops(P, Q, M, K, N, T, R, S, tsec) * 1e12 / scale_;}
private:
double scale_;
};
class Time: public Metric{
public:
Time(double scale): scale_(scale){}
std::function<bool(double, double)> cmp() const { return std::less<double>(); }
double conv(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double gemm(param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
double pool(param_t, param_t, param_t, param_t, param_t, param_t, param_t, param_t, double tsec) const { return tsec*1e-9/scale_; }
private:
double scale_;
};
void print_results_header(std::vector<std::string> sections){
std::cout << color_stream(ITALIC) << color_stream(BOLD) ;
std::copy(sections.begin(), sections.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
std::cout << color_stream(RESET) << std::endl;
}
void print_results(std::vector<double> const & times, std::vector<std::string> const & prefix, std::function<bool(double, double)> cmp, std::function<double(double)> fn){
std::copy(prefix.begin(), prefix.end(), std::ostream_iterator<std::string>(std::cout, "\t"));
std::vector<double> perf;
std::transform(times.begin(), times.end(), std::back_inserter(perf), fn);
auto fastest = perf;
std::sort(fastest.begin(), fastest.end(), cmp);
for(auto x: perf){
if(x == fastest[0] && x / fastest[1] > 1.05)
std::cout << color_stream(FG_LIGHT_BLUE) << x << color_stream(RESET);
else
std::cout << x;
std::cout << "\t";
}
std::cout << std::endl;
}
void benchmark_gemm(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, size_t M, size_t N, size_t K,
sc::templates::Generator* generator){
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N';
char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N';
sc::driver::Buffer C(ctx, M*N*dtsize);
sc::driver::Buffer A(ctx, M*K*dtsize);
sc::driver::Buffer B(ctx, K*N*dtsize);
std::vector<double> times;
times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C, 1., 1., 1., NULL, (sc::templates::GEMM*)generator, 10); }, [&](){ stream.synchronize(); }, device));
if(sc::driver::dispatch::cublasinit()){
cublasGemmAlgo_t fastest;
sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, &fastest);
times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc, NULL, fastest); }, [&](){ stream.synchronize(); }, device));
//times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device));
}
print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, metric.cmp(), [&](double tsec){ return metric.gemm(M, N, K, tsec);});
}
void benchmark_conv(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType in_dtype, sc::DType out_dtype, size_t D, size_t H, size_t W, size_t C, size_t N, size_t K, size_t T, size_t R, size_t S,
size_t pad_d, size_t pad_h, size_t pad_w,
size_t stride_d, size_t stride_h, size_t stride_w,
size_t upsample_d, size_t upsample_h, size_t upsample_w,
sc::templates::Generator* generator){
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
size_t vect_c = (in_dtype==sc::INT8X4_TYPE)?4:1;
size_t vect_k = (out_dtype==sc::INT8X4_TYPE)?4:1;
sc::DType ab_dtype = (out_dtype==sc::INT8X4_TYPE)?sc::FLOAT_TYPE:out_dtype;
sc::scalar alpha(1., ab_dtype);
sc::scalar beta(0., ab_dtype);
sc::driver::Buffer O(ctx, N*K/vect_k*M*P*Q*sc::size_of(out_dtype));
sc::driver::Buffer I(ctx, C/vect_c*D*H*W*N*sc::size_of(in_dtype));
sc::driver::Buffer F(ctx, K*C/vect_c*T*R*S*sc::size_of(in_dtype));
std::vector<double> times;
times.push_back(bench([&](){ sc::CONV(device, stream, in_dtype, out_dtype, N, K, M, P, Q, C, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, I, F, &O, 1, NULL, activation, 0., 1., 1., {1.}, 1., sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, NULL, (sc::templates::Conv*)generator, 10); }, [&](){ stream.synchronize(); }, device));
// if(sc::driver::dispatch::cudnninit())
// times.push_back(bench([&](){ sc::driver::cudnnConv(out_dtype, stream, D, H, W, N, K, M, P, Q, C, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(C), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.conv(M, P, Q, K, N, C, T, R, S, tsec);});
}
void benchmark_pool(Metric const & metric, sc::driver::Context& ctx, sc::driver::Device& device, sc::driver::Stream& stream,
sc::DType dtype, size_t D, size_t H, size_t W, size_t N, size_t K, size_t T, size_t R, size_t S, size_t pad_d, size_t pad_h, size_t pad_w, size_t stride_d, size_t stride_h, size_t stride_w,
sc::templates::Generator* generator){
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
size_t dtsize = sc::size_of(dtype);
sc::scalar alpha(1., dtype);
sc::scalar beta(0., dtype);
sc::driver::Buffer O(ctx, N*K*M*P*Q*dtsize);
sc::driver::Buffer I(ctx, K*D*H*W*N*dtsize);
std::vector<double> times;
times.push_back(bench([&](){ sc::POOL(device, stream, dtype, dtype, sc::MaxPool, K, M, P, Q, N, T, R, S, D, H, W, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, I, O, 1., 1., (sc::templates::Pool*)generator); }, [&](){ stream.synchronize(); }, device));
if(sc::driver::dispatch::cudnninit())
times.push_back(bench([&](){ sc::driver::cudnnPool(dtype, stream, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, alpha, I, beta, O); }, [&](){ stream.synchronize(); }, device));
print_results(times, {str(N), str(K), str(M), str(P), str(Q), str(T), str(R), str(S)}, metric.cmp(), [&](double tsec){ return metric.pool(M, P, Q, K, N, T, R, S, tsec);});
}
/* ------------------------------- */
void loop_nest(std::vector<size_t> const & ranges, std::function<void(std::vector<size_t> const &)> const & f){
size_t D = ranges.size();
std::vector<size_t> values(D, 0);
// Start with innermost loop
size_t i = D - 1;
while(true){
//Execute function
f(values);
//Increment counters
while(values[i]++ == ranges[i] - 1){
if(i == 0)
return;
values[i--] = 0;
}
i = D - 1;
}
}
template<class T>
void loop_nest(std::vector<std::vector<T>> const & iterates, std::function<void(std::vector<T>)> const & f){
//Ranges to iterate over
std::vector<size_t> ranges;
for(auto const & x: iterates)
ranges.push_back(x.size());
//Proxy function
auto proxy = [&](std::vector<size_t> const & idx){
std::vector<T> x(iterates.size());
for(size_t i = 0; i < x.size(); ++i)
x[i] = iterates[i][idx[i]];
f(x);
};
//Iterate
loop_nest(ranges, proxy);
}
void search_conv(int32_t D, int32_t H, int32_t W,
int32_t C, int32_t N, int32_t K,
int32_t T, int32_t R, int32_t S,
int32_t pad_d, int32_t pad_h, int32_t pad_w,
int32_t stride_d, int32_t stride_h, int32_t stride_w,
int32_t upsample_d, int32_t upsample_h, int32_t upsample_w,
sc::ActivationType activation, sc::DType in_dtype, sc::DType out_dtype)
{
auto ctx = drv::backend::contexts::get_default();
size_t P = (H - R + 1 + 2*pad_h + stride_h - 1)/stride_h;
size_t Q = (W - S + 1 + 2*pad_w + stride_w - 1)/stride_w;
size_t M = (D - T + 1 + 2*pad_d + stride_d - 1)/stride_d;
size_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
//Setup
drv::Buffer O(ctx, K*P*Q*M*N*sc::size_of(out_dtype));
drv::Buffer I(ctx, C*H*W*D*N*sc::size_of(in_dtype));
drv::Buffer F(ctx, C*R*S*T*K*sc::size_of(in_dtype));
drv::Stream stream(ctx);
//Exhaustive search
std::vector<sc::param_t> r1 = {1};
std::vector<sc::param_t> rv = {4};
std::vector<sc::param_t> rr = {1, 2, 4, 8};
std::vector<sc::param_t> rl = {4, 8, 16, 32};
std::vector<sc::param_t> rs = {4, 8, 16};
double best;
loop_nest<sc::param_t>({rv, rl, rl, rs, rs, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
sc::templates::Conv generator(in_dtype, out_dtype,
C, D, H, W, N, K, M, P, Q, T, R, S,
pad_d, pad_h, pad_w,
stride_d, stride_h, stride_w,
upsample_d, upsample_h, upsample_w,
activation, 1,
sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1,
x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]);
//Compile
try{
std::string src = generator.dump(ctx.device(), "conv");
drv::Module program(ctx, src);
drv::Kernel kernel(program, "conv");
double tsec = bench([&](){ generator.enqueue(kernel, stream, I, F, &O); }, [&](){ stream.synchronize(); }, ctx.device());
double tflops = sc::templates::Conv::tflops(P,Q,M,K,N,C,R,S,T,tsec);
best = std::max(tflops, best);
std::cout << "//";
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
}catch(isaac::templates::invalid_parameters const &){
return;
}catch(drv::exception::cuda::launch_out_of_resources const &){
return;
}
});
std::cout << "ISAAC: " << best << std::endl;
}
void search_gemm(int32_t M, int32_t N, int32_t K, sc::IsaacOperation_t AT, sc::IsaacOperation_t BT, sc::DType dtype){
auto ctx = drv::backend::contexts::get_default();
size_t dtsize = sc::size_of(dtype);
// Setup
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
int32_t offc = 0, offa = 0, offb = 0;
drv::Buffer C(ctx, M*N*dtsize);
drv::Buffer A(ctx, M*K*dtsize);
drv::Buffer B(ctx, K*N*dtsize);
drv::Stream stream(ctx);
sc::scalar alpha(1., dtype), beta(0., dtype);
// Exhaustive search
std::vector<sc::param_t> r1 = {1};
std::vector<sc::param_t> rv = {4};
std::vector<sc::param_t> rr = {1, 2, 4, 8};
std::vector<sc::param_t> rl = {4, 8, 16, 32};
std::vector<sc::param_t> rs = {4, 8, 16};
double best = 0;
loop_nest<sc::param_t>({rv, rl, rl, rl, rs, r1, rs, rl, rl, rl, rl, r1, rr, rr}, [&](std::vector<sc::param_t> const & x){
isaac::templates::GEMM generator(dtype, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
// Compile
try{
std::string src = generator.dump(ctx.device(), "gemm");
drv::Module program(ctx, src);
drv::Kernel kernel(program, "gemm");
double time = bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, ctx.device());
double tflops = 2*1e-3*M*N*K/time;
best = std::max(tflops, best);
std::cout << "//";
std::copy(x.begin(), x.end(), std::ostream_iterator<int>(std::cout, " "));
std::cout << ": " << tflops << " TFLOPS [BEST: " << best << "]" << std::endl;
}catch(isaac::templates::invalid_parameters const &){
return;
}catch(drv::exception::cuda::launch_out_of_resources const &){
return;
}
});
std::cout << "ISAAC: " << best << std::endl;
}
/* Helpers for dumping source code */
void dump_source(sc::driver::Device const & device, sc::templates::Generator& generator, opts::Options* options, std::string const & name){
if(options->get<std::string>("format") == "ptx")
std::cout << generator.dump(device, name) << std::endl;
else{
auto x = generator.tuning_params();
std::cout << "Tuning parameters: " << std::flush;
for(size_t i = 0; i < x.size(); ++i)
std::cout << ((i>0)?", ":"") << x[i] << std::flush;
std::cout << std::endl;
}
}
/* Application code */
int main(int argc, char* argv[]){
opts::Application program("isaac-tools", "Command-line interface for ISAAC");
// Options
opts::Options* options = program.options();
options->add<size_t>("device", "Device to run on", 0);
options->add<sc::DType>("dtype", "Data-type to use for computations", "float32", {{"int8x4", sc::INT8X4_TYPE}, {"float32", sc::FLOAT_TYPE}, {"float64", sc::DOUBLE_TYPE}});
options->add<std::string>("name", "Name to give to the generated kernel", "kernel");
options->add_group("search", "Exhaustively search for best tuning parameters");
opts::Options* dump = options->add_group("dump", "Dump source-code generated by ISAAC");
dump->add("format", "Format to generate", "ptx", {"ptx", "params"});
dump->add("target", "Target GPU (sm_xx)", {"sm_50", "sm_52", "sm_60", "sm_61", "sm_70"});
opts::Options* bench = options->add_group("bench", "Benchmark source code generated by ISAAC");
bench->add("suite", "Benchmarking suite to run", "custom", {"custom", "deepbench"});
bench->add<std::shared_ptr<Metric>>("metric", "performance metric for the results", "tflops", {{"tflops", std::make_shared<FLOPS>(1e12)}, {"ms", std::make_shared<Time>(1e-3)}, {"us", std::make_shared<Time>(1e-6)}});
// Constraints
options->add_constraint(opts::OneOf({"bench", "dump", "search"}));
options->add_constraint(opts::OneOf({"gemm", "conv", "pool"}));
// GEMM
opts::Options* gemm = options->add_group("gemm", "Use matrix-multiplication");
gemm->add("layout", "Transposition layout for A and B", "NT", {"NN", "NT", "TN", "TT"});
gemm->add<std::vector<size_t>>("shape", "Matrix shapes (M,N,K)", {2048, 2048, 2048}, opts::SizeConstraint(3));
gemm->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(14));
// CONV
opts::Options* conv = options->add_group("conv", "Use convolutions");
conv->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 512, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(15));
conv->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(9));
// POOL
opts::Options* pool = options->add_group("pool", "Use pooling");
pool->add<std::vector<size_t>>("shape", "Tensor shapes (D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)", {1, 70, 14, 128, 64, 1, 7, 7, 0, 0, 0, 1, 1, 1}, opts::SizeConstraint(14));
pool->add<std::vector<size_t>>("kernel", "Bypass predictive model to use given tuning parameters", opts::SizeConstraint(4));
program.parse(argc, argv);
if(options->has("bench"))
std::cout << std::fixed << std::setprecision(2);
//Device
sc::driver::Device device = sc::driver::backend::devices()[options->get<size_t>("device")];
if(options->has("dump") && dump->has("target")){
std::string target = dump->get<std::string>("target");
char major = target[3];
char minor = target[4];
device.interpret_as(std::make_pair((size_t)std::atoi(&major), (size_t)std::atoi(&minor)));
}
static sc::driver::Context context(device);
sc::driver::Stream stream(context);
// Data-Type
sc::DType dtype = options->get<sc::DType>("dtype");
// Kernel name
std::string name = options->get<std::string>("name");
/* Get optimized kernel generator */
std::unique_ptr<sc::templates::Generator> generator;
// GEMM
if(options->has("gemm")){
std::string layout = gemm->get<std::string>("layout");
sc::IsaacOperation_t AT = layout[0]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
sc::IsaacOperation_t BT = layout[1]=='T'?sc::ISAAC_OP_T:sc::ISAAC_OP_N;
auto shape = gemm->get<std::vector<size_t>>("shape");
size_t M = shape[0], N = shape[1], K = shape[2];
//Get Source
size_t ldc = M;
size_t lda = (AT==sc::ISAAC_OP_N)?M:K;
size_t ldb = (BT==sc::ISAAC_OP_N)?K:N;
if(options->has("search")){
search_gemm(M, N, K, AT, BT, dtype);
}
if(gemm->has("kernel")){
auto x = gemm->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::GEMM(dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]));
}
else{
sc::runtime::GEMMProfile* profile = (sc::runtime::GEMMProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::GEMM}).get();
generator.reset(new sc::templates::GEMM(profile->predict(stream, dtype, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"AT", "BT", "M", "N", "K", "ISAAC", "cuBLAS"});
std::vector<gemm_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, AT, BT, M, N, K)};
//SC17 paper shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::gemm(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, AT, BT, M, N, K) = x;
benchmark_gemm(*metric, context, device, stream, dtype, AT, BT, M, N, K, gemm->has("kernel")?generator.get():NULL);
}
}
}
// CONV
if(options->has("conv")){
sc::DType in_dtype = dtype;
sc::DType out_dtype = dtype;
auto x = conv->get<std::vector<size_t>>("shape");
param_t D = x[0], H = x[1], W = x[2], C = x[3], N = x[4], K = x[5], T = x[6], R = x[7], S = x[8], pad_d = x[9], pad_h = x[10], pad_w = x[11], stride_d = x[12], stride_h = x[13], stride_w = x[14];
param_t M, P, Q;
param_t upsample_d = 1, upsample_h = 1, upsample_w = 1;
param_t Zk = 0, crop_z_m0 = 0, crop_z_m1 = 0, crop_z_p0 = 0, crop_z_p1 = 0, crop_z_q0 = 0, crop_z_q1 = 0;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, M, P, Q);
sc::ActivationType activation = sc::Linear;
if(options->has("search"))
search_conv(D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, in_dtype, out_dtype);
if(conv->has("kernel")){
auto x = conv->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Conv(in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1, x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8]));
}
else{
sc::runtime::ConvProfile* profile = (sc::runtime::ConvProfile*)sc::runtime::database.at({device.architecture(), sc::runtime::CONV}).get();
generator.reset(new sc::templates::Conv(profile->predict(stream, in_dtype, out_dtype, C, D, H, W, N, K, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, activation, 1, sc::NoResidual, Zk, crop_z_m0, crop_z_m1, crop_z_p0, crop_z_p1, crop_z_q0, crop_z_q1)));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "C", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<conv_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::conv(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_conv(*metric, context, device, stream, in_dtype, out_dtype, D, H, W, C, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, upsample_d, upsample_h, upsample_w, conv->has("kernel")?generator.get():NULL);
}
}
}
// POOL
if(options->has("pool")){
auto x = pool->get<std::vector<size_t>>("shape");
param_t D = x[0], W = x[1], H = x[2], N = x[3], K = x[4], T = x[5], R = x[6], S = x[7], pad_d = x[8], pad_h = x[9], pad_w = x[10], stride_d = x[11], stride_h = x[12], stride_w = x[13];
param_t M, P, Q;
sc::templates::Conv::output_shapes(D, H, W, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, 1, 1, 1, M, P, Q);
if(pool->has("kernel")){
auto x = pool->get<std::vector<size_t>>("kernel");
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, x[0], x[1], x[2], x[3]));
}
else{
generator.reset(new sc::templates::Pool(dtype, dtype, sc::MaxPool, K, D, H, W, N, M, P, Q, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w));
}
if(options->has("dump"))
dump_source(device, *generator, dump, name);
if(options->has("bench")){
auto metric = bench->get<std::shared_ptr<Metric>>("metric");
print_results_header({"N", "K", "M", "P", "Q", "T", "R", "S", "ISAAC", "cuDNN"});
std::vector<pool_params_t> shapes;
//User provided shapes
if(bench->get<std::string>("suite")=="custom")
shapes = {std::make_tuple(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w)};
//SuperComputing17 shapes
if(bench->get<std::string>("suite")=="deepbench")
shapes = SC17::pool(dtype);
//Print results
for(auto x: shapes){
std::tie(dtype, D, W, H, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w) = x;
benchmark_pool(*metric, context, device, stream, dtype, D, H, W, N, K, T, R, S, pad_d, pad_h, pad_w, stride_d, stride_h, stride_w, pool->has("kernel")?generator.get():NULL);
}
}
}
}