// triton/include/triton/driver/cublas.h

/* Copyright 2015-2017 Philippe Tillet
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files
* (the "Software"), to deal in the Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, sublicense, and/or sell copies of the Software,
* and to permit persons to whom the Software is furnished to do so,
* subject to the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/

#ifndef TDL_INCLUDE_DRIVER_CUBLAS_H
#define TDL_INCLUDE_DRIVER_CUBLAS_H

#include <map>
#include <stdexcept>

#include "isaac/templates/common.hpp"
#include "triton/driver/dispatch.h"
#include "triton/driver/buffer.h"
#include "triton/driver/stream.h"
#include "triton/driver/backend.h"
#include "triton/driver/error.h"
#include "triton/tools/bench.hpp"
#include "triton/tools/collections.hpp"

namespace triton
{
namespace driver
{

/* Selector for how a cuBLAS GEMM algorithm should be picked.
 * NOTE(review): nothing in the visible part of this header reads this enum;
 * the meaning of each enumerator is suggested by its name only - confirm
 * against the callers. */
enum cublasStrategy_t{
    CUBLAS_PREFER_FASTEST = 0,
    CUBLAS_HEURISTICS = 1
};


/* Candidate algorithms tried by cublasGemmFastest, in trial order.
 * The order matters for ties: the first entry with the lowest time wins,
 * so CUBLAS_GEMM_DFALT is preferred over an equally fast explicit algorithm.
 * Being a namespace-scope `static` in a header, every translation unit that
 * includes this file gets its own copy. */
static const std::vector<cublasGemmAlgo_t> cublasAlgorithms = {
  CUBLAS_GEMM_DFALT,
  CUBLAS_GEMM_ALGO0,
  CUBLAS_GEMM_ALGO1,
  CUBLAS_GEMM_ALGO2,
  CUBLAS_GEMM_ALGO3,
  CUBLAS_GEMM_ALGO4,
  CUBLAS_GEMM_ALGO5,
  CUBLAS_GEMM_ALGO6,
  CUBLAS_GEMM_ALGO7
};

/* DType -> cudaDataType lookup used by cublasGemm. Only FLOAT_TYPE and
 * DOUBLE_TYPE are mapped; .at() on any other DType throws std::out_of_range. */
static const std::map<DType, cudaDataType> cudtype = {
  {FLOAT_TYPE,  CUDA_R_32F},
  {DOUBLE_TYPE, CUDA_R_64F}
};

/* Transposition flag character -> cublasOperation_t. Only 'N' and 'T' are
 * mapped; .at() on any other character throws std::out_of_range. */
static const std::map<char, cublasOperation_t> cuop = {
  {'N', CUBLAS_OP_N},
  {'T', CUBLAS_OP_T}
};

/* Returns the fastest cublasGemmEx algorithm for a given problem.
 *
 * Every entry of cublasAlgorithms is timed with bench() on the supplied
 * operands; an algorithm whose call throws driver::exception::cublas::base
 * is given an infinite time. The winner is memoized per
 * (cudt, AT, BT, M, N, K), so later calls with the same key return the cached
 * answer without launching anything.
 *
 * Side effects: on a cache miss the GEMM is actually executed (repeatedly, by
 * bench), so the memory behind C is overwritten.
 *
 * NOTE(review): the cache key contains neither the device nor the leading
 * dimensions, so a result measured on one device/layout is reused for others.
 * The cache is a plain function-local static with no locking. */
inline cublasGemmAlgo_t cublasGemmFastest(stream& stream, cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
                         void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
                         void* beta, CUdeviceptr C, int32_t ldc){

  typedef std::tuple<cudaDataType_t, cublasOperation_t, cublasOperation_t, int32_t, int32_t, int32_t> key_t;

  // Only the *results* are kept in static storage. The previous version stored
  // the benchmarking lambda itself in a static cache; that lambda captured this
  // function's parameters by reference, so every cache miss after the first
  // call ran with dangling references to the first call's arguments.
  static std::map<key_t, cublasGemmAlgo_t> cache;
  const key_t key = std::make_tuple(cudt, AT, BT, M, N, K);
  const auto hit = cache.find(key);
  if(hit != cache.end())
    return hit->second;

  // Benchmark every candidate algorithm in cublasGemmEx with the current operands
  std::vector<double> times;
  times.reserve(cublasAlgorithms.size());
  for(cublasGemmAlgo_t a: cublasAlgorithms){
    try{
      times.push_back(bench([&](){ dispatch::cublasGemmEx(handle, AT, BT, M, N, K, alpha, (const void*)A, cudt, lda, (const void*)B, cudt, ldb, beta, (void*)C, cudt, ldc, cudt, a); },
      [&](){ stream.synchronize(); },
      stream.context().device()));
    }catch(driver::exception::cublas::base const &){
      // Algorithm not usable for this problem: make sure it never wins
      times.push_back(INFINITY);
    }
  }
  // First minimum wins, so earlier entries of cublasAlgorithms are preferred on ties
  const size_t argmin = std::min_element(times.begin(), times.end()) - times.begin();
  const cublasGemmAlgo_t fastest = cublasAlgorithms[argmin];

  // Cache result
  cache.insert(std::make_pair(key, fastest));
  return fastest;
}

/* Thin wrapper over dispatch::cublasGemmEx.
 * The single `cudt` is forwarded as the data type of A, B and C and as the
 * final type argument, and the raw device addresses are reinterpreted as the
 * pointer arguments expected by the dispatched call. */
inline void cublasGemmEx(cublasHandle_t handle, cudaDataType cudt, cublasOperation_t AT, cublasOperation_t BT, int32_t M, int32_t N, int32_t K,
                         void* alpha, CUdeviceptr A, int32_t lda, CUdeviceptr B, int32_t ldb,
                         void* beta, CUdeviceptr C, int32_t ldc, cublasGemmAlgo_t algo)
{
  const void* lhs = reinterpret_cast<const void*>(A);
  const void* rhs = reinterpret_cast<const void*>(B);
  void* out = reinterpret_cast<void*>(C);
  dispatch::cublasGemmEx(handle, AT, BT, M, N, K,
                         alpha, lhs, cudt, lda,
                         rhs, cudt, ldb,
                         beta, out, cudt, ldc,
                         cudt, algo);
}


/* Convenience GEMM entry point working on buffers and a stream.
 *
 * Activates the stream's context, fetches the cuBLAS handle for it and binds
 * the handle to `stream`. Then:
 *  - if `fastest` is null, a single GEMM is issued with `algo`;
 *  - otherwise no GEMM with `algo` is issued; instead cublasGemmFastest is
 *    run on the operands and its result is written to `*fastest`.
 *
 * `dtype` must be a key of `cudtype` and `cAT` / `cBT` keys of `cuop`
 * ('N' or 'T'); otherwise std::map::at throws std::out_of_range. */
inline void cublasGemm(DType dtype, stream& stream, char cAT, char cBT, int32_t M, int32_t N, int32_t K, scalar alpha, cu_buffer const & A, int32_t lda, cu_buffer const & B, int32_t ldb, scalar beta, cu_buffer& C, int32_t ldc, cublasGemmAlgo_t* fastest = NULL, cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT){
  ContextSwitcher ctx_switch(stream.context());
  cublasHandle_t handle = dispatch::cublasHandle(stream.context());
  dispatch::cublasSetStream_v2(handle, (CUstream)stream);

  // Translate the portable descriptors into their cuBLAS counterparts
  const cudaDataType cudt = cudtype.at(dtype);
  const cublasOperation_t opA = cuop.at(cAT);
  const cublasOperation_t opB = cuop.at(cBT);

  if(fastest == NULL){
    // Plain execution with the requested algorithm
    cublasGemmEx(handle, cudt, opA, opB, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc, algo);
    return;
  }
  // Auto-tuning request: report the fastest algorithm to the caller
  *fastest = cublasGemmFastest(stream, handle, cudt, opA, opB, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc);
}

/* Maps a DType onto the matching cudnnDataType_t.
 *
 * Handled values: INT8X4_TYPE, INT32_TYPE, FLOAT_TYPE, DOUBLE_TYPE.
 * Throws std::invalid_argument for any other value. The previous bare
 * `throw;` had no exception to rethrow at this point, which calls
 * std::terminate instead of reporting an error the caller can handle. */
inline cudnnDataType_t cudnnDtype(DType dtype){
  switch(dtype){
    case INT8X4_TYPE: return CUDNN_DATA_INT8x4;
    case INT32_TYPE: return CUDNN_DATA_INT32;
    case FLOAT_TYPE: return CUDNN_DATA_FLOAT;
    case DOUBLE_TYPE: return CUDNN_DATA_DOUBLE;
  }
  throw std::invalid_argument("cudnnDtype: unsupported DType");
}

/* Tensor layout used for a given cuDNN data type:
 * CUDNN_TENSOR_NCHW_VECT_C for CUDNN_DATA_INT8x4, CUDNN_TENSOR_NCHW for
 * everything else. */
inline cudnnTensorFormat_t format(cudnnDataType_t cutype){
  const bool vectorized = (cutype == CUDNN_DATA_INT8x4);
  return vectorized ? CUDNN_TENSOR_NCHW_VECT_C : CUDNN_TENSOR_NCHW;
}

/* Forward convolution through cuDNN on `stream`.
 *
 * Descriptor shapes: input {N, C, D, H, W}, filter {K, C, T, R, S},
 * output {N, K, M, P, Q}. When D, T and M are all 1 the depth entry is
 * removed from every shape and from pad/stride, giving 4-D descriptors.
 * The tensor layout comes from format(); for INT8X4_TYPE the convolution
 * descriptor is given CUDNN_DATA_INT32, otherwise the input data type.
 * The algorithm is requested with a 64 MiB workspace limit and runs on a
 * 64 MiB scratch buffer.
 *
 * NOTE(review): the tensor/filter/convolution descriptors created here are
 * never destroyed in this function.
 * NOTE(review): the scratch buffer is a function-local static, so it is
 * constructed with the context of the first call and reused by every later
 * call, whatever context that call's stream belongs to. */
inline void cudnnConv(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t C, int32_t T, int32_t R, int32_t S,
                      int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, cu_buffer const & F, scalar beta, cu_buffer const & O){
  // Scratch size in bytes; also the limit handed to the algorithm query
  const int workspace_limit = 1024*1024*64;

  driver::driver::context const & ctx = stream.context();
  ContextSwitcher switch_ctx(ctx);

  // Per-dimension parameters, depth first
  std::vector<int> padding = {pad_d, pad_h, pad_w};
  std::vector<int> strides = {stride_d, stride_h, stride_w};
  std::vector<int> upscale = {1, 1, 1};
  std::vector<int> out_dims = {N, K, M, P, Q};
  std::vector<int> flt_dims = {K, C, T, R, S};
  std::vector<int> in_dims = {N, C, D, H, W};

  // Degenerate depth everywhere: describe the problem as 2-D
  const bool planar = (D == 1) && (T == 1) && (M == 1);
  if(planar){
    for(std::vector<int>* v: {&padding, &strides, &upscale})
      v->erase(v->begin());
    for(std::vector<int>* v: {&out_dims, &in_dims, &flt_dims})
      v->erase(v->begin() + 2);
  }

  cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
  const cudnnDataType_t in_cutype = cudnnDtype(dtype);
  const cudnnDataType_t conv_cutype = (dtype == INT8X4_TYPE) ? CUDNN_DATA_INT32 : in_cutype;
  const cudnnTensorFormat_t layout = format(in_cutype);

  dispatch::cudnnSetStream(handle, (CUstream)stream);

  // Tensor and filter descriptors
  cudnnTensorDescriptor_t out_desc, in_desc;
  cudnnFilterDescriptor_t flt_desc;
  dispatch::cudnnCreateTensorDescriptor(&out_desc);
  dispatch::cudnnCreateTensorDescriptor(&in_desc);
  dispatch::cudnnCreateFilterDescriptor(&flt_desc);
  dispatch::cudnnSetTensorNdDescriptorEx(out_desc, layout, in_cutype, out_dims.size(), out_dims.data());
  dispatch::cudnnSetFilterNdDescriptor(flt_desc, in_cutype, layout, flt_dims.size(), flt_dims.data());
  dispatch::cudnnSetTensorNdDescriptorEx(in_desc, layout, in_cutype, in_dims.size(), in_dims.data());

  // Convolution descriptor and algorithm selection
  cudnnConvolutionDescriptor_t conv_desc;
  cudnnConvolutionFwdAlgo_t algo;
  dispatch::cudnnCreateConvolutionDescriptor(&conv_desc);
  dispatch::cudnnSetConvolutionNdDescriptor(conv_desc, padding.size(), padding.data(), strides.data(), upscale.data(), CUDNN_CROSS_CORRELATION, conv_cutype);
  dispatch::cudnnGetConvolutionForwardAlgorithm(handle, in_desc, flt_desc, conv_desc, out_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace_limit, &algo);

  size_t workspace_size;
  dispatch::cudnnGetConvolutionForwardWorkspaceSize(handle, in_desc, flt_desc, conv_desc, out_desc, algo, &workspace_size);

  // Scratch memory shared by all calls (see review note above)
  static cu_buffer work(ctx, workspace_limit);
  CUdeviceptr work_ptr = work;
  CUdeviceptr in_ptr = I;
  CUdeviceptr flt_ptr = F;
  CUdeviceptr out_ptr = O;
  dispatch::cudnnConvolutionForward(handle, alpha.data(), in_desc, (void*)in_ptr, flt_desc, (void*)flt_ptr, conv_desc, algo, (void*)work_ptr, workspace_size, beta.data(), out_desc, (void*)out_ptr);
}


/* Forward max pooling through cuDNN on `stream`.
 *
 * Descriptor shapes: input {N, K, D, H, W}, output {N, K, M, P, Q}, pooling
 * window {T, R, S}. When D, T and M are all 1 the depth entry is removed from
 * the shapes, window, pad and stride, giving a 2-D pooling problem.
 * Pooling mode is CUDNN_POOLING_MAX with CUDNN_NOT_PROPAGATE_NAN.
 *
 * NOTE(review): unlike cudnnConv, the tensor layout is always
 * CUDNN_TENSOR_NCHW whatever `dtype` is - confirm this is intended for
 * INT8X4_TYPE.
 * NOTE(review): the tensor and pooling descriptors created here are never
 * destroyed in this function. */
inline void cudnnPool(DType dtype, stream& stream, int32_t D, int32_t H, int32_t W, int32_t N, int32_t K, int32_t M, int32_t P, int32_t Q, int32_t T, int32_t R, int32_t S,
                      int32_t pad_d, int32_t pad_h, int32_t pad_w, int32_t stride_d, int32_t stride_h, int32_t stride_w, scalar alpha, cu_buffer const & I, scalar beta, cu_buffer const & O){
  driver::driver::context const & ctx = stream.context();
  ContextSwitcher switch_ctx(ctx);

  // Per-dimension parameters, depth first. The `upscale` vector that used to
  // be built here was never passed to any call and has been removed.
  std::vector<int> pad = {pad_d, pad_h, pad_w};
  std::vector<int> stride = {stride_d, stride_h, stride_w};
  std::vector<int> Oshapes = {N, K, M, P, Q};
  std::vector<int> Ishapes = {N, K, D, H, W};
  std::vector<int> window = {T, R, S};
  if(M == 1 && T == 1 && D == 1){
    // Degenerate depth everywhere: describe the problem as 2-D
    window.erase(window.begin());
    pad.erase(pad.begin());
    stride.erase(stride.begin());
    Oshapes.erase(Oshapes.begin() + 2);
    Ishapes.erase(Ishapes.begin() + 2);
  }

  cudnnHandle_t handle = dispatch::cudnnHandle(ctx);
  cudnnDataType_t cutype = cudnnDtype(dtype);

  dispatch::cudnnSetStream(handle, (CUstream)stream);
  cudnnTensorDescriptor_t tO, tI;
  cudnnPoolingDescriptor_t desc;
  dispatch::cudnnCreateTensorDescriptor(&tO);
  dispatch::cudnnCreateTensorDescriptor(&tI);

  dispatch::cudnnSetTensorNdDescriptorEx(tO, CUDNN_TENSOR_NCHW, cutype, Oshapes.size(), Oshapes.data());
  dispatch::cudnnSetTensorNdDescriptorEx(tI, CUDNN_TENSOR_NCHW, cutype, Ishapes.size(), Ishapes.data());

  dispatch::cudnnCreatePoolingDescriptor(&desc);
  dispatch::cudnnSetPoolingNdDescriptor(desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, window.size(), window.data(), pad.data(), stride.data());

  // Raw device addresses of the input and output buffers
  CUdeviceptr pI = I, pO = O;
  dispatch::cudnnPoolingForward(handle, desc, alpha.data(), tI, (void*)pI, beta.data(), tO, (void*)pO);
}

/* Converts a tensor between layouts and/or data types through
 * cudnnTransformTensor on `stream`.
 *
 * Input and output are both described with dimensions {N, C, D, H, W}; they
 * differ only in layout (`in_layout` / `out_layout`) and data type
 * (`in_dtype` / `out_dtype`, translated with cudnnDtype). `alpha` and `beta`
 * are forwarded to the cuDNN call unchanged.
 *
 * NOTE(review): unlike cudnnConv / cudnnPool, no ContextSwitcher is
 * instantiated here - confirm the stream's context is already current.
 * NOTE(review): the two tensor descriptors are never destroyed in this
 * function. */
inline void cudnnTransformTensor(driver::cu_stream & stream,
               DType in_dtype, DType out_dtype,
               cudnnTensorFormat_t in_layout, cudnnTensorFormat_t out_layout,
               int32_t N, int32_t C, int32_t D, int32_t H, int32_t W,
               scalar alpha, driver::cu_buffer const & I, scalar beta, driver::cu_buffer& O)
{
  cudnnHandle_t handle = dispatch::cudnnHandle(stream.context());
  dispatch::cudnnSetStream(handle, (CUstream)stream);

  // Same extents on both sides of the transform
  std::vector<int> dims = {N, C, D, H, W};

  // Source descriptor
  cudnnTensorDescriptor_t src_desc;
  dispatch::cudnnCreateTensorDescriptor(&src_desc);
  dispatch::cudnnSetTensorNdDescriptorEx(src_desc, in_layout, cudnnDtype(in_dtype), dims.size(), dims.data());

  // Destination descriptor
  cudnnTensorDescriptor_t dst_desc;
  dispatch::cudnnCreateTensorDescriptor(&dst_desc);
  dispatch::cudnnSetTensorNdDescriptorEx(dst_desc, out_layout, cudnnDtype(out_dtype), dims.size(), dims.data());

  // Raw device addresses of the source and destination buffers
  CUdeviceptr src_ptr = I;
  CUdeviceptr dst_ptr = O;
  dispatch::cudnnTransformTensor(handle, alpha.data(), src_desc, (void*)src_ptr, beta.data(), dst_desc, (void*)dst_ptr);
}


}
}


#endif