From 9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c Mon Sep 17 00:00:00 2001 From: Philippe Tillet Date: Thu, 11 May 2017 16:11:40 -0700 Subject: [PATCH] Code Quality: some cleanups --- examples/bench.cpp | 6 ++--- include/isaac/api.h | 9 +++---- include/isaac/driver/buffer.h | 2 ++ include/isaac/driver/context.h | 8 ++++++ include/isaac/driver/cublas.h | 26 ++++++++++++------- include/isaac/driver/dispatch.h | 13 ++++++---- .../external/CUDA/builtin_types.h | 0 .../external/CUDA/channel_descriptor.h | 0 .../{driver => }/external/CUDA/cuComplex.h | 0 .../isaac/{driver => }/external/CUDA/cublas.h | 0 .../{driver => }/external/CUDA/cublas_api.h | 0 .../{driver => }/external/CUDA/cublas_v2.h | 0 .../isaac/{driver => }/external/CUDA/cuda.h | 0 .../external/CUDA/cuda_device_runtime_api.h | 0 .../{driver => }/external/CUDA/cuda_fp16.h | 0 .../{driver => }/external/CUDA/cuda_runtime.h | 0 .../external/CUDA/cuda_runtime_api.h | 0 .../isaac/{driver => }/external/CUDA/cudnn.h | 0 .../{driver => }/external/CUDA/device_types.h | 0 .../external/CUDA/driver_functions.h | 0 .../{driver => }/external/CUDA/driver_types.h | 0 .../{driver => }/external/CUDA/host_config.h | 0 .../{driver => }/external/CUDA/host_defines.h | 0 .../isaac/{driver => }/external/CUDA/nvml.h | 0 .../isaac/{driver => }/external/CUDA/nvrtc.h | 0 .../external/CUDA/surface_types.h | 0 .../external/CUDA/texture_types.h | 0 .../external/CUDA/vector_functions.h | 0 .../external/CUDA/vector_functions.hpp | 0 .../{driver => }/external/CUDA/vector_types.h | 0 include/isaac/runtime/predict.h | 4 +-- lib/driver/buffer.cpp | 12 ++++++--- lib/driver/context.cpp | 19 +++++++++++++- lib/driver/dispatch.cpp | 2 ++ lib/driver/module.cpp | 11 +++++--- lib/driver/stream.cpp | 11 ++++++-- lib/runtime/predict.cpp | 20 ++++++-------- lib/templates/conv.cpp | 3 ++- tests/conv.cpp | 2 +- tests/gemm.cpp | 24 ++++++++--------- 40 files changed, 112 insertions(+), 60 deletions(-) rename include/isaac/{driver => }/external/CUDA/builtin_types.h (100%) rename include/isaac/{driver => }/external/CUDA/channel_descriptor.h (100%) rename include/isaac/{driver => }/external/CUDA/cuComplex.h (100%) rename include/isaac/{driver => }/external/CUDA/cublas.h (100%) rename include/isaac/{driver => }/external/CUDA/cublas_api.h (100%) rename include/isaac/{driver => }/external/CUDA/cublas_v2.h (100%) rename include/isaac/{driver => }/external/CUDA/cuda.h (100%) rename include/isaac/{driver => }/external/CUDA/cuda_device_runtime_api.h (100%) rename include/isaac/{driver => }/external/CUDA/cuda_fp16.h (100%) rename include/isaac/{driver => }/external/CUDA/cuda_runtime.h (100%) rename include/isaac/{driver => }/external/CUDA/cuda_runtime_api.h (100%) rename include/isaac/{driver => }/external/CUDA/cudnn.h (100%) rename include/isaac/{driver => }/external/CUDA/device_types.h (100%) rename include/isaac/{driver => }/external/CUDA/driver_functions.h (100%) rename include/isaac/{driver => }/external/CUDA/driver_types.h (100%) rename include/isaac/{driver => }/external/CUDA/host_config.h (100%) rename include/isaac/{driver => }/external/CUDA/host_defines.h (100%) rename include/isaac/{driver => }/external/CUDA/nvml.h (100%) rename include/isaac/{driver => }/external/CUDA/nvrtc.h (100%) rename include/isaac/{driver => }/external/CUDA/surface_types.h (100%) rename include/isaac/{driver => }/external/CUDA/texture_types.h (100%) rename include/isaac/{driver => }/external/CUDA/vector_functions.h (100%) rename include/isaac/{driver => }/external/CUDA/vector_functions.hpp (100%) rename include/isaac/{driver => }/external/CUDA/vector_types.h (100%) diff --git a/examples/bench.cpp b/examples/bench.cpp index 6c4a625d8..947c7727f 100644 --- a/examples/bench.cpp +++ b/examples/bench.cpp @@ -141,8 +141,8 @@ int main(int argc, char* argv[]) drv::Buffer F(ctx, K*C*R*S*dtsize); std::vector times; - times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); - times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); +// times.push_back(bench([&](){ sc::CONV(device, stream, dtype, N, K, P, Q, C, R, S, H, W, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); + times.push_back(bench([&](){ sc::driver::cudnnConv(dtype, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device)); speedup.push_back(times[1]/times[0]); print_results(times, {str(N), str(K), str(P), str(Q), str(C), str(R), str(S)}, [&](double tsec){ return sc::templates::Conv::tflops(P,Q,K,N,C,R,S,tsec);}); } @@ -205,7 +205,7 @@ int main(int argc, char* argv[]) std::vector times; times.push_back(bench([&](){ sc::GEMM(device, stream, dtype, AT, BT, M, N, K, 0, lda, 0, ldb, 0, ldc, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device)); - times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, ctx, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device)); + times.push_back(bench([&](){ sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); }, [&](){ stream.synchronize(); }, device)); speedup.push_back(times[1]/times[0]); print_results(times, {str(AT), str(BT), str(M), str(N), str(K)}, [&](double tsec){ return sc::templates::GEMM::tflops(M, N, K, tsec);}); } diff --git a/include/isaac/api.h b/include/isaac/api.h index 9b3a3a5c7..1bac4ee2f 100644 --- a/include/isaac/api.h +++ b/include/isaac/api.h @@ -45,7 +45,7 @@ void GEMM(driver::Device const & device, driver::Stream & stream, static std::function compile = [&](){ //Fetch profile runtime::GEMMProfile* profile = (runtime::GEMMProfile*)runtime::database.at({device.architecture(), runtime::GEMM}).get(); - templates::GEMM generator = profile->predict(device, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc); + templates::GEMM generator = profile->predict(stream, device, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc); //Execute std::string src = generator.dump(device, "gemm"); driver::Module module(stream.context(), src); @@ -69,14 +69,13 @@ void CONV(driver::Device const & device, driver::Stream & stream, static std::function compile = [&](){ //Fetch profile runtime::ConvProfile* profile = (runtime::ConvProfile*)runtime::database.at({device.architecture(), runtime::CONV}).get(); - templates::Conv generator = profile->predict(device, dtype, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w); + templates::Conv generator = profile->predict(stream, device, dtype, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w); //Execute - std::string src = generator.dump(device, "fconv"); + std::string src = generator.dump(device, "conv"); driver::Module module(stream.context(), src); - return value_type(std::make_shared(generator), std::make_shared(module, "fconv")); + return value_type(std::make_shared(generator), std::make_shared(module, "conv")); }; static cpp::CachedMap cache(compile); - //Retrieve profile/kernel and execute value_type const & value = cache.get(key_type(stream, dtype, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w)); value.first->enqueue(*value.second, stream, alpha, I, F, beta, O); diff --git a/include/isaac/driver/buffer.h b/include/isaac/driver/buffer.h index 52a1aaeab..f4fcd9fcb 100644 --- a/include/isaac/driver/buffer.h +++ b/include/isaac/driver/buffer.h @@ -24,6 +24,7 @@ #define ISAAC_DRIVER_BUFFER_H #include "isaac/driver/handle.h" +#include "isaac/driver/context.h" namespace isaac { @@ -41,6 +42,7 @@ public: Handle const & cu() const; private: + Context context_; Handle cu_; size_t size_; }; diff --git a/include/isaac/driver/context.h b/include/isaac/driver/context.h index e9cc4ac84..328d655ca 100644 --- a/include/isaac/driver/context.h +++ b/include/isaac/driver/context.h @@ -52,6 +52,14 @@ private: std::string cache_path_; }; +class ContextSwitcher{ +public: + ContextSwitcher(Context const & ctx); + ~ContextSwitcher(); +private: + Context const & ctx_; +}; + } } diff --git a/include/isaac/driver/cublas.h b/include/isaac/driver/cublas.h index 35c706ab4..3c9672712 100644 --- a/include/isaac/driver/cublas.h +++ b/include/isaac/driver/cublas.h @@ -40,19 +40,20 @@ template void cublasGemm_impl(double, Args... args){ driver::d template -inline void cublasGemm_dispatch(Context const & ctx, Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, void* alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, void* beta, Buffer& C, int32_t ldc){ +inline void cublasGemm_dispatch(Stream& stream, char AT, char BT, int32_t M, int32_t N, int32_t K, void* alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, void* beta, Buffer& C, int32_t ldc){ auto cu_trans = [](char xt) { return (xt=='N')?CUBLAS_OP_N:CUBLAS_OP_T; }; - cublasHandle_t handle = dispatch::cublasHandle(ctx); - dispatch::cublasSetStream_v2(handle, (CUstream)queue); + cublasHandle_t handle = dispatch::cublasHandle(stream.context()); + dispatch::cublasSetStream_v2(handle, (CUstream)stream); CUdeviceptr cuA = A, cuB = B, cuC = C; cublasGemm_impl(cuType(), handle, cu_trans(AT), cu_trans(BT), M, N, K, (cuType*)alpha, (const cuType*)cuA, lda, (const cuType*)cuB, ldb, (cuType*)beta, (cuType*)cuC, ldc); } -inline void cublasGemm(DType dtype, Context const & ctx, Stream& queue, char AT, char BT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc){ +inline void cublasGemm(DType dtype, Stream& stream, char AT, char BT, int32_t M, int32_t N, int32_t K, scalar alpha, Buffer const & A, int32_t lda, Buffer const & B, int32_t ldb, scalar beta, Buffer& C, int32_t ldc){ + ContextSwitcher ctx_switch(stream.context()); switch(dtype){ - case HALF_TYPE: return cublasGemm_dispatch(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); - case FLOAT_TYPE: return cublasGemm_dispatch(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); - case DOUBLE_TYPE: return cublasGemm_dispatch(ctx, queue, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); + case HALF_TYPE: return cublasGemm_dispatch(stream, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); + case FLOAT_TYPE: return cublasGemm_dispatch(stream, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); + case DOUBLE_TYPE: return cublasGemm_dispatch(stream, AT, BT, M, N, K, alpha.data(), A, lda, B, ldb, beta.data(), C, ldc); default: throw; } } @@ -66,12 +67,19 @@ inline cudnnDataType_t cudnnDtype(DType dtype){ throw; } -inline void cudnnConv(DType dtype, Context const & ctx, Stream& queue, int32_t H, int32_t W, int32_t N, int32_t K, int32_t P, int32_t Q, int32_t C, int32_t R, int32_t S, +inline void cudnnConv(DType dtype, Stream& stream, int32_t H, int32_t W, int32_t N, int32_t K, int32_t P, int32_t Q, int32_t C, int32_t R, int32_t S, int32_t pad_h, int32_t pad_w, int32_t stride_h, int32_t stride_w, scalar alpha, Buffer const & I, Buffer const & F, scalar beta, Buffer const & O){ + driver::Context const & ctx = stream.context(); + +// ContextSwitcher switch_ctx(ctx); +// CUcontext cuctx; + dispatch::cuCtxSetCurrent(ctx); +// std::cout << cuctx << " " << CUcontext(ctx) << std::endl; + cudnnHandle_t handle = dispatch::cudnnHandle(ctx); cudnnDataType_t cutype = cudnnDtype(dtype); - dispatch::cudnnSetStream(handle, (CUstream)queue); + dispatch::cudnnSetStream(handle, (CUstream)stream); cudnnTensorDescriptor_t tO, tI; cudnnFilterDescriptor_t tF; cudnnConvolutionDescriptor_t conv; diff --git a/include/isaac/driver/dispatch.h b/include/isaac/driver/dispatch.h index 3a913abfc..a8bf79a50 100644 --- a/include/isaac/driver/dispatch.h +++ b/include/isaac/driver/dispatch.h @@ -27,11 +27,11 @@ #include //CUDA Backend -#include "isaac/driver/external/CUDA/cuda.h" -#include "isaac/driver/external/CUDA/nvrtc.h" -#include "isaac/driver/external/CUDA/cublas.h" -#include "isaac/driver/external/CUDA/cudnn.h" -#include "isaac/driver/external/CUDA/nvml.h" +#include "isaac/external/CUDA/cuda.h" +#include "isaac/external/CUDA/nvrtc.h" +#include "isaac/external/CUDA/cublas.h" +#include "isaac/external/CUDA/cudnn.h" +#include "isaac/external/CUDA/nvml.h" //Exceptions #include @@ -86,6 +86,8 @@ public: //CUDA static CUresult cuCtxGetCurrent(CUcontext *pctx); + static CUresult cuCtxSetCurrent(CUcontext ctx); + static CUresult cuCtxDestroy_v2(CUcontext ctx); static CUresult cuEventCreate(CUevent *phEvent, unsigned int Flags); static CUresult cuDeviceGet(CUdevice *device, int ordinal); @@ -164,6 +166,7 @@ private: //CUDA static void* cuCtxGetCurrent_; + static void* cuCtxSetCurrent_; static void* cuCtxDestroy_v2_; static void* cuEventCreate_; static void* cuDeviceGet_; diff --git a/include/isaac/driver/external/CUDA/builtin_types.h b/include/isaac/external/CUDA/builtin_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/builtin_types.h rename to include/isaac/external/CUDA/builtin_types.h diff --git a/include/isaac/driver/external/CUDA/channel_descriptor.h b/include/isaac/external/CUDA/channel_descriptor.h similarity index 100% rename from include/isaac/driver/external/CUDA/channel_descriptor.h rename to include/isaac/external/CUDA/channel_descriptor.h diff --git a/include/isaac/driver/external/CUDA/cuComplex.h b/include/isaac/external/CUDA/cuComplex.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuComplex.h rename to include/isaac/external/CUDA/cuComplex.h diff --git a/include/isaac/driver/external/CUDA/cublas.h b/include/isaac/external/CUDA/cublas.h similarity index 100% rename from include/isaac/driver/external/CUDA/cublas.h rename to include/isaac/external/CUDA/cublas.h diff --git a/include/isaac/driver/external/CUDA/cublas_api.h b/include/isaac/external/CUDA/cublas_api.h similarity index 100% rename from include/isaac/driver/external/CUDA/cublas_api.h rename to include/isaac/external/CUDA/cublas_api.h diff --git a/include/isaac/driver/external/CUDA/cublas_v2.h b/include/isaac/external/CUDA/cublas_v2.h similarity index 100% rename from include/isaac/driver/external/CUDA/cublas_v2.h rename to include/isaac/external/CUDA/cublas_v2.h diff --git a/include/isaac/driver/external/CUDA/cuda.h b/include/isaac/external/CUDA/cuda.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuda.h rename to include/isaac/external/CUDA/cuda.h diff --git a/include/isaac/driver/external/CUDA/cuda_device_runtime_api.h b/include/isaac/external/CUDA/cuda_device_runtime_api.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuda_device_runtime_api.h rename to include/isaac/external/CUDA/cuda_device_runtime_api.h diff --git a/include/isaac/driver/external/CUDA/cuda_fp16.h b/include/isaac/external/CUDA/cuda_fp16.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuda_fp16.h rename to include/isaac/external/CUDA/cuda_fp16.h diff --git a/include/isaac/driver/external/CUDA/cuda_runtime.h b/include/isaac/external/CUDA/cuda_runtime.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuda_runtime.h rename to include/isaac/external/CUDA/cuda_runtime.h diff --git a/include/isaac/driver/external/CUDA/cuda_runtime_api.h b/include/isaac/external/CUDA/cuda_runtime_api.h similarity index 100% rename from include/isaac/driver/external/CUDA/cuda_runtime_api.h rename to include/isaac/external/CUDA/cuda_runtime_api.h diff --git a/include/isaac/driver/external/CUDA/cudnn.h b/include/isaac/external/CUDA/cudnn.h similarity index 100% rename from include/isaac/driver/external/CUDA/cudnn.h rename to include/isaac/external/CUDA/cudnn.h diff --git a/include/isaac/driver/external/CUDA/device_types.h b/include/isaac/external/CUDA/device_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/device_types.h rename to include/isaac/external/CUDA/device_types.h diff --git a/include/isaac/driver/external/CUDA/driver_functions.h b/include/isaac/external/CUDA/driver_functions.h similarity index 100% rename from include/isaac/driver/external/CUDA/driver_functions.h rename to include/isaac/external/CUDA/driver_functions.h diff --git a/include/isaac/driver/external/CUDA/driver_types.h b/include/isaac/external/CUDA/driver_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/driver_types.h rename to include/isaac/external/CUDA/driver_types.h diff --git a/include/isaac/driver/external/CUDA/host_config.h b/include/isaac/external/CUDA/host_config.h similarity index 100% rename from include/isaac/driver/external/CUDA/host_config.h rename to include/isaac/external/CUDA/host_config.h diff --git a/include/isaac/driver/external/CUDA/host_defines.h b/include/isaac/external/CUDA/host_defines.h similarity index 100% rename from include/isaac/driver/external/CUDA/host_defines.h rename to include/isaac/external/CUDA/host_defines.h diff --git a/include/isaac/driver/external/CUDA/nvml.h b/include/isaac/external/CUDA/nvml.h similarity index 100% rename from include/isaac/driver/external/CUDA/nvml.h rename to include/isaac/external/CUDA/nvml.h diff --git a/include/isaac/driver/external/CUDA/nvrtc.h b/include/isaac/external/CUDA/nvrtc.h similarity index 100% rename from include/isaac/driver/external/CUDA/nvrtc.h rename to include/isaac/external/CUDA/nvrtc.h diff --git a/include/isaac/driver/external/CUDA/surface_types.h b/include/isaac/external/CUDA/surface_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/surface_types.h rename to include/isaac/external/CUDA/surface_types.h diff --git a/include/isaac/driver/external/CUDA/texture_types.h b/include/isaac/external/CUDA/texture_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/texture_types.h rename to include/isaac/external/CUDA/texture_types.h diff --git a/include/isaac/driver/external/CUDA/vector_functions.h b/include/isaac/external/CUDA/vector_functions.h similarity index 100% rename from include/isaac/driver/external/CUDA/vector_functions.h rename to include/isaac/external/CUDA/vector_functions.h diff --git a/include/isaac/driver/external/CUDA/vector_functions.hpp b/include/isaac/external/CUDA/vector_functions.hpp similarity index 100% rename from include/isaac/driver/external/CUDA/vector_functions.hpp rename to include/isaac/external/CUDA/vector_functions.hpp diff --git a/include/isaac/driver/external/CUDA/vector_types.h b/include/isaac/external/CUDA/vector_types.h similarity index 100% rename from include/isaac/driver/external/CUDA/vector_types.h rename to include/isaac/external/CUDA/vector_types.h diff --git a/include/isaac/runtime/predict.h b/include/isaac/runtime/predict.h index 4f0b370a9..a4956e4c7 100644 --- a/include/isaac/runtime/predict.h +++ b/include/isaac/runtime/predict.h @@ -105,14 +105,14 @@ private: class ConvProfile: public Profile{ public: ConvProfile(u_char* data); - templates::Conv predict(driver::Device const & device, DType dtype, param_t C, param_t H, param_t W, param_t N, param_t K, param_t P, param_t Q, param_t R, param_t S, + templates::Conv predict(driver::Stream& stream, driver::Device const & device, DType dtype, param_t C, param_t H, param_t W, param_t N, param_t K, param_t P, param_t Q, param_t R, param_t S, param_t pad_h, param_t pad_w, param_t stride_h, param_t stride_w); }; class GEMMProfile: public Profile{ public: GEMMProfile(u_char* data); - templates::GEMM predict(driver::Device const & device, DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, + templates::GEMM predict(driver::Stream& stream, driver::Device const & device, DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc); }; diff --git a/lib/driver/buffer.cpp b/lib/driver/buffer.cpp index 7bda014e2..e8c6fd6b1 100644 --- a/lib/driver/buffer.cpp +++ b/lib/driver/buffer.cpp @@ -34,11 +34,17 @@ namespace driver { -Buffer::Buffer(Context const & /*context*/, size_t size) : size_(size) -{ dispatch::cuMemAlloc(&*cu_, size); } +Buffer::Buffer(Context const & context, size_t size) : context_(context), size_(size) +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuMemAlloc(&*cu_, size); +} void Buffer::set_zero(Stream const & queue) -{ dispatch::cuMemsetD8Async(*cu_, 0, size_, queue); } +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuMemsetD8Async(*cu_, 0, size_, queue); +} Handle const & Buffer::cu() const { return cu_; } diff --git a/lib/driver/context.cpp b/lib/driver/context.cpp index 120042255..3fa8ccae7 100644 --- a/lib/driver/context.cpp +++ b/lib/driver/context.cpp @@ -21,6 +21,7 @@ */ #include +#include #include "isaac/driver/context.h" #include "isaac/driver/module.h" @@ -63,7 +64,10 @@ Context::Context(CUcontext context, bool take_ownership): cu_(context, take_owne { } Context::Context(Device const & device): device_(device), cache_path_(get_cache_path()) -{ dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device); } +{ + dispatch::cuCtxCreate(&*cu_, CU_CTX_SCHED_AUTO, (CUdevice)device); + dispatch::cuCtxPopCurrent_v2(NULL); +} Device const & Context::device() const { return device_; } @@ -74,5 +78,18 @@ std::string const & Context::cache_path() const Handle const & Context::cu() const { return cu_; } +/* Context Switcher */ +ContextSwitcher::ContextSwitcher(Context const & ctx): ctx_(ctx) +{ dispatch::cuCtxPushCurrent_v2(ctx_); } + +ContextSwitcher::~ContextSwitcher() +{ + CUcontext tmp; + dispatch::cuCtxPopCurrent_v2(&tmp); + assert(tmp==(CUcontext)ctx_ && "Switching back to invalid context!"); +} + + + } } diff --git a/lib/driver/dispatch.cpp b/lib/driver/dispatch.cpp index fd0306011..062b5e3be 100644 --- a/lib/driver/dispatch.cpp +++ b/lib/driver/dispatch.cpp @@ -176,6 +176,7 @@ CUDA_DEFINE2(CUresult, cuMemAlloc_v2, CUdeviceptr*, size_t) CUDA_DEFINE3(CUresult, cuPointerGetAttribute, void*, CUpointer_attribute, CUdeviceptr) CUDA_DEFINE1(CUresult, cuCtxGetDevice, CUdevice*) CUDA_DEFINE1(CUresult, cuCtxGetCurrent, CUcontext*) +CUDA_DEFINE1(CUresult, cuCtxSetCurrent, CUcontext) CUDA_DEFINE4(CUresult, cuMemsetD8Async, CUdeviceptr, unsigned char, size_t, CUstream) CUDA_DEFINE1(CUresult, cuCtxPushCurrent_v2, CUcontext) CUDA_DEFINE1(CUresult, cuCtxPopCurrent_v2, CUcontext*) @@ -260,6 +261,7 @@ void* dispatch::cudnn_; //CUDA void* dispatch::cuCtxGetCurrent_; +void* dispatch::cuCtxSetCurrent_; void* dispatch::cuCtxDestroy_v2_; void* dispatch::cuEventCreate_; void* dispatch::cuDeviceGet_; diff --git a/lib/driver/module.cpp b/lib/driver/module.cpp index feedee939..6335d4d70 100644 --- a/lib/driver/module.cpp +++ b/lib/driver/module.cpp @@ -48,16 +48,19 @@ CUjit_target_enum cutarget(Device::Architecture arch){ } Module::Module(Context const & context, std::string const & source, bool is_ir) : context_(context), source_(source){ + ContextSwitcher ctx_switch(context_); + //PTX passed directly if(is_ir){ - CUjit_option opt[] = {CU_JIT_TARGET, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; + CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER}; unsigned int errbufsize = 8096; std::string errbuf(errbufsize, 0); - CUjit_target_enum target = cutarget(context_.device().architecture()); - void* optval[] = {reinterpret_cast(target), reinterpret_cast(errbufsize), (void*)errbuf.data()}; + //CUjit_target_enum target = cutarget(context.device().architecture()); + void* optval[] = {(void*)(uintptr_t)errbufsize, (void*)errbuf.data()}; try{ - dispatch::cuModuleLoadDataEx(&*cu_, source.data(), 3, opt, optval); + dispatch::cuModuleLoadDataEx(&*cu_, source.data(), 2, opt, optval); }catch(exception::cuda::base const &){ + std::cerr << "Compilation Failed! Log: " << std::endl; std::cerr << errbuf << std::endl; throw; } diff --git a/lib/driver/stream.cpp b/lib/driver/stream.cpp index 43d403c49..b8d5b5a23 100644 --- a/lib/driver/stream.cpp +++ b/lib/driver/stream.cpp @@ -42,15 +42,22 @@ Stream::Stream(CUstream stream, bool take_ownership): cu_(stream, take_ownership {} Stream::Stream(Context const & context): context_(context), cu_(CUstream(), true) -{ dispatch::cuStreamCreate(&*cu_, 0); } +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuStreamCreate(&*cu_, 0); +} void Stream::synchronize() -{ dispatch::cuStreamSynchronize(*cu_); } +{ + ContextSwitcher ctx_switch(context_); + dispatch::cuStreamSynchronize(*cu_); +} Context const & Stream::context() const { return context_; } void Stream::enqueue(Kernel const & kernel, std::array grid, std::array block, std::vector const *, Event* event){ + ContextSwitcher ctx_switch(context_); if(event) dispatch::cuEventRecord(((cu_event_t)*event).first, *cu_); dispatch::cuLaunchKernel(kernel, grid[0], grid[1], grid[2], block[0], block[1], block[2], 0, *cu_,(void**)kernel.cu_params(), NULL); diff --git a/lib/runtime/predict.cpp b/lib/runtime/predict.cpp index 7b05c8217..5e7790ec9 100644 --- a/lib/runtime/predict.cpp +++ b/lib/runtime/predict.cpp @@ -172,16 +172,14 @@ std::vector Profile::predict(driver::Device const & device, std::vector ConvProfile::ConvProfile(u_char* data): Profile(data, 8){} -templates::Conv ConvProfile::predict(driver::Device const & device, DType dtype, param_t C, param_t H, param_t W, param_t N, param_t K, param_t P, param_t Q, param_t R, param_t S, +templates::Conv ConvProfile::predict(driver::Stream& stream, driver::Device const & device, DType dtype, param_t C, param_t H, param_t W, param_t N, param_t K, param_t P, param_t Q, param_t R, param_t S, param_t pad_h, param_t pad_w, param_t stride_h, param_t stride_w) { std::vector shapes{dtype, N, K, P, Q, C, R, S}; - driver::Context ctx(device); - driver::Stream stream(ctx); - driver::Buffer O(ctx, N*K*P*Q*size_of(dtype)); - driver::Buffer I(ctx, C*H*W*N*size_of(dtype)); - driver::Buffer F(ctx, C*K*R*S*size_of(dtype)); + driver::Buffer O(stream.context(), N*K*P*Q*size_of(dtype)); + driver::Buffer I(stream.context(), C*H*W*N*size_of(dtype)); + driver::Buffer F(stream.context(), C*K*R*S*size_of(dtype)); scalar alpha(1., dtype); scalar beta(0., dtype); std::function const&)> benchmark = [&](std::vector const& x){ @@ -201,16 +199,14 @@ templates::Conv ConvProfile::predict(driver::Device const & device, DType dtype, GEMMProfile::GEMMProfile(u_char* data): Profile(data, 6){} -templates::GEMM GEMMProfile::predict(driver::Device const & device, DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, +templates::GEMM GEMMProfile::predict(driver::Stream& stream, driver::Device const & device, DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K, param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc) { std::vector shapes{dtype, AT, BT, M, N, K}; - driver::Context ctx(device); - driver::Stream stream(ctx); - driver::Buffer C(ctx, M*N*size_of(dtype)); - driver::Buffer A(ctx, M*K*size_of(dtype)); - driver::Buffer B(ctx, K*N*size_of(dtype)); + driver::Buffer C(stream.context(), M*N*size_of(dtype)); + driver::Buffer A(stream.context(), M*K*size_of(dtype)); + driver::Buffer B(stream.context(), K*N*size_of(dtype)); scalar alpha(1., dtype); scalar beta(0., dtype); std::function const&)> benchmark = [&](std::vector const& x) diff --git a/lib/templates/conv.cpp b/lib/templates/conv.cpp index 5672b9f4e..c5c421ee5 100644 --- a/lib/templates/conv.cpp +++ b/lib/templates/conv.cpp @@ -525,7 +525,7 @@ std::string Conv::dump(drv::Device const & device, std::string const & name){ iss << std::endl; iss << " // Thread ID" << std::endl; iss << " mov.u32 %idpqn, %tid.x;" << std::endl; - iss << format(" mov.u32 %idk, %tid.y;") << std::endl; + iss << format(" mov.u32 %idk, %tid.y;") << std::endl; iss << " mov.u32 %idc, %tid.z;" << std::endl; iss << format(" mad.lo.u32 %idkpqn, %idk, {}, %idpqn;", bpqn) << std::endl; iss << format(" div.u32 %idpq, %idpqn, {};", bn_) << std::endl; @@ -841,6 +841,7 @@ std::string Conv::dump(drv::Device const & device, std::string const & name){ inc_k += step_k; } iss << "}" << std::endl; +// std::cout << iss.str() << std::endl; return iss.str(); } diff --git a/tests/conv.cpp b/tests/conv.cpp index 5d56d7a4a..95e2f24a4 100644 --- a/tests/conv.cpp +++ b/tests/conv.cpp @@ -88,7 +88,7 @@ void do_test_impl(sc::driver::Context const & ctx, size_t N, size_t K, size_t H, stream.write(O, true, 0, iO.size()*dtsize, iO.data()); stream.write(I, true, 0, iI.size()*dtsize, iI_cudnn.data()); stream.write(F, true, 0, iF.size()*dtsize, iF_cudnn.data()); - sc::driver::cudnnConv(dtype, ctx, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); + sc::driver::cudnnConv(dtype, stream, H, W, N, K, P, Q, C, R, S, pad_h, pad_w, stride_h, stride_w, alpha, I, F, beta, O); std::vector rO_cudnn(iO.size()); std::vector rO(iO.size()); stream.read(O, true, 0, rO_cudnn.size()*dtsize, (void*)rO_cudnn.data()); diff --git a/tests/gemm.cpp b/tests/gemm.cpp index 4f65b6d0d..19bcd92b6 100644 --- a/tests/gemm.cpp +++ b/tests/gemm.cpp @@ -52,25 +52,25 @@ void do_test(sc::driver::Context const & ctx, sc::IsaacOperation_t AT, sc::Isaac for(size_t i = 0; i < iA.size(); ++i) iA[i] = (float)rand()/RAND_MAX; for(size_t i = 0; i < iB.size(); ++i) iB[i] = (float)rand()/RAND_MAX; - drv::Stream queue(ctx); - queue.write(C, true, 0, M*N*dtsize, iC.data()); - queue.write(A, true, 0, M*K*dtsize, iA.data()); - queue.write(B, true, 0, K*N*dtsize, iB.data()); + drv::Stream stream(ctx); + stream.write(C, true, 0, M*N*dtsize, iC.data()); + stream.write(A, true, 0, M*K*dtsize, iA.data()); + stream.write(B, true, 0, K*N*dtsize, iB.data()); //Ground result (cuBLAS) char cuAT = (AT==sc::ISAAC_OP_T)?'T':'N'; char cuBT = (BT==sc::ISAAC_OP_T)?'T':'N'; - sc::driver::cublasGemm(dtype, ctx, queue, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); + sc::driver::cublasGemm(dtype, stream, cuAT, cuBT, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); std::vector rC(M*N); - queue.read(C, true, 0, M*N*dtsize, (void*)rC.data()); - queue.write(C, true, 0, M*N*dtsize, iC.data()); + stream.read(C, true, 0, M*N*dtsize, (void*)rC.data()); + stream.write(C, true, 0, M*N*dtsize, iC.data()); //ISAAC result std::vector hC(M*N); //Test selected profile - sc::GEMM(ctx.device(), queue, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, alpha, A, B, beta, C); - queue.read(C, true, 0, M*N*dtsize, (void*)hC.data()); + sc::GEMM(ctx.device(), stream, dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc, alpha, A, B, beta, C); + stream.read(C, true, 0, M*N*dtsize, (void*)hC.data()); if(!is_correct(hC, rC, max_rounding_error(DTYPE(K)))) exit(EXIT_FAILURE); @@ -93,11 +93,11 @@ void do_test(sc::driver::Context const & ctx, sc::IsaacOperation_t AT, sc::Isaac drv::Kernel kernel(program, "gemm"); //Launch - gemm.enqueue(kernel, queue, alpha, A, B, beta, C); - queue.synchronize(); + gemm.enqueue(kernel, stream, alpha, A, B, beta, C); + stream.synchronize(); //Test - queue.read(C, true, 0, M*N*dtsize, (void*)hC.data()); + stream.read(C, true, 0, M*N*dtsize, (void*)hC.data()); size_t depth = x[11]*x[12]*x[13]; double eps = max_rounding_error(DTYPE(K/depth))*depth; if(!is_correct(hC, rC, eps))