/* Copyright 2015-2017 Philippe Tillet * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files * (the "Software"), to deal in the Software without restriction, * including without limitation the rights to use, copy, modify, merge, * publish, distribute, sublicense, and/or sell copies of the Software, * and to permit persons to whom the Software is furnished to do so, * subject to the following conditions: * * The above copyright notice and this permission notice shall be * included in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/

// NOTE(review): in this copy of the file the first six #include directives
// have no target, and several template argument lists appear to be missing
// (e.g. on matrix, std::vector, std::shared_ptr, dynamic_cast, std::max and
// std::function). Confirm against the version that is actually built.
#include
#include
#include
#include
#include
#include
#include "isaac/runtime/predict.h"
#include "isaac/templates/conv.h"
#include "isaac/tools/bench.hpp"
#include "isaac/driver/module.h"
#include "isaac/driver/kernel.h"
#include "isaac/driver/stream.h"
#include "isaac/driver/buffer.h"

namespace isaac{
namespace runtime{

// Layers

// Decodes one layer from a binary stream.
//
// Reads a 4-byte type code from `current` through read_inc (which presumably
// advances `current`; verify against its definition). When the code equals
// Activation::BINARY_CODE a second 4-byte code selects the activation, and
// only ReLU::BINARY_CODE is accepted. When the code equals Dense::BINARY_CODE
// the layer is built by Dense's constructor from the same cursor.
//
// Returns a layer allocated with `new`; the caller takes ownership.
//
// NOTE(review): both failure paths use a bare `throw;`. When no exception is
// currently being handled, that form does not raise a catchable error but ends
// in std::terminate, so an unknown code in the input aborts the process.
Layer* Layer::read(u_char*& current){
  uint32_t type;
  read_inc((void*)&type, current, 4);
  if(type==Activation::BINARY_CODE){
    // Activations carry a second code that identifies the concrete function.
    read_inc((void*)&type, current, 4);
    if(type==ReLU::BINARY_CODE)
      return new ReLU();
    throw;
  }
  else if(type==Dense::BINARY_CODE)
    return new Dense(current);
  throw;
}

// Activation

// An activation produces as many outputs as it receives: returns
// n_outs_prev unchanged.
size_t Activation::n_outs(size_t n_outs_prev)
{ return n_outs_prev; }

// Relu

// Writes max(X(i,j), 0) into Y(i,j) for every index within X.shapes().
// Y is indexed over X's extents, so it is assumed to be at least as large as
// X; nothing here checks that.
void ReLU::forward(matrix const & X, matrix & Y){
  for(size_t i = 0; i < X.shapes()[0]; ++i)
    for(size_t j = 0; j < X.shapes()[1]; ++j)
      Y(i, j) = std::max(X(i,j), 0);
}

// Dense

// Builds a dense layer from a binary stream: W_ is constructed from `data`,
// then b_ is resized to W_.shapes()[1] and filled with b_.size()*4 bytes read
// through read_inc.
// NOTE(review): the factor 4 assumes 4-byte elements in b_ - confirm against
// the member's declaration.
Dense::Dense(u_char*& data) : W_(data){
  b_.resize(W_.shapes()[1]);
  read_inc((void*)b_.data(), data, b_.size()*4);
}

// Output count is W_.shapes()[1]; the previous layer's count is ignored.
size_t Dense::n_outs(size_t)
{ return W_.shapes()[1]; }

// Forwards X through the layer with a single gemm call. The extents passed
// are Y.shapes()[0], Y.shapes()[1] and X.shapes()[1], followed by X, W_ and Y
// with their ld() values, and finally b_.data().
// NOTE(review): presumably this computes Y = X * W_ + b_; the meaning of the
// two literal 1 arguments and of the trailing bias pointer is defined by gemm,
// which is not visible here.
void Dense::forward(matrix const & X, matrix & Y){
  gemm(Y.shapes()[0], Y.shapes()[1], X.shapes()[1],
       1, X.data(), X.ld(), W_.data(), W_.ld(),
       1, Y.data(), Y.ld(), b_.data());
}

// Network

// Deserializes a network: a 4-byte layer count followed by that many layers,
// each decoded by Layer::read and stored in layers_ behind a shared pointer.
// `data` is taken by value, so the caller's pointer is left untouched.
Network::Network(u_char* data){
  uint32_t nlayers;
  read_inc((void*)&nlayers, data, 4);
  for(size_t i = 0; i < nlayers; ++i)
    layers_.push_back(std::shared_ptr(Layer::read(data)));
}

// Runs X through every layer in order and leaves the result in Y.
//
// X supplies N = X.shapes()[0] rows and M = X.shapes()[1] columns. The first
// layer reads directly from X.data() and the last layer writes directly to
// Y.data(); everything in between goes through a local scratch buffer.
// With zero layers the loop never runs and Y is left unwritten.
void Network::predict(matrix const & X, matrix & Y){
  uint32_t N = X.shapes()[0], M = X.shapes()[1];
  size_t nlayers = layers_.size();
  // n_outs[i] is the width entering layer i; n_outs[nlayers] is the final width.
  std::vector n_outs(nlayers+1, M);
  for(size_t i = 0; i < nlayers; ++i){
    n_outs[i+1] = layers_[i]->n_outs(n_outs[i]);
  }
  //Pre-allocate a big buffer to stay in cache memory
  // Two halves of N*nhid_max elements each, where nhid_max is the widest
  // layer interface (the input width M included).
  size_t nhid_max = *std::max_element(n_outs.begin(), n_outs.end());
  std::vector scratch(2*N*nhid_max);
  // off[i] is the scratch offset holding the input of layer i.
  std::vector off(nlayers+1, 0);
  //Predict
  for(size_t i = 0; i < nlayers; ++i){
    // A non-null cast result marks a Dense layer.
    bool is_dense = dynamic_cast(layers_[i].get());
    // By default a layer writes where it reads (same offset); a Dense layer
    // writes into the other half of the scratch buffer instead.
    off[i+1] = off[i];
    if(is_dense)
      off[i+1] = (off[i] + scratch.size()/2) % scratch.size();
    // NOTE(review): the second constructor argument looks like a leading
    // dimension. The first input uses n_outs[0] and the last output uses
    // n_outs[nlayers] rather than X.ld() / Y.ld(), which assumes X and Y are
    // densely packed - confirm.
    matrix I({N, n_outs[i]}, (i==0)?n_outs[i]:nhid_max, (i==0)?X.data():(scratch.data() + off[i]));
    matrix O({N, n_outs[i+1]}, (i==nlayers-1)?n_outs[i+1]:nhid_max, (i==nlayers-1)?Y.data():(scratch.data() + off[i+1]));
    layers_[i]->forward(I, O);
  }
}

// Profile

// Builds a profile from a binary blob: kernels_ is a matrix constructed from
// `data` and passed through pad_left with `nshapes`, and predictor_ is
// constructed from `data` as well.
// NOTE(review): predictor_ can only start after the kernel table if the matrix
// constructor advances `data` and kernels_ is initialized before predictor_
// (members are initialized in declaration order) - confirm in the header.
Profile::Profile(u_char* data, size_t nshapes): kernels_(pad_left(matrix(data), nshapes)), predictor_(data)
{}

// Read-only access to the kernel table.
matrix const & Profile::kernels() const
{ return kernels_; }

// Selects a parameter vector for the problem described by `shapes`.
//
// device    - forwarded to `validator`.
// shapes    - problem descriptor; copied into the first shapes.size() columns
//             of every row of kernels_ (this modifies the member).
// validator - called as validator(device, nkernels, kernels_.data(),
//             valid.data()) to fill one validity flag per row.
// benchmark - called with a candidate parameter vector; its return values
//             are compared and the smallest wins.
// nkeep     - number of top-ranked candidates to keep. The callers in this
//             file pass only four arguments, so a default must come from the
//             declaration.
//
// Returns a vector of kernels_.shapes()[1] entries taken from the chosen row.
//
// NOTE(review): when no row is valid (nvalid == 0) nothing is written to
// `best`, yet row 0 of it is still returned.
std::vector Profile::predict(driver::Device const & device, std::vector const & shapes, validator_t const & validator, benchmark_t const & benchmark, uint32_t nkeep)
{
  // Get valid profiles
  uint32_t nkernels = kernels_.shapes()[0];
  uint32_t nparams = kernels_.shapes()[1];
  for(size_t i = 0; i < nkernels; ++i)
    for(size_t j = 0; j < shapes.size(); ++j)
      kernels_(i, j) = shapes[j];
  std::vector valid(nkernels);
  validator(device, nkernels, kernels_.data(), valid.data());
  // Summing the flags counts the valid rows only if each flag is 0 or 1.
  uint32_t nvalid = std::accumulate(valid.begin(), valid.end(), 0);
  // Get valid indices
  std::vector map;
  map.reserve(nvalid);
  for(size_t i = 0; i < nkernels; ++i)
    if(valid[i])
      map.push_back(i);
  // Predictor input
  // Every column of each valid row (shape columns included) is fed as log2.
  matrix X({nvalid, nparams});
  for(size_t i = 0; i < nvalid; ++i)
    for(size_t j = 0; j < nparams; ++j)
      X(i, j) = std::log2(kernels_(map[i], j));
  // Do prediction
  matrix Y({nvalid, 1});
  predictor_.predict(X, Y);
  // Sort prediction
  // Descending order: the largest predicted value comes first.
  std::vector idx(nvalid);
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(), [&Y](size_t i1, size_t i2) {return Y(i1,0) > Y(i2, 0);});
  // Return best
  // NOTE(review): j runs up to nparams - 1 while the column read is
  // shapes.size() + j, so for j >= nparams - shapes.size() the index lies
  // past the last column reported by kernels_.shapes()[1]. Confirm whether
  // the loop bound should exclude the shape columns.
  matrix best({nkeep, nparams});
  for(size_t i = 0; i < std::min(nvalid, nkeep); ++i)
    for(size_t j = 0; j < nparams; ++j)
      best(i, j) = kernels_(map[idx[i]], shapes.size() + j);
  //Re-Benchmark
  // With nkeep <= 1 the top-ranked candidate is returned without timing.
  size_t argmin = 0;
  std::vector x(nparams);
  if(nkeep > 1){
    std::vector time;
    for(size_t i = 0; i < std::min(nvalid, nkeep); ++i){
      for(size_t j = 0; j < nparams; ++j)
        x[j] = best(i,j);
      time.push_back(benchmark(x));
    }
    // With an empty `time` this evaluates to 0.
    argmin = std::min_element(time.begin(), time.end()) - time.begin();
  }
  for(size_t j = 0; j < nparams; ++j)
    x[j] = best(argmin,j);
  return x;
}

// Convolution profile. The 8 matches the number of entries placed in `shapes`
// by ConvProfile::predict (dtype, N, K, P, Q, C, R, S).
ConvProfile::ConvProfile(u_char* data): Profile(data, 8){}

// Picks tuning parameters for a convolution and returns the configured
// templates::Conv.
//
// Allocates three buffers O, I and F on stream.context(), sized N*K*P*Q,
// C*H*W*N and C*K*R*S elements of `dtype`, for use by the benchmark callback.
// The callback generates source named "conv" for a candidate, builds a module
// and kernel from it, and returns bench(...) around enqueue + synchronize.
// Fifteen parameters x[0]..x[14] are consumed from the selected vector.
templates::Conv ConvProfile::predict(driver::Stream& stream, driver::Device const & device, DType dtype, param_t C, param_t H, param_t W, param_t N, param_t K, param_t P, param_t Q, param_t R, param_t S,
                                     param_t pad_h, param_t pad_w, param_t stride_h, param_t stride_w)
{
  std::vector shapes{dtype, N, K, P, Q, C, R, S};
  driver::Buffer O(stream.context(), N*K*P*Q*size_of(dtype));
  driver::Buffer I(stream.context(), C*H*W*N*size_of(dtype));
  driver::Buffer F(stream.context(), C*K*R*S*size_of(dtype));
  scalar alpha(1., dtype);
  scalar beta(0., dtype);
  // Captures everything by reference; it is handed to Profile::predict below
  // and called from there while these locals are still alive.
  std::function const&)> benchmark = [&](std::vector const& x){
    templates::Conv generator(dtype, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w,
                              x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14]);
    std::string src = generator.dump(device, "conv");
    driver::Module module(stream.context(), src);
    driver::Kernel kernel(module, "conv");
    return bench([&](){ generator.enqueue(kernel, stream, alpha, I, F, beta, O); }, [&](){ stream.synchronize(); }, device);
  };
  std::vector x = Profile::predict(device, shapes, templates::Conv::check_valid, benchmark);
  return templates::Conv(dtype, C, H, W, N, K, P, Q, R, S, pad_h, pad_w, stride_h, stride_w,
                         x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14]);
}

// GEMM profile. The 6 matches the number of entries placed in `shapes` by
// GEMMProfile::predict (dtype, AT, BT, M, N, K).
GEMMProfile::GEMMProfile(u_char* data): Profile(data, 6){}

// Picks tuning parameters for a matrix product and returns the configured
// templates::GEMM.
//
// Allocates three buffers C, A and B on stream.context(), sized M*N, M*K and
// K*N elements of `dtype`, for use by the benchmark callback. The callback
// generates source named "gemm" for a candidate, builds a module and kernel
// from it, and returns bench(...) around enqueue + synchronize.
// Fourteen parameters x[0]..x[13] are consumed from the selected vector.
templates::GEMM GEMMProfile::predict(driver::Stream& stream, driver::Device const & device, DType dtype, IsaacOperation_t AT, IsaacOperation_t BT, param_t M, param_t N, param_t K,
                                     param_t offa, param_t lda, param_t offb, param_t ldb, param_t offc, param_t ldc)
{
  std::vector shapes{dtype, AT, BT, M, N, K};
  driver::Buffer C(stream.context(), M*N*size_of(dtype));
  driver::Buffer A(stream.context(), M*K*size_of(dtype));
  driver::Buffer B(stream.context(), K*N*size_of(dtype));
  scalar alpha(1., dtype);
  scalar beta(0., dtype);
  // Captures everything by reference; it is handed to Profile::predict below
  // and called from there while these locals are still alive.
  std::function const&)> benchmark = [&](std::vector const& x) {
    templates::GEMM generator(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc,
                              x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
    std::string src = generator.dump(device, "gemm");
    driver::Module module(stream.context(), src);
    driver::Kernel kernel(module, "gemm");
    return bench([&](){ generator.enqueue(kernel, stream, alpha, A, B, beta, C); }, [&](){ stream.synchronize(); }, device);
  };
  std::vector x = Profile::predict(device, shapes, templates::GEMM::check_valid, benchmark);
  return templates::GEMM(dtype, AT, BT, M, N, K, offa, lda, offb, ldb, offc, ldc,
                         x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13]);
}

}
}