Database/GEMM: updated profile

This commit is contained in:
Philippe Tillet
2017-09-03 20:10:26 -04:00
parent 6902e19a7f
commit 5e99bbe09d
8 changed files with 1830 additions and 7073 deletions

View File

@@ -38,6 +38,7 @@ class Buffer: public HandleInterface<Buffer, CUdeviceptr>
{ {
public: public:
Buffer(Context const & context, size_t size); Buffer(Context const & context, size_t size);
void set_zero(Stream const & queue, size_t size);
void set_zero(Stream const & queue); void set_zero(Stream const & queue);
Handle<CUdeviceptr> const & cu() const; Handle<CUdeviceptr> const & cu() const;

View File

@@ -40,12 +40,15 @@ Buffer::Buffer(Context const & context, size_t size) : context_(context), size_(
dispatch::cuMemAlloc(&*cu_, size); dispatch::cuMemAlloc(&*cu_, size);
} }
void Buffer::set_zero(Stream const & queue) void Buffer::set_zero(Stream const & queue, size_t size)
{ {
ContextSwitcher ctx_switch(context_); ContextSwitcher ctx_switch(context_);
dispatch::cuMemsetD8Async(*cu_, 0, size_, queue); dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
} }
void Buffer::set_zero(Stream const & queue)
{ set_zero(queue, size_); }
Handle<CUdeviceptr> const & Buffer::cu() const Handle<CUdeviceptr> const & Buffer::cu() const
{ return cu_; } { return cu_; }

File diff suppressed because it is too large Load Diff

View File

@@ -100,9 +100,9 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
DType dtype = (DType)(x[0]); DType dtype = (DType)(x[0]);
IsaacOperation_t AT = (IsaacOperation_t)x[1]; IsaacOperation_t AT = (IsaacOperation_t)x[1];
IsaacOperation_t BT = (IsaacOperation_t)x[2]; IsaacOperation_t BT = (IsaacOperation_t)x[2];
param_t vec = x[6], bm = x[7], kl = x[8], bn = x[9], param_t M = x[3], N = x[4], vec = x[6], bm = x[7], kl = x[8], bn = x[9],
ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16], ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16],
rs = x[17], br = x[18]; rs = x[17], br = x[18], gridr = x[19];
//Features //Features
param_t dtsize = size_of(dtype); param_t dtsize = size_of(dtype);
param_t dtvec = (dtype==HALF_TYPE)?2:1; param_t dtvec = (dtype==HALF_TYPE)?2:1;
@@ -111,6 +111,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
param_t rl = rs*br; param_t rl = rs*br;
param_t ml = bm*ms; param_t ml = bm*ms;
param_t nl = bn*ns; param_t nl = bn*ns;
param_t gridM = ceil(M, ml), gridN = ceil(N, nl);
param_t nthreads = bm*bn*br; param_t nthreads = bm*bn*br;
param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec))); param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec)));
param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec))); param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec)));
@@ -140,7 +141,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
&& ns % (dtvec*vec) == 0 && ns % (dtvec*vec) == 0
&& kl % ks == 0 && kl % ks == 0
&& size_shmem <= device.max_shared_memory() && size_shmem <= device.max_shared_memory()
&& (gridr == 1 || gridM*gridN < 64*64)
&& n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop && n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop
&& bm <= device.max_block_dim()[0] && bm <= device.max_block_dim()[0]
&& bn <= device.max_block_dim()[1] && bn <= device.max_block_dim()[1]
@@ -208,7 +209,7 @@ std::string GEMM::dump(drv::Device const & device, std::string const & name){
if(vec_==1) if(vec_==1)
vs[0] = ""; vs[0] = "";
//Load-Store alignments //Load-Store alignments
io_conf Cio(ldc_, vec_, dtvec, dtsize, kg_>1?false:true); io_conf Cio(ldc_, vec_, dtvec, dtsize, false);
io_conf Aio(lda_, vec_, dtvec, dtsize, false); io_conf Aio(lda_, vec_, dtvec, dtsize, false);
io_conf Bio(ldb_, vec_, dtvec, dtsize, false); io_conf Bio(ldb_, vec_, dtvec, dtsize, false);
@@ -889,10 +890,10 @@ void GEMM::enqueue(driver::Kernel &gemm, driver::Stream &queue, const scalar& al
gemm.setArg(14, bound); gemm.setArg(14, bound);
gemm.setArg(15, locks); gemm.setArg(15, locks);
// std::cout << gridM << " " << gridN << " " << kg_ << std::endl; // std::cout << gridM << " " << gridN << " " << std::endl;
//Launch //Launch
if(kg_ > 1) if(kg_ > 1)
locks.set_zero(queue); locks.set_zero(queue, gridM*gridN*4);
queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_}); queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_});
} }

View File

@@ -29,8 +29,8 @@ def recursive_glob(rootdir='.', suffix=''):
def main(): def main():
#Source #Source
include = [os.path.join('src', 'include')] include = [os.path.join(os.pardir, 'include')]
src = recursive_glob(os.path.join('src','lib'), 'cpp') src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp')
#Bindings #Bindings
include += [os.path.join('src', 'bind')] include += [os.path.join('src', 'bind')]

View File

@@ -24,6 +24,7 @@
#include <pybind11/stl.h> #include <pybind11/stl.h>
#include "isaac/driver/backend.h" #include "isaac/driver/backend.h"
#include "isaac/driver/buffer.h" #include "isaac/driver/buffer.h"
#include "isaac/driver/error.h"
#include "isaac/driver/cublas.h" #include "isaac/driver/cublas.h"
#include "isaac/driver/stream.h" #include "isaac/driver/stream.h"
#include "isaac/driver/context.h" #include "isaac/driver/context.h"
@@ -87,5 +88,9 @@ void export_driver(py::module&& m)
py::class_<drv::Stream>(m, "Stream") py::class_<drv::Stream>(m, "Stream")
.def(py::init<drv::Context>()) .def(py::init<drv::Context>())
.def("synchronize", &drv::Stream::synchronize); .def("synchronize", &drv::Stream::synchronize);
py::register_exception<drv::exception::cuda::misaligned_address>(m, "MisalignedAddress");
} }

View File

@@ -34,7 +34,7 @@ def benchmarks(prefix, OpType, device, nsamples):
op = OpType(params) op = OpType(params)
try: try:
y = op.benchmark(ctx, stream) y = op.benchmark(ctx, stream)
except RuntimeError: except:
continue continue
#Update #Update
bufX[nvalid % step, :] = params bufX[nvalid % step, :] = params

View File

@@ -50,7 +50,8 @@ def maximize(OpType, device, model, shapes, V):
X[:, OpType.nshape_params:] = V X[:, OpType.nshape_params:] = V
X = OpType.get_valid(device, X) X = OpType.get_valid(device, X)
#Model predictions #Model predictions
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0) with tf.device('/cpu:0'):
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
pred_perfs = np.sort(predictions, axis=0)[::-1] pred_perfs = np.sort(predictions, axis=0)[::-1]
pred_idxs = np.argsort(predictions, axis=0)[::-1] pred_idxs = np.argsort(predictions, axis=0)[::-1]
#Evaluate best predicted models #Evaluate best predicted models