Database/GEMM: updated profile

This commit is contained in:
Philippe Tillet
2017-09-03 20:10:26 -04:00
parent 6902e19a7f
commit 5e99bbe09d
8 changed files with 1830 additions and 7073 deletions

View File

@@ -38,6 +38,7 @@ class Buffer: public HandleInterface<Buffer, CUdeviceptr>
 {
 public:
 Buffer(Context const & context, size_t size);
+void set_zero(Stream const & queue, size_t size);
 void set_zero(Stream const & queue);
 Handle<CUdeviceptr> const & cu() const;

View File

@@ -40,12 +40,15 @@ Buffer::Buffer(Context const & context, size_t size) : context_(context), size_(
 dispatch::cuMemAlloc(&*cu_, size);
 }
-void Buffer::set_zero(Stream const & queue)
+void Buffer::set_zero(Stream const & queue, size_t size)
 {
 ContextSwitcher ctx_switch(context_);
-dispatch::cuMemsetD8Async(*cu_, 0, size_, queue);
+dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
 }
+void Buffer::set_zero(Stream const & queue)
+{ set_zero(queue, size_); }
 Handle<CUdeviceptr> const & Buffer::cu() const
 { return cu_; }

File diff suppressed because it is too large Load Diff

View File

@@ -100,9 +100,9 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
 DType dtype = (DType)(x[0]);
 IsaacOperation_t AT = (IsaacOperation_t)x[1];
 IsaacOperation_t BT = (IsaacOperation_t)x[2];
-param_t vec = x[6], bm = x[7], kl = x[8], bn = x[9],
+param_t M = x[3], N = x[4], vec = x[6], bm = x[7], kl = x[8], bn = x[9],
 ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16],
-rs = x[17], br = x[18];
+rs = x[17], br = x[18], gridr = x[19];
//Features
param_t dtsize = size_of(dtype);
param_t dtvec = (dtype==HALF_TYPE)?2:1;
@@ -111,6 +111,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
 param_t rl = rs*br;
 param_t ml = bm*ms;
 param_t nl = bn*ns;
+param_t gridM = ceil(M, ml), gridN = ceil(N, nl);
 param_t nthreads = bm*bn*br;
 param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec)));
 param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec)));
@@ -140,7 +141,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
 && ns % (dtvec*vec) == 0
 && kl % ks == 0
 && size_shmem <= device.max_shared_memory()
+&& (gridr == 1 || gridM*gridN < 64*64)
 && n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop
 && bm <= device.max_block_dim()[0]
 && bn <= device.max_block_dim()[1]
@@ -208,7 +209,7 @@ std::string GEMM::dump(drv::Device const & device, std::string const & name){
 if(vec_==1)
 vs[0] = "";
 //Load-Store alignments
-io_conf Cio(ldc_, vec_, dtvec, dtsize, kg_>1?false:true);
+io_conf Cio(ldc_, vec_, dtvec, dtsize, false);
 io_conf Aio(lda_, vec_, dtvec, dtsize, false);
 io_conf Bio(ldb_, vec_, dtvec, dtsize, false);
@@ -889,10 +890,10 @@ void GEMM::enqueue(driver::Kernel &gemm, driver::Stream &queue, const scalar& al
 gemm.setArg(14, bound);
 gemm.setArg(15, locks);
-// std::cout << gridM << " " << gridN << " " << kg_ << std::endl;
+// std::cout << gridM << " " << gridN << " " << std::endl;
 //Launch
 if(kg_ > 1)
-locks.set_zero(queue);
+locks.set_zero(queue, gridM*gridN*4);
 queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_});
 }

View File

@@ -29,8 +29,8 @@ def recursive_glob(rootdir='.', suffix=''):
 def main():
 #Source
-include = [os.path.join('src', 'include')]
-src = recursive_glob(os.path.join('src','lib'), 'cpp')
+include = [os.path.join(os.pardir, 'include')]
+src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp')
 #Bindings
 include += [os.path.join('src', 'bind')]

View File

@@ -24,6 +24,7 @@
 #include <pybind11/stl.h>
 #include "isaac/driver/backend.h"
 #include "isaac/driver/buffer.h"
+#include "isaac/driver/error.h"
 #include "isaac/driver/cublas.h"
 #include "isaac/driver/stream.h"
 #include "isaac/driver/context.h"
@@ -88,4 +89,8 @@ void export_driver(py::module&& m)
 .def(py::init<drv::Context>())
 .def("synchronize", &drv::Stream::synchronize);
+py::register_exception<drv::exception::cuda::misaligned_address>(m, "MisalignedAddress");
 }

View File

@@ -34,7 +34,7 @@ def benchmarks(prefix, OpType, device, nsamples):
 op = OpType(params)
 try:
 y = op.benchmark(ctx, stream)
-except RuntimeError:
+except:
 continue
 #Update
 bufX[nvalid % step, :] = params

View File

@@ -50,6 +50,7 @@ def maximize(OpType, device, model, shapes, V):
 X[:, OpType.nshape_params:] = V
 X = OpType.get_valid(device, X)
 #Model predictions
+with tf.device('/cpu:0'):
 predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
 pred_perfs = np.sort(predictions, axis=0)[::-1]
 pred_idxs = np.argsort(predictions, axis=0)[::-1]