Database/GEMM: updated profile
This commit is contained in:
@@ -38,6 +38,7 @@ class Buffer: public HandleInterface<Buffer, CUdeviceptr>
|
||||
{
|
||||
public:
|
||||
Buffer(Context const & context, size_t size);
|
||||
void set_zero(Stream const & queue, size_t size);
|
||||
void set_zero(Stream const & queue);
|
||||
Handle<CUdeviceptr> const & cu() const;
|
||||
|
||||
|
@@ -40,12 +40,15 @@ Buffer::Buffer(Context const & context, size_t size) : context_(context), size_(
|
||||
dispatch::cuMemAlloc(&*cu_, size);
|
||||
}
|
||||
|
||||
void Buffer::set_zero(Stream const & queue)
|
||||
// Sized overload of set_zero: issues cuMemsetD8Async with fill value 0 on
// this buffer's device pointer (*cu_), using `queue` as the stream argument.
//   queue - stream handed to the memset call.
//   size  - byte count passed to the memset call.
// A ContextSwitcher is constructed on context_ first and lives until the
// function returns (presumably making the buffer's context current -- confirm
// against ContextSwitcher's definition).
// NOTE(review): this span comes from a rendered diff, so the memset using
// `size_` and the memset using `size` both appear; the `size_` line looks like
// the pre-change line and the `size` line its replacement -- confirm against
// the actual file.
// NOTE(review): no check is visible here that `size` does not exceed `size_`;
// callers appear to be responsible for staying within the allocation.
void Buffer::set_zero(Stream const & queue, size_t size)
|
||||
{
|
||||
ContextSwitcher ctx_switch(context_);
|
||||
dispatch::cuMemsetD8Async(*cu_, 0, size_, queue);
|
||||
dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
|
||||
}
|
||||
|
||||
// Convenience overload: forwards to set_zero(queue, size_), i.e. the sized
// overload is called with the size_ member as the byte count.
void Buffer::set_zero(Stream const & queue)
|
||||
{ set_zero(queue, size_); }
|
||||
|
||||
// Accessor: returns a const reference to the cu_ member (the
// Handle<CUdeviceptr> held by this Buffer). The reference refers to a member,
// so it should not be used after the Buffer is destroyed.
Handle<CUdeviceptr> const & Buffer::cu() const
|
||||
{ return cu_; }
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
@@ -100,9 +100,9 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
||||
DType dtype = (DType)(x[0]);
|
||||
IsaacOperation_t AT = (IsaacOperation_t)x[1];
|
||||
IsaacOperation_t BT = (IsaacOperation_t)x[2];
|
||||
param_t vec = x[6], bm = x[7], kl = x[8], bn = x[9],
|
||||
param_t M = x[3], N = x[4], vec = x[6], bm = x[7], kl = x[8], bn = x[9],
|
||||
ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16],
|
||||
rs = x[17], br = x[18];
|
||||
rs = x[17], br = x[18], gridr = x[19];
|
||||
//Features
|
||||
param_t dtsize = size_of(dtype);
|
||||
param_t dtvec = (dtype==HALF_TYPE)?2:1;
|
||||
@@ -111,6 +111,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
||||
param_t rl = rs*br;
|
||||
param_t ml = bm*ms;
|
||||
param_t nl = bn*ns;
|
||||
param_t gridM = ceil(M, ml), gridN = ceil(N, nl);
|
||||
param_t nthreads = bm*bn*br;
|
||||
param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec)));
|
||||
param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec)));
|
||||
@@ -140,7 +141,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
||||
&& ns % (dtvec*vec) == 0
|
||||
&& kl % ks == 0
|
||||
&& size_shmem <= device.max_shared_memory()
|
||||
|
||||
&& (gridr == 1 || gridM*gridN < 64*64)
|
||||
&& n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop
|
||||
&& bm <= device.max_block_dim()[0]
|
||||
&& bn <= device.max_block_dim()[1]
|
||||
@@ -208,7 +209,7 @@ std::string GEMM::dump(drv::Device const & device, std::string const & name){
|
||||
if(vec_==1)
|
||||
vs[0] = "";
|
||||
//Load-Store alignments
|
||||
io_conf Cio(ldc_, vec_, dtvec, dtsize, kg_>1?false:true);
|
||||
io_conf Cio(ldc_, vec_, dtvec, dtsize, false);
|
||||
io_conf Aio(lda_, vec_, dtvec, dtsize, false);
|
||||
io_conf Bio(ldb_, vec_, dtvec, dtsize, false);
|
||||
|
||||
@@ -889,10 +890,10 @@ void GEMM::enqueue(driver::Kernel &gemm, driver::Stream &queue, const scalar& al
|
||||
gemm.setArg(14, bound);
|
||||
gemm.setArg(15, locks);
|
||||
|
||||
// std::cout << gridM << " " << gridN << " " << kg_ << std::endl;
|
||||
// std::cout << gridM << " " << gridN << " " << std::endl;
|
||||
//Launch
|
||||
if(kg_ > 1)
|
||||
locks.set_zero(queue);
|
||||
locks.set_zero(queue, gridM*gridN*4);
|
||||
queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_});
|
||||
}
|
||||
|
||||
|
@@ -29,8 +29,8 @@ def recursive_glob(rootdir='.', suffix=''):
|
||||
def main():
|
||||
|
||||
#Source
|
||||
include = [os.path.join('src', 'include')]
|
||||
src = recursive_glob(os.path.join('src','lib'), 'cpp')
|
||||
include = [os.path.join(os.pardir, 'include')]
|
||||
src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp')
|
||||
|
||||
#Bindings
|
||||
include += [os.path.join('src', 'bind')]
|
||||
|
@@ -24,6 +24,7 @@
|
||||
#include <pybind11/stl.h>
|
||||
#include "isaac/driver/backend.h"
|
||||
#include "isaac/driver/buffer.h"
|
||||
#include "isaac/driver/error.h"
|
||||
#include "isaac/driver/cublas.h"
|
||||
#include "isaac/driver/stream.h"
|
||||
#include "isaac/driver/context.h"
|
||||
@@ -88,4 +89,8 @@ void export_driver(py::module&& m)
|
||||
.def(py::init<drv::Context>())
|
||||
.def("synchronize", &drv::Stream::synchronize);
|
||||
|
||||
|
||||
py::register_exception<drv::exception::cuda::misaligned_address>(m, "MisalignedAddress");
|
||||
|
||||
|
||||
}
|
||||
|
@@ -34,7 +34,7 @@ def benchmarks(prefix, OpType, device, nsamples):
|
||||
op = OpType(params)
|
||||
try:
|
||||
y = op.benchmark(ctx, stream)
|
||||
except RuntimeError:
|
||||
except:
|
||||
continue
|
||||
#Update
|
||||
bufX[nvalid % step, :] = params
|
||||
|
@@ -50,7 +50,8 @@ def maximize(OpType, device, model, shapes, V):
|
||||
X[:, OpType.nshape_params:] = V
|
||||
X = OpType.get_valid(device, X)
|
||||
#Model predictions
|
||||
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
|
||||
with tf.device('/cpu:0'):
|
||||
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
|
||||
pred_perfs = np.sort(predictions, axis=0)[::-1]
|
||||
pred_idxs = np.argsort(predictions, axis=0)[::-1]
|
||||
#Evaluate best predicted models
|
||||
|
Reference in New Issue
Block a user