Database/GEMM: updated profile
This commit is contained in:
@@ -38,6 +38,7 @@ class Buffer: public HandleInterface<Buffer, CUdeviceptr>
|
|||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
Buffer(Context const & context, size_t size);
|
Buffer(Context const & context, size_t size);
|
||||||
|
void set_zero(Stream const & queue, size_t size);
|
||||||
void set_zero(Stream const & queue);
|
void set_zero(Stream const & queue);
|
||||||
Handle<CUdeviceptr> const & cu() const;
|
Handle<CUdeviceptr> const & cu() const;
|
||||||
|
|
||||||
|
@@ -40,12 +40,15 @@ Buffer::Buffer(Context const & context, size_t size) : context_(context), size_(
|
|||||||
dispatch::cuMemAlloc(&*cu_, size);
|
dispatch::cuMemAlloc(&*cu_, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
void Buffer::set_zero(Stream const & queue)
|
void Buffer::set_zero(Stream const & queue, size_t size)
|
||||||
{
|
{
|
||||||
ContextSwitcher ctx_switch(context_);
|
ContextSwitcher ctx_switch(context_);
|
||||||
dispatch::cuMemsetD8Async(*cu_, 0, size_, queue);
|
dispatch::cuMemsetD8Async(*cu_, 0, size, queue);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Buffer::set_zero(Stream const & queue)
|
||||||
|
{ set_zero(queue, size_); }
|
||||||
|
|
||||||
Handle<CUdeviceptr> const & Buffer::cu() const
|
Handle<CUdeviceptr> const & Buffer::cu() const
|
||||||
{ return cu_; }
|
{ return cu_; }
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
@@ -100,9 +100,9 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
|||||||
DType dtype = (DType)(x[0]);
|
DType dtype = (DType)(x[0]);
|
||||||
IsaacOperation_t AT = (IsaacOperation_t)x[1];
|
IsaacOperation_t AT = (IsaacOperation_t)x[1];
|
||||||
IsaacOperation_t BT = (IsaacOperation_t)x[2];
|
IsaacOperation_t BT = (IsaacOperation_t)x[2];
|
||||||
param_t vec = x[6], bm = x[7], kl = x[8], bn = x[9],
|
param_t M = x[3], N = x[4], vec = x[6], bm = x[7], kl = x[8], bn = x[9],
|
||||||
ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16],
|
ms = x[10], ks = x[11], ns = x[12], a_bf0 = x[13], a_bf1 = x[14], b_bf0 = x[15], b_bf1 = x[16],
|
||||||
rs = x[17], br = x[18];
|
rs = x[17], br = x[18], gridr = x[19];
|
||||||
//Features
|
//Features
|
||||||
param_t dtsize = size_of(dtype);
|
param_t dtsize = size_of(dtype);
|
||||||
param_t dtvec = (dtype==HALF_TYPE)?2:1;
|
param_t dtvec = (dtype==HALF_TYPE)?2:1;
|
||||||
@@ -111,6 +111,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
|||||||
param_t rl = rs*br;
|
param_t rl = rs*br;
|
||||||
param_t ml = bm*ms;
|
param_t ml = bm*ms;
|
||||||
param_t nl = bn*ns;
|
param_t nl = bn*ns;
|
||||||
|
param_t gridM = ceil(M, ml), gridN = ceil(N, nl);
|
||||||
param_t nthreads = bm*bn*br;
|
param_t nthreads = bm*bn*br;
|
||||||
param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec)));
|
param_t cd_shareda = dtsize*(ml+(A_outer_contig?0:(vec*dtvec)));
|
||||||
param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec)));
|
param_t cd_sharedb = dtsize*(nl+(B_outer_contig?0:(vec*dtvec)));
|
||||||
@@ -140,7 +141,7 @@ void GEMM::check_valid(driver::Device const & device, size_t nkernels, uint32_t*
|
|||||||
&& ns % (dtvec*vec) == 0
|
&& ns % (dtvec*vec) == 0
|
||||||
&& kl % ks == 0
|
&& kl % ks == 0
|
||||||
&& size_shmem <= device.max_shared_memory()
|
&& size_shmem <= device.max_shared_memory()
|
||||||
|
&& (gridr == 1 || gridM*gridN < 64*64)
|
||||||
&& n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop
|
&& n_instructions <= 1024 //Doesn't allow more than 1024 instructions in the inner loop
|
||||||
&& bm <= device.max_block_dim()[0]
|
&& bm <= device.max_block_dim()[0]
|
||||||
&& bn <= device.max_block_dim()[1]
|
&& bn <= device.max_block_dim()[1]
|
||||||
@@ -208,7 +209,7 @@ std::string GEMM::dump(drv::Device const & device, std::string const & name){
|
|||||||
if(vec_==1)
|
if(vec_==1)
|
||||||
vs[0] = "";
|
vs[0] = "";
|
||||||
//Load-Store alignments
|
//Load-Store alignments
|
||||||
io_conf Cio(ldc_, vec_, dtvec, dtsize, kg_>1?false:true);
|
io_conf Cio(ldc_, vec_, dtvec, dtsize, false);
|
||||||
io_conf Aio(lda_, vec_, dtvec, dtsize, false);
|
io_conf Aio(lda_, vec_, dtvec, dtsize, false);
|
||||||
io_conf Bio(ldb_, vec_, dtvec, dtsize, false);
|
io_conf Bio(ldb_, vec_, dtvec, dtsize, false);
|
||||||
|
|
||||||
@@ -889,10 +890,10 @@ void GEMM::enqueue(driver::Kernel &gemm, driver::Stream &queue, const scalar& al
|
|||||||
gemm.setArg(14, bound);
|
gemm.setArg(14, bound);
|
||||||
gemm.setArg(15, locks);
|
gemm.setArg(15, locks);
|
||||||
|
|
||||||
// std::cout << gridM << " " << gridN << " " << kg_ << std::endl;
|
// std::cout << gridM << " " << gridN << " " << std::endl;
|
||||||
//Launch
|
//Launch
|
||||||
if(kg_ > 1)
|
if(kg_ > 1)
|
||||||
locks.set_zero(queue);
|
locks.set_zero(queue, gridM*gridN*4);
|
||||||
queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_});
|
queue.enqueue(gemm, {gridM, gridN, kg_}, {bm_, bn_, bk_});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -29,8 +29,8 @@ def recursive_glob(rootdir='.', suffix=''):
|
|||||||
def main():
|
def main():
|
||||||
|
|
||||||
#Source
|
#Source
|
||||||
include = [os.path.join('src', 'include')]
|
include = [os.path.join(os.pardir, 'include')]
|
||||||
src = recursive_glob(os.path.join('src','lib'), 'cpp')
|
src = recursive_glob(os.path.join(os.pardir,'lib'), 'cpp')
|
||||||
|
|
||||||
#Bindings
|
#Bindings
|
||||||
include += [os.path.join('src', 'bind')]
|
include += [os.path.join('src', 'bind')]
|
||||||
|
@@ -24,6 +24,7 @@
|
|||||||
#include <pybind11/stl.h>
|
#include <pybind11/stl.h>
|
||||||
#include "isaac/driver/backend.h"
|
#include "isaac/driver/backend.h"
|
||||||
#include "isaac/driver/buffer.h"
|
#include "isaac/driver/buffer.h"
|
||||||
|
#include "isaac/driver/error.h"
|
||||||
#include "isaac/driver/cublas.h"
|
#include "isaac/driver/cublas.h"
|
||||||
#include "isaac/driver/stream.h"
|
#include "isaac/driver/stream.h"
|
||||||
#include "isaac/driver/context.h"
|
#include "isaac/driver/context.h"
|
||||||
@@ -87,5 +88,9 @@ void export_driver(py::module&& m)
|
|||||||
py::class_<drv::Stream>(m, "Stream")
|
py::class_<drv::Stream>(m, "Stream")
|
||||||
.def(py::init<drv::Context>())
|
.def(py::init<drv::Context>())
|
||||||
.def("synchronize", &drv::Stream::synchronize);
|
.def("synchronize", &drv::Stream::synchronize);
|
||||||
|
|
||||||
|
|
||||||
|
py::register_exception<drv::exception::cuda::misaligned_address>(m, "MisalignedAddress");
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@@ -34,7 +34,7 @@ def benchmarks(prefix, OpType, device, nsamples):
|
|||||||
op = OpType(params)
|
op = OpType(params)
|
||||||
try:
|
try:
|
||||||
y = op.benchmark(ctx, stream)
|
y = op.benchmark(ctx, stream)
|
||||||
except RuntimeError:
|
except:
|
||||||
continue
|
continue
|
||||||
#Update
|
#Update
|
||||||
bufX[nvalid % step, :] = params
|
bufX[nvalid % step, :] = params
|
||||||
|
@@ -50,7 +50,8 @@ def maximize(OpType, device, model, shapes, V):
|
|||||||
X[:, OpType.nshape_params:] = V
|
X[:, OpType.nshape_params:] = V
|
||||||
X = OpType.get_valid(device, X)
|
X = OpType.get_valid(device, X)
|
||||||
#Model predictions
|
#Model predictions
|
||||||
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
|
with tf.device('/cpu:0'):
|
||||||
|
predictions = model.predict(np.log2(X), batch_size=8192, verbose=0)
|
||||||
pred_perfs = np.sort(predictions, axis=0)[::-1]
|
pred_perfs = np.sort(predictions, axis=0)[::-1]
|
||||||
pred_idxs = np.argsort(predictions, axis=0)[::-1]
|
pred_idxs = np.argsort(predictions, axis=0)[::-1]
|
||||||
#Evaluate best predicted models
|
#Evaluate best predicted models
|
||||||
|
Reference in New Issue
Block a user