[tests][bench] now benchmarking all variants of copy

This commit is contained in:
Philippe Tillet
2019-09-17 17:40:03 -04:00
parent 307c1128d5
commit 1fd9be27ee
10 changed files with 70 additions and 63 deletions

View File

@@ -24,6 +24,8 @@ class layout {
typedef std::map <node_t, std::set<node_t>> graph_t;
private:
// create edge
void connect(ir::value *x, ir::value *y);
// connected components
void connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned id);
// list the axes of the given value

View File

@@ -158,7 +158,6 @@ void axes::run(ir::module &mod) {
unsigned group_id = 0;
while(!nodes_.empty())
connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++);
std::cout << "Number of axes: " << group_id << std::endl;
}
}

View File

@@ -53,6 +53,27 @@ const std::vector<ir::value*>& layout::values(unsigned id) const
// Number of layout groups recorded so far (one entry in values_ per group).
size_t layout::get_num_groups() const {
  return values_.size();
}
// Add an undirected dependency edge between x and y when the two tile
// values share at least one axis, so the later connected-components pass
// places them in the same layout group.
void layout::connect(ir::value *x, ir::value *y) {
// a self-edge carries no grouping information
if(x == y)
return;
// only tile-typed values participate in layout grouping
if(!x->get_type()->is_tile_ty())
return;
if(!y->get_type()->is_tile_ty())
return;
// axes_of lists the axes of the given value (see class header)
std::set<int> x_axes = axes_of(x);
std::set<int> y_axes = axes_of(y);
// common = axes present in both x and y
std::set<int> common;
std::set_intersection(x_axes.begin(), x_axes.end(),
y_axes.begin(), y_axes.end(),
std::inserter(common, common.begin()));
// shared axis found: record both endpoints and a symmetric edge in the
// dependency graph consumed by connected_components
if(!common.empty()){
nodes_.insert(x);
nodes_.insert(y);
dependencies_[x].insert(y);
dependencies_[y].insert(x);
}
}
// run
void layout::run(ir::module &mod) {
nodes_.clear();
@@ -63,26 +84,12 @@ void layout::run(ir::module &mod) {
for(ir::function *fn: mod.get_function_list())
for(ir::basic_block *block: fn->blocks())
for(ir::instruction *i : block->get_inst_list()) {
// skip scalars
if(!i->get_type()->is_tile_ty())
continue;
// add an edge between i and the operands that share an axis
std::set<int> i_axes = axes_of(i);
nodes_.insert(i);
for(ir::value* op: i->ops()){
if(!op->get_type()->is_tile_ty())
continue;
nodes_.insert(op);
std::set<int> op_axes = axes_of(op);
std::set<int> common;
std::set_intersection(i_axes.begin(), i_axes.end(),
op_axes.begin(), op_axes.end(),
std::inserter(common, common.begin()));
if(!common.empty() || !op->get_type()->is_tile_ty()){
dependencies_[i].insert(op);
dependencies_[op].insert(i);
for(ir::value* opx: i->ops())
for(ir::value* opy: i->ops()){
connect(i, opx);
connect(opx, opy);
}
}
}
// Grids
unsigned group_id = 0;

View File

@@ -190,8 +190,6 @@ void tiles::run(ir::module &) {
);
}
order_[i] = order;
std::cout << "order: " << order[0] << " " << order[1] << std::endl;
}
// tiling parameters
for(auto x: largest_){

View File

@@ -1035,11 +1035,17 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio
}
void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) {
shared_tile* result = (shared_tile*)tmap_.at(x);
unsigned vector_size = 1;
auto x_order = tiles_->order(x);
ir::value *arg = x->get_operand(0);
auto arg_order = tiles_->order(arg);
// tiles
shared_tile* result = (shared_tile*)tmap_.at(x);
distributed_tile* in = (distributed_tile*)tmap_.at(arg);
size_t ld = tiles_->order(arg)[0];
unsigned vector_size = in->axis(ld).contiguous;
if(x_order == arg_order){
size_t ld = arg_order[0];
vector_size = std::min(tiles_->nts(x, ld),tiles_->nts(arg, ld));
}
std::map<unsigned, Value*> packets;
in->for_each([&](indices_t idx){

View File

@@ -51,6 +51,11 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder,
auto& inst_list = i->get_parent()->get_inst_list();
auto pos = ++std::find(inst_list.begin(), inst_list.end(), i);
builder.set_insert_point(pos);
if(dynamic_cast<ir::load_inst*>(x)){
ir::value *ret = builder.insert(ir::copy_to_shared_inst::create(x));
// x->replace_all_uses_with(ret);
return ret;
}
// default -- recursive clone
ir::instruction *cloned = builder.insert(i->clone());
seen[i] = cloned;
@@ -97,6 +102,9 @@ void coalesce::run(ir::module &mod) {
r->replace_all_uses_with(cts);
cts->replace_uses_of_with(cts, r);
}
else{
}
}
}

View File

@@ -92,10 +92,10 @@ void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std
file_type_t ft) {
init_llvm();
// debug
llvm::legacy::PassManager pm;
pm.add(llvm::createPrintModulePass(llvm::outs()));
// llvm::legacy::PassManager pm;
// pm.add(llvm::createPrintModulePass(llvm::outs()));
// pm.add(llvm::createVerifierPass());
pm.run(*module);
// pm.run(*module);
// create machine
module->setTargetTriple(triple);
std::string error;
@@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module,
cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { }
cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){
std::cout << source_ << std::endl;
cu_context::context_switcher ctx_switch(*context);
// JIT compile source-code
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};

View File

@@ -220,7 +220,7 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c
axes.run(module);
layouts.run(module);
coalesce.run(module);
// ir::print(module, std::cout);
dce.run(module);
align.run(module);
dce.run(module);
tiles.run(module);

View File

@@ -11,19 +11,21 @@
#include "cuda/cublas.h"
std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order){
std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order_x, order_t order_y){
typedef float NumericT;
std::string ty = "float";
size_t dt_nbytes = sizeof(NumericT);
drv::context* context = stream->context();
int32_t ld = order == ROWMAJOR ? N : M;
// create inputs
auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes));
auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes));
// create options
rt::function::options_space_t opt;
opt.defines.push_back({"TYPE", {ty}});
opt.defines.push_back({"ORDER", {order==ROWMAJOR?"ROWMAJOR":"COLMAJOR"}});
opt.defines.push_back({"STRIDE_XM", {(order_x == ROWMAJOR)?"M":"1"}});
opt.defines.push_back({"STRIDE_XN", {(order_x == ROWMAJOR)?"1":"N"}});
opt.defines.push_back({"STRIDE_YM", {(order_y == ROWMAJOR)?"M":"1"}});
opt.defines.push_back({"STRIDE_YN", {(order_y == ROWMAJOR)?"1":"N"}});
opt.defines.push_back({"TM", {"32"}});
opt.defines.push_back({"TN", {"32"}});
opt.num_warps = {4};
@@ -33,7 +35,7 @@ std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t
std::vector<double> result;
auto gbps = [&](double ns) { return 2*M*N*dt_nbytes / (ns * 1e-9) * 1e-9; };
// triton
double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N, ld, ld}, grid2d(M, N), stream);}, stream);
double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N}, grid2d(M, N), stream);}, stream);
result.push_back(gbps(triton_ns));
// done
return result;
@@ -44,21 +46,20 @@ int main() {
auto context = triton::driver::backend::contexts::get_default();
triton::driver::stream* stream = triton::driver::stream::create(context);
// shapes to benchmark
typedef std::tuple<int, int, order_t> config_t;
std::vector<config_t> configs;
for(auto x: std::vector<order_t>{COLMAJOR}){
std::vector<config_t> tmp = {
config_t{4096, 4096, x}
};
configs.insert(configs.end(), tmp.begin(), tmp.end());
}
typedef std::tuple<int, int, order_t, order_t> config_t;
std::vector<config_t> configs = {
{4096, 4096, ROWMAJOR, ROWMAJOR},
{4096, 4096, COLMAJOR, ROWMAJOR},
{4096, 4096, ROWMAJOR, COLMAJOR},
{4096, 4096, COLMAJOR, COLMAJOR},
};
// does the work
int32_t M, N;
order_t ord;
order_t ord_x, ord_y;
for(const auto& c: configs){
std::tie(M, N, ord) = c;
std::cout << "// " << M << ", " << N << ", " << ord << std::flush;
for(auto perf: do_bench(stream, M, N, ord))
std::tie(M, N, ord_x, ord_y) = c;
std::cout << "// " << M << ", " << N << ", " << ord_x << ", " << ord_y << std::flush;
for(auto perf: do_bench(stream, M, N, ord_x, ord_y))
std::cout << ", " << perf << std::flush;
std::cout << std::endl;
}

View File

@@ -16,29 +16,16 @@ void copy1d(TYPE * X __noalias __readonly __aligned(16),
const char *copy2d =
R"(
#if ORDER == ROWMAJOR
#define STRIDE_XM ldx
#define STRIDE_XN 1
#define STRIDE_YM ldy
#define STRIDE_YN 1
#else
#define STRIDE_XM 1
#define STRIDE_XN ldx
#define STRIDE_YM 1
#define STRIDE_YN ldy
#endif
void copy2d(TYPE * X __noalias __readonly __aligned(16),
TYPE * Y __noalias __writeonly __aligned(16),
int M, int N,
int ldx __multipleof(8),
int ldy __multipleof(8)) {
int M __multipleof(8),
int N __multipleof(8)) {
int ridm = get_program_id(0);
int ridn = get_program_id(1);
int rm[TM] = ridm * TM + 0 ... TM;
int rn[TN] = ridn * TN + 0 ... TN;
TYPE* px[TM, TN] = X + rm[:, newaxis] * ldx + rn[newaxis, :] ;
TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy;
TYPE* px[TM, TN] = X + rm[:, newaxis] * STRIDE_XM + rn[newaxis, :] * STRIDE_XN;
TYPE* py[TM, TN] = Y + rm[:, newaxis] * STRIDE_YM + rn[newaxis, :] * STRIDE_YN;
*py = *px;
}
)";