[tests][bench] now benchmarking all variants of copy
This commit is contained in:
@@ -24,6 +24,8 @@ class layout {
|
||||
typedef std::map <node_t, std::set<node_t>> graph_t;
|
||||
|
||||
private:
|
||||
// create edge
|
||||
void connect(ir::value *x, ir::value *y);
|
||||
// connected components
|
||||
void connected_components(node_t x, std::set<node_t> &nodes, graph_t &graph, unsigned id);
|
||||
// list the axes of the given value
|
||||
|
@@ -158,7 +158,6 @@ void axes::run(ir::module &mod) {
|
||||
unsigned group_id = 0;
|
||||
while(!nodes_.empty())
|
||||
connected_components(*nodes_.begin(), nodes_, dependencies_, group_id++);
|
||||
std::cout << "Number of axes: " << group_id << std::endl;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -53,6 +53,27 @@ const std::vector<ir::value*>& layout::values(unsigned id) const
|
||||
// Number of groups, reported as the current size of values_.
size_t layout::get_num_groups() const
|
||||
{ return values_.size(); }
|
||||
|
||||
// create edge
// Registers x and y as nodes and links them in both directions in
// dependencies_ when both are tile-typed and axes_of() reports at least one
// axis in common. Does nothing otherwise.
void layout::connect(ir::value *x, ir::value *y) {
|
||||
// never link a value to itself
if(x == y)
|
||||
return;
|
||||
// values that are not tiles do not take part in the graph
if(!x->get_type()->is_tile_ty())
|
||||
return;
|
||||
if(!y->get_type()->is_tile_ty())
|
||||
return;
|
||||
// gather the axes of each value; std::set keeps them sorted, which
// std::set_intersection requires of its input ranges
std::set<int> x_axes = axes_of(x);
|
||||
std::set<int> y_axes = axes_of(y);
|
||||
// common receives the axes present in both x_axes and y_axes
std::set<int> common;
|
||||
std::set_intersection(x_axes.begin(), x_axes.end(),
|
||||
y_axes.begin(), y_axes.end(),
|
||||
std::inserter(common, common.begin()));
|
||||
// at least one shared axis: record both nodes and a symmetric edge
if(!common.empty()){
|
||||
nodes_.insert(x);
|
||||
nodes_.insert(y);
|
||||
dependencies_[x].insert(y);
|
||||
dependencies_[y].insert(x);
|
||||
}
|
||||
}
|
||||
|
||||
// run
|
||||
void layout::run(ir::module &mod) {
|
||||
nodes_.clear();
|
||||
@@ -63,26 +84,12 @@ void layout::run(ir::module &mod) {
|
||||
for(ir::function *fn: mod.get_function_list())
|
||||
for(ir::basic_block *block: fn->blocks())
|
||||
for(ir::instruction *i : block->get_inst_list()) {
|
||||
// skip scalars
|
||||
if(!i->get_type()->is_tile_ty())
|
||||
continue;
|
||||
// add an edge between i and the operands that share an axis
|
||||
std::set<int> i_axes = axes_of(i);
|
||||
nodes_.insert(i);
|
||||
for(ir::value* op: i->ops()){
|
||||
if(!op->get_type()->is_tile_ty())
|
||||
continue;
|
||||
nodes_.insert(op);
|
||||
std::set<int> op_axes = axes_of(op);
|
||||
std::set<int> common;
|
||||
std::set_intersection(i_axes.begin(), i_axes.end(),
|
||||
op_axes.begin(), op_axes.end(),
|
||||
std::inserter(common, common.begin()));
|
||||
if(!common.empty() || !op->get_type()->is_tile_ty()){
|
||||
dependencies_[i].insert(op);
|
||||
dependencies_[op].insert(i);
|
||||
for(ir::value* opx: i->ops())
|
||||
for(ir::value* opy: i->ops()){
|
||||
connect(i, opx);
|
||||
connect(opx, opy);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
// Grids
|
||||
unsigned group_id = 0;
|
||||
|
@@ -190,8 +190,6 @@ void tiles::run(ir::module &) {
|
||||
);
|
||||
}
|
||||
order_[i] = order;
|
||||
std::cout << "order: " << order[0] << " " << order[1] << std::endl;
|
||||
|
||||
}
|
||||
// tiling parameters
|
||||
for(auto x: largest_){
|
||||
|
@@ -1035,11 +1035,17 @@ void selection::lower_broadcast(ir::broadcast_inst *x, LLVMContext &ctx, Functio
|
||||
}
|
||||
|
||||
void selection::lower_copy_to_shared(ir::copy_to_shared_inst *x, LLVMContext &ctx, Function *fn, IRBuilder<> &builder) {
|
||||
shared_tile* result = (shared_tile*)tmap_.at(x);
|
||||
unsigned vector_size = 1;
|
||||
auto x_order = tiles_->order(x);
|
||||
ir::value *arg = x->get_operand(0);
|
||||
auto arg_order = tiles_->order(arg);
|
||||
// tiles
|
||||
shared_tile* result = (shared_tile*)tmap_.at(x);
|
||||
distributed_tile* in = (distributed_tile*)tmap_.at(arg);
|
||||
size_t ld = tiles_->order(arg)[0];
|
||||
unsigned vector_size = in->axis(ld).contiguous;
|
||||
if(x_order == arg_order){
|
||||
size_t ld = arg_order[0];
|
||||
vector_size = std::min(tiles_->nts(x, ld),tiles_->nts(arg, ld));
|
||||
}
|
||||
|
||||
std::map<unsigned, Value*> packets;
|
||||
in->for_each([&](indices_t idx){
|
||||
|
@@ -51,6 +51,11 @@ ir::value* coalesce::rematerialize(ir::value *x, ir::builder &builder,
|
||||
auto& inst_list = i->get_parent()->get_inst_list();
|
||||
auto pos = ++std::find(inst_list.begin(), inst_list.end(), i);
|
||||
builder.set_insert_point(pos);
|
||||
if(dynamic_cast<ir::load_inst*>(x)){
|
||||
ir::value *ret = builder.insert(ir::copy_to_shared_inst::create(x));
|
||||
// x->replace_all_uses_with(ret);
|
||||
return ret;
|
||||
}
|
||||
// default -- recursive clone
|
||||
ir::instruction *cloned = builder.insert(i->clone());
|
||||
seen[i] = cloned;
|
||||
@@ -97,6 +102,9 @@ void coalesce::run(ir::module &mod) {
|
||||
r->replace_all_uses_with(cts);
|
||||
cts->replace_uses_of_with(cts, r);
|
||||
}
|
||||
else{
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -92,10 +92,10 @@ void module::compile_llvm_module(std::unique_ptr<llvm::Module> module, const std
|
||||
file_type_t ft) {
|
||||
init_llvm();
|
||||
// debug
|
||||
llvm::legacy::PassManager pm;
|
||||
pm.add(llvm::createPrintModulePass(llvm::outs()));
|
||||
// llvm::legacy::PassManager pm;
|
||||
// pm.add(llvm::createPrintModulePass(llvm::outs()));
|
||||
// pm.add(llvm::createVerifierPass());
|
||||
pm.run(*module);
|
||||
// pm.run(*module);
|
||||
// create machine
|
||||
module->setTargetTriple(triple);
|
||||
std::string error;
|
||||
@@ -241,7 +241,6 @@ std::string cu_module::compile_llvm_module(std::unique_ptr<llvm::Module> module,
|
||||
// Builds a module from an LLVM module: the result of compile_llvm_module()
// for the context's device is forwarded to the constructor that takes a
// source string.
cu_module::cu_module(driver::context * context, std::unique_ptr<llvm::Module> ll_module): cu_module(context, compile_llvm_module(std::move(ll_module), context->device())) { }
|
||||
|
||||
cu_module::cu_module(driver::context * context, std::string const & source) : module(context, CUmodule(), true), source_(source){
|
||||
std::cout << source_ << std::endl;
|
||||
cu_context::context_switcher ctx_switch(*context);
|
||||
// JIT compile source-code
|
||||
CUjit_option opt[] = {CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, CU_JIT_ERROR_LOG_BUFFER};
|
||||
|
@@ -220,7 +220,7 @@ std::unique_ptr<driver::module> function::make_bin(ir::module &module, driver::c
|
||||
axes.run(module);
|
||||
layouts.run(module);
|
||||
coalesce.run(module);
|
||||
// ir::print(module, std::cout);
|
||||
dce.run(module);
|
||||
align.run(module);
|
||||
dce.run(module);
|
||||
tiles.run(module);
|
||||
|
@@ -11,19 +11,21 @@
|
||||
#include "cuda/cublas.h"
|
||||
|
||||
|
||||
std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order){
|
||||
std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t order_x, order_t order_y){
|
||||
typedef float NumericT;
|
||||
std::string ty = "float";
|
||||
size_t dt_nbytes = sizeof(NumericT);
|
||||
drv::context* context = stream->context();
|
||||
int32_t ld = order == ROWMAJOR ? N : M;
|
||||
// create inputs
|
||||
auto dx = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes));
|
||||
auto dy = std::unique_ptr<drv::buffer>(drv::buffer::create(context, M*N*dt_nbytes));
|
||||
// create options
|
||||
rt::function::options_space_t opt;
|
||||
opt.defines.push_back({"TYPE", {ty}});
|
||||
opt.defines.push_back({"ORDER", {order==ROWMAJOR?"ROWMAJOR":"COLMAJOR"}});
|
||||
opt.defines.push_back({"STRIDE_XM", {(order_x == ROWMAJOR)?"M":"1"}});
|
||||
opt.defines.push_back({"STRIDE_XN", {(order_x == ROWMAJOR)?"1":"N"}});
|
||||
opt.defines.push_back({"STRIDE_YM", {(order_y == ROWMAJOR)?"M":"1"}});
|
||||
opt.defines.push_back({"STRIDE_YN", {(order_y == ROWMAJOR)?"1":"N"}});
|
||||
opt.defines.push_back({"TM", {"32"}});
|
||||
opt.defines.push_back({"TN", {"32"}});
|
||||
opt.num_warps = {4};
|
||||
@@ -33,7 +35,7 @@ std::vector<double> do_bench(drv::stream* stream, int32_t M, int32_t N, order_t
|
||||
std::vector<double> result;
|
||||
auto gbps = [&](double ns) { return 2*M*N*dt_nbytes / (ns * 1e-9) * 1e-9; };
|
||||
// triton
|
||||
double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N, ld, ld}, grid2d(M, N), stream);}, stream);
|
||||
double triton_ns = triton::tools::bench([&]() { function({&*dx, &*dy, M, N}, grid2d(M, N), stream);}, stream);
|
||||
result.push_back(gbps(triton_ns));
|
||||
// done
|
||||
return result;
|
||||
@@ -44,21 +46,20 @@ int main() {
|
||||
auto context = triton::driver::backend::contexts::get_default();
|
||||
triton::driver::stream* stream = triton::driver::stream::create(context);
|
||||
// shapes to benchmark
|
||||
typedef std::tuple<int, int, order_t> config_t;
|
||||
std::vector<config_t> configs;
|
||||
for(auto x: std::vector<order_t>{COLMAJOR}){
|
||||
std::vector<config_t> tmp = {
|
||||
config_t{4096, 4096, x}
|
||||
};
|
||||
configs.insert(configs.end(), tmp.begin(), tmp.end());
|
||||
}
|
||||
typedef std::tuple<int, int, order_t, order_t> config_t;
|
||||
std::vector<config_t> configs = {
|
||||
{4096, 4096, ROWMAJOR, ROWMAJOR},
|
||||
{4096, 4096, COLMAJOR, ROWMAJOR},
|
||||
{4096, 4096, ROWMAJOR, COLMAJOR},
|
||||
{4096, 4096, COLMAJOR, COLMAJOR},
|
||||
};
|
||||
// does the work
|
||||
int32_t M, N;
|
||||
order_t ord;
|
||||
order_t ord_x, ord_y;
|
||||
for(const auto& c: configs){
|
||||
std::tie(M, N, ord) = c;
|
||||
std::cout << "// " << M << ", " << N << ", " << ord << std::flush;
|
||||
for(auto perf: do_bench(stream, M, N, ord))
|
||||
std::tie(M, N, ord_x, ord_y) = c;
|
||||
std::cout << "// " << M << ", " << N << ", " << ord_x << ", " << ord_y << std::flush;
|
||||
for(auto perf: do_bench(stream, M, N, ord_x, ord_y))
|
||||
std::cout << ", " << perf << std::flush;
|
||||
std::cout << std::endl;
|
||||
}
|
||||
|
@@ -16,29 +16,16 @@ void copy1d(TYPE * X __noalias __readonly __aligned(16),
|
||||
|
||||
const char *copy2d =
|
||||
R"(
|
||||
#if ORDER == ROWMAJOR
|
||||
#define STRIDE_XM ldx
|
||||
#define STRIDE_XN 1
|
||||
#define STRIDE_YM ldy
|
||||
#define STRIDE_YN 1
|
||||
#else
|
||||
#define STRIDE_XM 1
|
||||
#define STRIDE_XN ldx
|
||||
#define STRIDE_YM 1
|
||||
#define STRIDE_YN ldy
|
||||
#endif
|
||||
|
||||
void copy2d(TYPE * X __noalias __readonly __aligned(16),
|
||||
TYPE * Y __noalias __writeonly __aligned(16),
|
||||
int M, int N,
|
||||
int ldx __multipleof(8),
|
||||
int ldy __multipleof(8)) {
|
||||
int M __multipleof(8),
|
||||
int N __multipleof(8)) {
|
||||
int ridm = get_program_id(0);
|
||||
int ridn = get_program_id(1);
|
||||
int rm[TM] = ridm * TM + 0 ... TM;
|
||||
int rn[TN] = ridn * TN + 0 ... TN;
|
||||
TYPE* px[TM, TN] = X + rm[:, newaxis] * ldx + rn[newaxis, :] ;
|
||||
TYPE* py[TM, TN] = Y + rm[:, newaxis] + rn[newaxis, :] * ldy;
|
||||
TYPE* px[TM, TN] = X + rm[:, newaxis] * STRIDE_XM + rn[newaxis, :] * STRIDE_XN;
|
||||
TYPE* py[TM, TN] = Y + rm[:, newaxis] * STRIDE_YM + rn[newaxis, :] * STRIDE_YN;
|
||||
*py = *px;
|
||||
}
|
||||
)";
|
||||
|
Reference in New Issue
Block a user