[CODEGEN] Performance improvement on A100 (#125)

Improved codegen for the Ampere GPUs.

    * Make the layout pass recognize the multistage pipelined pattern.
    * Now the pipeline pass can automate the multistage pipelining transformation.
    * Remove extra barriers (from the prefetch pass & WAR) on Ampere.
    * Update the code generator (generator.cc) to make Triton generate n-buffered shared memory loads/stores.
This commit is contained in:
daadaada
2021-06-21 14:25:13 +08:00
committed by Philippe Tillet
parent 5a51f3e529
commit d8d6b715c8
21 changed files with 855 additions and 174 deletions

View File

@@ -77,6 +77,54 @@ void print(module &mod, std::ostream& os) {
}
}
// Printer overload for a whole function. Currently a stub: the body is empty,
// so calling it writes nothing to `os` and does not touch `fn`.
void print(function &fn, std::ostream &os) {
// TODO: not implemented yet.
}
// Writes a textual listing of basic block `bb` to `os`.
//
// Layout of the output:
//   <block name>:[ ; preds = <pred0>, <pred1>, ...]<newline>
//   <one line per instruction, emitted by print(instruction&, std::ostream&)>
//
// The "; preds = ..." part is only written when the block has at least one
// predecessor.
void print(basic_block &bb, std::ostream &os) {
  auto const &predecessors = bb.get_predecessors();
  os << bb.get_name() << ":";
  if(!predecessors.empty()){
    os << " ; preds = ";
    // Place separators by position rather than by comparing each entry with
    // predecessors.back(): a pointer comparison drops the comma after any
    // earlier entry that happens to be the same block as the last one.
    bool is_first = true;
    for(ir::basic_block *pred: predecessors){
      if(!is_first)
        os << ", ";
      os << pred->get_name();
      is_first = false;
    }
  }
  // std::endl is kept on purpose so the flush behaviour callers may rely on
  // (e.g. when dumping IR right before a crash) stays the same.
  os << std::endl;
  for(ir::instruction *inst: bb.get_inst_list())
    print(*inst, os);
}
// Writes a one-line textual form of instruction `instr` to `os`.
//
// Layout of the output:
//   " " [<name> " = "] <instr repr> " " <type repr> [" " <op0>, <op1>, ...] ";" <newline>
//
// The "<name> = " prefix is skipped when the instruction's type is void.
// Operands that are ir::constant are written through their repr(); every
// other operand is written through get_name().
void print(instruction &instr, std::ostream &os) {
  ir::type *type = instr.get_type();
  os << " ";
  if(!type->is_void_ty())
    os << instr.get_name() << " = ";
  os << instr.repr() << " " << type->repr();

  // Bind by const reference: this avoids copying the operand list when ops()
  // returns a reference, and remains valid (temporary lifetime extension) if
  // it returns by value.
  const ir::instruction::ops_t &ops = instr.ops();
  const size_t num_ops = instr.get_num_operands();
  if(num_ops > 0)
    os << " ";
  for(size_t i = 0; i < num_ops; i++){
    if(i > 0)
      os << ", ";
    if(auto *cst = dynamic_cast<ir::constant*>(ops[i]))
      os << cst->repr();
    else
      os << ops[i]->get_name();
  }
  os << ";";
  // Disabled: listing of the instruction's users.
  // os << " (";
  // for(ir::user* usr: inst->get_users())
  // os << get_name(usr, cnt++) << ", " ;
  // os << " )";
  os << std::endl;
}
}
}