[CODEGEN] Major performance improvements on A100 (#70)

Improved the handling of asynchronous copies, scheduling, and synchronization on A100. The code generator now achieves CUTLASS-like performance on large, square, dense matrix multiplication tasks.
2021-02-21 15:19:39 -08:00
parent 045ab5d62a
commit 5b83259592
31 changed files with 1331 additions and 1115 deletions
--- a/lib/codegen/analysis/align.cc
+++ b/lib/codegen/analysis/align.cc
@@ -312,7 +312,6 @@ std::vector<unsigned> align::populate_max_contiguous_gep(ir::getelementptr_inst*
    if(rhs_cst_info[d].num_cst)
      rvalue = lhs_max_contiguous[d];
    result[d] = std::max(lvalue, rvalue);
-//    std::cout << "max contiguous: " << x->get_name() << " " << d << " " << result[d] << std::endl;
  }
  return add_to_cache(x, result, max_contiguous_);
 }
@@ -527,8 +526,7 @@ void align::run(ir::module &mod) {
  ir::for_each_value(mod, [this](ir::value* v) { populate(v); } );
 //  ir::for_each_value(mod, [this](ir::value* v) {
 //      if(dynamic_cast<ir::cast_inst*>(v) || dynamic_cast<ir::getelementptr_inst*>(v))
-//        std::cout << "ALIGN: " << v->get_name() << " " << starting_multiple_.at(v)[0] << " " << max_contiguous_.at(v)[0]
-//                                   << " " << starting_multiple_.at(v)[1] << " " << max_contiguous_.at(v)[1] << std::endl;
+//        std::cout << "ALIGN: " << v->get_name() << " " << max_contiguous_.at(v)[0] << " " << max_contiguous_.at(v)[1] << std::endl;
 //  });
 }