[CODEGEN] Major performance improvements on A100 (#70)
Improved handling of asynchronous copy, scheduling, and synchronization for A100. This now achieves CUTLASS-like performance on large square dense matrix multiplication tasks.
This commit is contained in:
committed by
Philippe Tillet
parent
045ab5d62a
commit
5b83259592
@@ -312,7 +312,6 @@ std::vector<unsigned> align::populate_max_contiguous_gep(ir::getelementptr_inst*
|
||||
if(rhs_cst_info[d].num_cst)
|
||||
rvalue = lhs_max_contiguous[d];
|
||||
result[d] = std::max(lvalue, rvalue);
|
||||
// std::cout << "max contiguous: " << x->get_name() << " " << d << " " << result[d] << std::endl;
|
||||
}
|
||||
return add_to_cache(x, result, max_contiguous_);
|
||||
}
|
||||
@@ -527,8 +526,7 @@ void align::run(ir::module &mod) {
|
||||
ir::for_each_value(mod, [this](ir::value* v) { populate(v); } );
|
||||
// ir::for_each_value(mod, [this](ir::value* v) {
|
||||
// if(dynamic_cast<ir::cast_inst*>(v) || dynamic_cast<ir::getelementptr_inst*>(v))
|
||||
// std::cout << "ALIGN: " << v->get_name() << " " << starting_multiple_.at(v)[0] << " " << max_contiguous_.at(v)[0]
|
||||
// << " " << starting_multiple_.at(v)[1] << " " << max_contiguous_.at(v)[1] << std::endl;
|
||||
// std::cout << "ALIGN: " << v->get_name() << " " << max_contiguous_.at(v)[0] << " " << max_contiguous_.at(v)[1] << std::endl;
|
||||
// });
|
||||
}
|
||||
|
||||
|
@@ -118,15 +118,6 @@ data_layout::data_layout(id_t id,
|
||||
// std::cout << max_contiguous[0] << " " << max_contiguous[1] << std::endl;
|
||||
// std::cout << order_[0] << " " << order_[1] << std::endl;
|
||||
}
|
||||
if(is_recoalesce){
|
||||
if(ptr.size() > 0){
|
||||
// std::cout << "recoalesce: " << order_[0] << " " << order_[1] << " " << ptr.size() << std::endl;
|
||||
// std::cout << max_contiguous[0] << " " << max_contiguous[1] << std::endl;
|
||||
// if(order_[0] == 0)
|
||||
// exit(1);
|
||||
}
|
||||
}
|
||||
// std::cout << "---" << std::endl;
|
||||
}
|
||||
|
||||
int data_layout::find_axis(int to_find) const {
|
||||
@@ -213,14 +204,16 @@ scanline_layout::scanline_layout(size_t num_warps,
|
||||
ir::value *ptr = nullptr;
|
||||
for(ir::value *v: values)
|
||||
for(ir::user *usr: v->get_users())
|
||||
if(auto *st = dynamic_cast<ir::io_inst*>(usr))
|
||||
ptr = st->get_pointer_operand();
|
||||
if(auto *io = dynamic_cast<ir::io_inst*>(usr)){
|
||||
if(!ptr || ptr->get_type()->get_tile_rank() < io->get_pointer_operand()->get_type()->get_tile_rank())
|
||||
ptr = io->get_pointer_operand();
|
||||
}
|
||||
|
||||
unsigned i = order_[0];
|
||||
int contiguous = 1;
|
||||
if(ptr){
|
||||
int nbits = ptr->get_type()->get_pointer_element_ty()->get_scalar_ty()->get_primitive_size_in_bits();
|
||||
contiguous = std::min<int>(align->contiguous(ptr)[i], 128 / nbits);
|
||||
contiguous = std::min<int>(align->get(ptr, i), 128 / nbits);
|
||||
}
|
||||
|
||||
nts_[i] = clamp(size / num_threads, 1, std::min<int>(contiguous, shape_[i]));
|
||||
|
Reference in New Issue
Block a user