Revert "[BACKEND] Various bug fixes; making reductions faster (#533)".

This is a more stable commit that produces bitwise-identical code to earlier
versions. Using commits after this one may lead to slightly different numerics.
This commit is contained in:
Philippe Tillet
2022-06-03 11:36:06 -07:00
parent efa04cac1f
commit a60374a597
11 changed files with 65 additions and 173 deletions

View File

@@ -319,8 +319,8 @@ std::vector<unsigned> align::populate_max_contiguous_binop(ir::binary_operator*
}
if(x->is_int_add_sub()){
unsigned lvalue = 1, rvalue = 1;
lvalue = gcd(rhs_max_contiguous[d], lhs_cst_info[d].num_cst);
rvalue = gcd(lhs_max_contiguous[d], rhs_cst_info[d].num_cst);
lvalue = gcd(rhs_max_contiguous[d], lhs_starting_multiple[d]);
rvalue = gcd(lhs_max_contiguous[d], rhs_starting_multiple[d]);
value = std::max(lvalue, rvalue);
}
result.push_back(value);

View File

@@ -209,15 +209,14 @@ mma_layout::mma_layout(size_t num_warps,
rep_ = {2*pack_size_0, 2*pack_size_1, 1};
spw_ = {fpw_[0]*4*rep_[0], fpw_[1]*4*rep_[1], 1};
contig_per_thread_ = {1, 1};
order_ = {0, 1};
}
else{
// fpw_ = {1, 1, 1};
spw_ = mma_instr_shape_.at(tensor_core_type_); // e.g., {16, 8, 16} for f32.f16.f16.f32
contig_per_thread_ = {1, 2};
order_ = {1, 0};
// rep_ = {2, 2, 1};
}
order_ = {0, 1};
/* warps per tile */
wpt_ = {1, 1, 1};
@@ -617,9 +616,8 @@ void layouts::run(ir::module &mod) {
unsigned axis = red->get_axis();
// shape
auto shapes = arg->get_type()->get_block_shapes();
distributed_layout* layout = dynamic_cast<analysis::distributed_layout*>(get(arg));
shapes[axis] = layout->shape_per_cta(axis) / layout->contig_per_thread(axis);
scanline_layout *layout = get(arg)->to_scanline();
shapes[axis] = layout->mts(axis);
// create layout
layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_, tgt_);
tmp_[red] = id;