Revert "[BACKEND] Various bug fixes; making reductions faster (#533)".
This is a more stable commit that produce bitwise identical code to earlier versions. Using commits after this one may lead to slightly different numerics
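The "slightly different numerics" caveat is the usual floating-point story: re-tiling a reduction changes the order in which partial sums are combined, and float addition is not associative. A minimal standalone illustration, not taken from the Triton codebase:

```cpp
#include <cstdio>

int main() {
    // Summing the same three values in two different orders: float
    // addition is not associative, so a re-tiled reduction can
    // legitimately produce bitwise-different results.
    float a = 1e8f, b = -1e8f, c = 1.0f;
    float left_to_right = (a + b) + c;  // 1.0f
    float right_to_left = a + (b + c);  // 0.0f: c is absorbed into b's magnitude
    printf("%.1f vs %.1f\n", left_to_right, right_to_left);
}
```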
```diff
@@ -209,15 +209,14 @@ mma_layout::mma_layout(size_t num_warps,
     rep_ = {2*pack_size_0, 2*pack_size_1, 1};
     spw_ = {fpw_[0]*4*rep_[0], fpw_[1]*4*rep_[1], 1};
-    contig_per_thread_ = {1, 1};
     order_ = {0, 1};
   }
   else{
     // fpw_ = {1, 1, 1};
     spw_ = mma_instr_shape_.at(tensor_core_type_); // e.g., {16, 8, 16} for f32.f16.f16.f32
-    contig_per_thread_ = {1, 2};
     order_ = {1, 0};
     // rep_ = {2, 2, 1};
   }
+  order_ = {0, 1};
 
   /* warps per tile */
   wpt_ = {1, 1, 1};
```
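In the first hunk, the pre-Ampere branch derives the shape covered per warp from fragments per warp (fpw_) and replication (rep_), while the newer branch reads a fixed per-instruction shape such as {16, 8, 16}. A sketch of that arithmetic; the helper and the example fpw_/rep_ values are assumptions for illustration, not Triton API:

```cpp
#include <array>
#include <cstdio>

// Hypothetical helper mirroring the two branches above: pre-sm80
// derives shape-per-warp from fpw and rep, sm80+ uses a fixed
// per-instruction shape such as {16, 8, 16} for f32.f16.f16.f32.
std::array<int, 3> shape_per_warp(bool pre_sm80,
                                  std::array<int, 3> fpw,
                                  std::array<int, 3> rep) {
    if (pre_sm80)
        return {fpw[0] * 4 * rep[0], fpw[1] * 4 * rep[1], 1};
    return {16, 8, 16};  // mma_instr_shape_ entry in the real code
}

int main() {
    // e.g. fpw = {2, 2, 1} and rep = {2, 2, 1} give a 16x16 tile per warp.
    auto spw = shape_per_warp(true, {2, 2, 1}, {2, 2, 1});
    printf("%d x %d\n", spw[0], spw[1]);  // 16 x 16
}
```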
```diff
@@ -617,9 +616,8 @@ void layouts::run(ir::module &mod) {
       unsigned axis = red->get_axis();
       // shape
       auto shapes = arg->get_type()->get_block_shapes();
-      distributed_layout* layout = dynamic_cast<analysis::distributed_layout*>(get(arg));
-      shapes[axis] = layout->shape_per_cta(axis) / layout->contig_per_thread(axis);
-
+      scanline_layout *layout = get(arg)->to_scanline();
+      shapes[axis] = layout->mts(axis);
       // create layout
       layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_, tgt_);
       tmp_[red] = id;
```
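The second hunk swaps #533's generalized scratch-shape computation, shape_per_cta(axis) / contig_per_thread(axis) on any distributed layout, back to the scanline-specific mts(axis). Assuming mts(axis) threads tile the axis and each thread owns contig_per_thread(axis) contiguous elements, shape_per_cta(axis) is their product, so the two expressions coincide for scanline layouts. A minimal sketch under those assumptions; ScanlineAxis is a hypothetical stand-in, not Triton's class:

```cpp
#include <cassert>

// Hypothetical stand-in for a scanline layout along one axis, assuming
// shape_per_cta = threads_on_axis * contiguous_elements_per_thread.
struct ScanlineAxis {
    int mts;  // threads tiling this axis
    int nts;  // contiguous elements owned by each thread
    int shape_per_cta() const { return mts * nts; }
    int contig_per_thread() const { return nts; }
};

int main() {
    ScanlineAxis ax{32, 4};
    // #533's generalized expression vs. the reverted scanline-specific one:
    assert(ax.shape_per_cta() / ax.contig_per_thread() == ax.mts);
    return 0;
}
```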