[CODEGEN] Various bugfixes and stability improvements in compiler backend (#240)

This commit is contained in:
Philippe Tillet
2021-08-30 11:50:35 -07:00
committed by GitHub
parent 85426dbaf7
commit 4ff3714d61
25 changed files with 568 additions and 399 deletions

View File

@@ -50,7 +50,6 @@ void allocation::run(ir::module &mod) {
J.erase(j_it);
}
}
// Build interference graph
std::map<shared_layout*, std::set<shared_layout*>> interferences;
for(shared_layout* x: V)
@@ -66,13 +65,10 @@ void allocation::run(ir::module &mod) {
&& XS.intersect(YS))
interferences[x].insert(y);
}
// Initialize colors
std::map<shared_layout*, int> colors;
for(shared_layout* X: V)
colors[X] = (X==V[0])?0:-1;
// First-fit graph coloring
std::vector<bool> available(V.size());
for(shared_layout* x: V){
@@ -87,7 +83,6 @@ void allocation::run(ir::module &mod) {
auto It = std::find(available.begin(), available.end(), true);
colors[x] = std::distance(available.begin(), It);
}
// Finalize allocation
for(shared_layout* x: V){
unsigned Adj = 0;
@@ -95,7 +90,6 @@ void allocation::run(ir::module &mod) {
Adj = std::max<unsigned>(Adj, starts[y] + y->get_size());
offsets_[x] = starts[x] + colors[x] * Adj;
}
// Save maximum size of induced memory space
allocated_size_ = 0;
for(shared_layout* x: V)

View File

@@ -105,17 +105,17 @@ void axes::update_graph_no_edge(ir::instruction *i) {
// Routes instruction `i` to the graph-update routine selected by i->get_id().
// Ids without an explicit case fall through to update_graph_elementwise(i).
// NOTE(review): the case list below appears twice; the first list names
// INST_RECOALESCE where the second names INST_CVT_LAYOUT. Presumably these are
// the before/after sides of a diff hunk whose +/- markers were lost. Duplicate
// case labels would not compile as literal C++ -- confirm which list is current.
void axes::update_graph(ir::instruction *i) {
switch (i->get_id()) {
case ir::INST_REDUCE: return update_graph_reduce(i);
case ir::INST_RESHAPE: return update_graph_reshape(i);
case ir::INST_SPLAT: return update_graph_no_edge(i);;
case ir::INST_TRANS: return update_graph_trans(i);
case ir::INST_BROADCAST: return update_graph_broadcast(i);
case ir::INST_DOT: return update_graph_dot(i);
case ir::INST_COPY_TO_SHARED: return update_graph_no_edge(i);
// the only case that passes an explicit second argument (false)
case ir::INST_MASKED_LOAD_ASYNC:return update_graph_elementwise(i, false);
case ir::INST_COPY_FROM_SHARED: return update_graph_no_edge(i);
case ir::INST_RECOALESCE: return update_graph_no_edge(i);
default: return update_graph_elementwise(i);
// second copy of the case list starts here (see NOTE(review) above)
case ir::INST_REDUCE: return update_graph_reduce(i);
case ir::INST_RESHAPE: return update_graph_reshape(i);
// the doubled ';' on the next line is a harmless empty statement
case ir::INST_SPLAT: return update_graph_no_edge(i);;
case ir::INST_TRANS: return update_graph_trans(i);
case ir::INST_BROADCAST: return update_graph_broadcast(i);
case ir::INST_DOT: return update_graph_dot(i);
case ir::INST_COPY_TO_SHARED: return update_graph_no_edge(i);
// the only case that passes an explicit second argument (false)
case ir::INST_MASKED_LOAD_ASYNC: return update_graph_elementwise(i, false);
case ir::INST_COPY_FROM_SHARED: return update_graph_no_edge(i);
case ir::INST_CVT_LAYOUT: return update_graph_no_edge(i);
default: return update_graph_elementwise(i);
}
// not reached: every case above, including default, returns
return;
}
@@ -135,11 +135,15 @@ std::vector<int> axes::get(ir::value *value) {
// Recomputes the axes information for module `mod`.
//
// Steps, in order:
//   1. clear graph_ and axes_ so nothing from a previous call survives;
//   2. call update_graph() on every instruction visited by
//      ir::for_each_instruction(mod, ...);
//   3. call graph_.connected_components() with nullptr as its first argument
//      and &axes_ as its second.
void axes::run(ir::module &mod) {
  // make graph
  graph_.clear();
  axes_.clear();
  ir::for_each_instruction(mod, [this](ir::instruction *x) {
    update_graph(x);
  });
  // find connected components
  graph_.connected_components(nullptr, &axes_);
}
}

View File

@@ -109,9 +109,6 @@ data_layout::data_layout(id_t id,
max_contiguous = curr;
}
}
bool is_recoalesce = false;
for(ir::value* v: values)
is_recoalesce = is_recoalesce || dynamic_cast<ir::recoalesce_inst*>(v);
if(max_contiguous.size() > 0){
std::sort(order_.begin(), order_.end(), [&](unsigned a, unsigned b) {
return max_contiguous[a] > max_contiguous[b];
@@ -129,6 +126,13 @@ int data_layout::find_axis(int to_find) const {
}
// Constructor for distributed_layout. It does no work of its own: all five
// arguments are forwarded, unchanged and in the same order, to the
// data_layout base-class constructor, and the body is empty.
distributed_layout::distributed_layout(id_t id,
                                       const std::vector<int> &axes,
                                       const std::vector<unsigned> &shape,
                                       const std::vector<ir::value *> &values,
                                       analysis::align* align)
  : data_layout(id, axes, shape, values, align) {
}
/* -------------------------------- *
* MMA Layout *
* -------------------------------- */
@@ -138,20 +142,11 @@ mma_layout::mma_layout(size_t num_warps,
const std::vector<unsigned>& shape,
const std::vector<ir::value *> &values,
analysis::align* align, target* tgt,
shared_layout *layout_a, shared_layout *layout_b): data_layout(MMA, axes, shape, values, align) {
shared_layout *layout_a, shared_layout *layout_b): distributed_layout(MMA, axes, shape, values, align) {
/* fragments per warp */
// try to make things as square as possible to maximize data re-use
if(tgt->as_nvidia()->sm() < 80){
fpw_ = {2, 2, 1};
// std::vector<int> fpw_nm1;
// unsigned num_fragments = std::min<unsigned>((shape_[0]/8)*(shape_[1]/8), 4);
// do {
// fpw_nm1 = fpw_;
// if(fpw_[0]*fpw_[1] < num_fragments)
// fpw_[0] = clamp(fpw_[0]*2, 1, shape_[0] / 8);
// if(fpw_[0]*fpw_[1] < num_fragments)
// fpw_[1] = clamp(fpw_[1]*2, 1, shape_[1] / 8);
// }while(fpw_nm1 != fpw_);
auto ord_a = layout_a->get_order();
auto ord_b = layout_b->get_order();
bool is_a_row = ord_a[0] != 0;
@@ -168,6 +163,7 @@ mma_layout::mma_layout(size_t num_warps,
spw_ = {16, 8, 1};
rep_ = {2, 2, 1};
}
order_ = {0, 1};
/* warps per tile */
// try to make things as square as possible to maximize data re-use
@@ -182,7 +178,7 @@ mma_layout::mma_layout(size_t num_warps,
}while(wpt_nm1 != wpt_);
/* shape per block */
spt_ = {spw_[0]*wpt_[0], spw_[1]*wpt_[1], 1};
shape_per_cta_ = {spw_[0]*wpt_[0], spw_[1]*wpt_[1], 1};
}
@@ -194,7 +190,7 @@ scanline_layout::scanline_layout(size_t num_warps,
const std::vector<int>& axes,
const std::vector<unsigned>& shape,
const std::vector<ir::value *> &values,
analysis::align* align, target *tgt): data_layout(SCANLINE, axes, shape, values, align){
analysis::align* align, target *tgt): distributed_layout(SCANLINE, axes, shape, values, align){
unsigned size = std::accumulate(shape_.begin(), shape_.end(), 1, std::multiplies<int>());
unsigned num_threads = tgt->is_gpu() ? num_warps * 32 : 1;
nts_.resize(shape_.size());
@@ -230,6 +226,10 @@ scanline_layout::scanline_layout(size_t num_warps,
mts_[i] = clamp(num_threads, 1, shape_[i] / nts_[i]);
num_threads = num_threads / mts_[i];
}
shape_per_cta_.resize(shape_.size());
for(size_t d = 0; d < shape_.size(); d++)
shape_per_cta_[d] = mts_[d]*nts_[d];
}
@@ -489,6 +489,9 @@ void layouts::create(size_t id, const std::vector<ir::value*>& values) {
void layouts::run(ir::module &mod) {
// make graph
graph_.clear();
layouts_.clear();
groups_.clear();
ir::for_each_instruction(mod, [this](ir::instruction* i) {
make_graph(i);
});
@@ -515,23 +518,18 @@ void layouts::run(ir::module &mod) {
layouts_[id] = new shared_layout(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_);
tmp_[red] = id;
}
if(auto *recoalasce = dynamic_cast<ir::recoalesce_inst*>(i)){
ir::value *val = recoalasce->get_operand(0);
mma_layout* in_layout = get(val)->to_mma();
scanline_layout* out_layout = get(i)->to_scanline();
if(!in_layout || !out_layout)
return;
if(auto *val = dynamic_cast<ir::cvt_layout_inst*>(i)){
distributed_layout* out_layout = dynamic_cast<distributed_layout*>(get(val));
distributed_layout* in_layout = dynamic_cast<distributed_layout*>(get(i->get_operand(0)));
id++;
ir::type::block_shapes_t in_shape = val->get_type()->get_block_shapes();
ir::type::block_shapes_t shape(in_shape.size());
size_t ld = out_layout->get_order(0);
shape[ld] = in_shape[ld];
for(size_t k = 0; k < in_shape.size(); k++)
if(k != ld)
shape[k] = in_layout->to_mma()->spt(k);
// create layout
layouts_[id] = new shared_layout(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_);
tmp_[recoalasce] = id;
size_t dim = val->get_type()->get_tile_rank();
ir::type::block_shapes_t shape(dim);
for(size_t k = 0; k < dim; k++){
shape[k] = std::max(in_layout->shape_per_cta(k),
out_layout->shape_per_cta(k));
}
layouts_[id] = new shared_layout(out_layout, axes_->get(val), shape, {val}, val->get_type()->get_scalar_ty(), align_);
tmp_[val] = id;
}
if(auto *atom = dynamic_cast<ir::atomic_inst*>(i)){
id++;