[CODEGEN][ANALYSIS] cleaning: moving towards better polymorphism for

tile layouts
This commit is contained in:
Philippe Tillet
2020-01-20 11:55:34 -05:00
parent fbf2a3f56f
commit 382ca2c745
11 changed files with 130 additions and 109 deletions

View File

@@ -50,31 +50,31 @@ public:
virtual void visit_layout_shared(layout_shared_t*) = 0; virtual void visit_layout_shared(layout_shared_t*) = 0;
}; };
class layout_hmma_884_t;
class layout_scanline_t;
class layout_shared_t;
struct layout_t { struct layout_t {
layout_t(layout_type_t _type, layout_t(layout_type_t _type,
const std::vector<int>& _axes, const std::vector<int>& _axes,
const std::vector<unsigned> &_shapes, const std::vector<unsigned> &_shapes,
const std::vector<ir::value *> &_values, const std::vector<ir::value *> &_values,
ir::type *_ty, ir::type *_ty,
size_t _id,
analysis::align* align); analysis::align* align);
// visitor
virtual void accept(layout_visitor* vst) = 0; virtual void accept(layout_visitor* vst) = 0;
// downcast
layout_hmma_884_t* to_hmma884();
layout_scanline_t* to_scanline();
layout_shared_t* to_shared();
layout_type_t type; layout_type_t type;
std::vector<int> axes; std::vector<int> axes;
std::vector<unsigned> shapes; std::vector<unsigned> shapes;
std::vector<ir::value*> values; std::vector<ir::value*> values;
std::vector<int> order; std::vector<int> order;
size_t id;
size_t size;
std::shared_ptr<double_buffer_info_t> double_buffer;
ir::type *ty; ir::type *ty;
size_t pad;
std::vector<int> mts;
std::vector<int> nts;
std::vector<int> fpw;
std::vector<int> wpt;
}; };
struct layout_hmma_884_t: public layout_t { struct layout_hmma_884_t: public layout_t {
@@ -83,9 +83,11 @@ struct layout_hmma_884_t: public layout_t {
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &_values, const std::vector<ir::value *> &_values,
ir::type *_ty, ir::type *_ty,
size_t _id,
analysis::align* align); analysis::align* align);
void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); } void accept(layout_visitor* vst) { vst->visit_layout_hmma_884(this); }
std::vector<int> fpw;
std::vector<int> wpt;
}; };
struct layout_scanline_t: public layout_t { struct layout_scanline_t: public layout_t {
@@ -94,9 +96,11 @@ struct layout_scanline_t: public layout_t {
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &values, const std::vector<ir::value *> &values,
ir::type *_ty, ir::type *_ty,
size_t _id,
analysis::align* align); analysis::align* align);
void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); } void accept(layout_visitor* vst) { vst->visit_layout_scanline(this); }
std::vector<int> mts;
std::vector<int> nts;
}; };
struct layout_shared_t: public layout_t { struct layout_shared_t: public layout_t {
@@ -105,9 +109,11 @@ struct layout_shared_t: public layout_t {
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &values, const std::vector<ir::value *> &values,
ir::type *ty, ir::type *ty,
size_t _id,
analysis::align* align); analysis::align* align);
void accept(layout_visitor* vst) { vst->visit_layout_shared(this); } void accept(layout_visitor* vst) { vst->visit_layout_shared(this); }
std::shared_ptr<double_buffer_info_t> double_buffer;
size_t size;
}; };
@@ -126,18 +132,6 @@ private:
void create(size_t id, const std::vector<ir::value*>& values); void create(size_t id, const std::vector<ir::value*>& values);
// size_t shared_tmp_req(ir::instruction* i) {
// switch(i->get_id()) {
// case ir::INST_REDUCE: {
// ir::reduce_inst *red = (ir::reduce_inst*)i;
// ir::type *ty = red->get_type();
// }
// default: return 0;
// }
// }
public: public:
// constructor // constructor
layout(analysis::axes *axes, analysis::align *align, size_t num_warps); layout(analysis::axes *axes, analysis::align *align, size_t num_warps);
@@ -146,8 +140,8 @@ public:
unsigned layout_of(ir::value *value) const; unsigned layout_of(ir::value *value) const;
const std::vector<ir::value*>& values_of(unsigned id) const; const std::vector<ir::value*>& values_of(unsigned id) const;
size_t num_layouts() const; size_t num_layouts() const;
const layout_t* get(size_t id) const; layout_t* get(size_t id);
const layout_t* get(ir::value *v) const; layout_t* get(ir::value *v);
std::map<size_t, layout_t*> &get_all(); std::map<size_t, layout_t*> &get_all();
size_t tmp(ir::instruction* i); size_t tmp(ir::instruction* i);

View File

@@ -42,14 +42,14 @@ struct segment {
class liveness { class liveness {
private: private:
typedef std::map<layout_t*, segment> intervals_map_t; typedef std::map<layout_shared_t*, segment> intervals_map_t;
public: public:
// constructor // constructor
liveness(layout *l): layouts_(l){ } liveness(layout *l): layouts_(l){ }
// accessors // accessors
const intervals_map_t& get() const { return intervals_; } const intervals_map_t& get() const { return intervals_; }
segment get(layout_t* v) const { return intervals_.at(v); } segment get(layout_shared_t* v) const { return intervals_.at(v); }
// run // run
void run(ir::module &mod); void run(ir::module &mod);

View File

@@ -71,7 +71,8 @@ public:
class machine_layout_shared_t: public machine_layout_t { class machine_layout_shared_t: public machine_layout_t {
public: public:
machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr, analysis::layout_t* layout, machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, Value *&sh_mem_ptr,
analysis::layout_shared_t* layout,
std::map<ir::value *, Value *>& vmap, std::map<ir::value *, Value *>& vmap,
std::map<ir::value *, tile *>& tmap); std::map<ir::value *, tile *>& tmap);
@@ -82,7 +83,7 @@ public:
target *tgt_; target *tgt_;
analysis::allocation* alloc_; analysis::allocation* alloc_;
Value *&sh_mem_ptr_; Value *&sh_mem_ptr_;
analysis::layout_t* layout_; analysis::layout_shared_t* layout_;
std::map<ir::value *, Value *>& vmap_; std::map<ir::value *, Value *>& vmap_;
std::map<ir::value *, tile *>& tmap_; std::map<ir::value *, tile *>& tmap_;

View File

@@ -15,22 +15,22 @@ void allocation::run(ir::module &mod) {
using std::min; using std::min;
typedef std::multimap<unsigned, segment> triples_map_type; typedef std::multimap<unsigned, segment> triples_map_type;
std::vector<layout_t*> I; std::vector<layout_shared_t*> I;
for(auto x: liveness_->get()) for(auto x: liveness_->get())
I.push_back(x.first); I.push_back(x.first);
std::vector<layout_t*> J = I; std::vector<layout_shared_t*> J = I;
triples_map_type H; triples_map_type H;
H.insert({0, segment{0, INT_MAX}}); H.insert({0, segment{0, INT_MAX}});
std::vector<layout_t*> V; std::vector<layout_shared_t*> V;
std::map<layout_t*, unsigned> starts; std::map<layout_shared_t*, unsigned> starts;
while(!J.empty()){ while(!J.empty()){
auto h_it = H.begin(); auto h_it = H.begin();
unsigned w = h_it->first; unsigned w = h_it->first;
segment xh = h_it->second; segment xh = h_it->second;
H.erase(h_it); H.erase(h_it);
auto j_it = std::find_if(J.begin(), J.end(), [&](layout_t* JJ){ auto j_it = std::find_if(J.begin(), J.end(), [&](layout_shared_t* JJ){
segment xj = liveness_->get(JJ); segment xj = liveness_->get(JJ);
bool res = xj.intersect(xh); bool res = xj.intersect(xh);
for(auto val: H) for(auto val: H)
@@ -52,10 +52,10 @@ void allocation::run(ir::module &mod) {
} }
// Build interference graph // Build interference graph
std::map<layout_t*, std::set<layout_t*>> interferences; std::map<layout_shared_t*, std::set<layout_shared_t*>> interferences;
for(layout_t* x: V) for(layout_shared_t* x: V)
for(layout_t* y: V){ for(layout_shared_t* y: V){
if(x->id == y->id) if(x == y)
continue; continue;
unsigned X0 = starts[x], Y0 = starts[y]; unsigned X0 = starts[x], Y0 = starts[y];
unsigned NX = x->size; unsigned NX = x->size;
@@ -68,17 +68,17 @@ void allocation::run(ir::module &mod) {
} }
// Initialize colors // Initialize colors
std::map<layout_t*, int> colors; std::map<layout_shared_t*, int> colors;
for(layout_t* X: V) for(layout_shared_t* X: V)
colors[X] = (X->id==V[0]->id)?0:-1; colors[X] = (X==V[0])?0:-1;
// First-fit graph coloring // First-fit graph coloring
std::vector<bool> available(V.size()); std::vector<bool> available(V.size());
for(layout_t* x: V){ for(layout_shared_t* x: V){
// Non-neighboring colors are available // Non-neighboring colors are available
std::fill(available.begin(), available.end(), true); std::fill(available.begin(), available.end(), true);
for(layout_t* Y: interferences[x]){ for(layout_shared_t* Y: interferences[x]){
int color = colors[Y]; int color = colors[Y];
if(color >= 0) if(color >= 0)
available[color] = false; available[color] = false;
@@ -89,16 +89,16 @@ void allocation::run(ir::module &mod) {
} }
// Finalize allocation // Finalize allocation
for(layout_t* x: V){ for(layout_shared_t* x: V){
unsigned Adj = 0; unsigned Adj = 0;
for(layout_t* y: interferences[x]) for(layout_shared_t* y: interferences[x])
Adj = std::max<unsigned>(Adj, starts[y] + y->size); Adj = std::max<unsigned>(Adj, starts[y] + y->size);
offsets_[x] = starts[x] + colors[x] * Adj; offsets_[x] = starts[x] + colors[x] * Adj;
} }
// Save maximum size of induced memory space // Save maximum size of induced memory space
allocated_size_ = 0; allocated_size_ = 0;
for(layout_t* x: V) for(layout_shared_t* x: V)
allocated_size_ = std::max<size_t>(allocated_size_, starts[x] + x->size); allocated_size_ = std::max<size_t>(allocated_size_, starts[x] + x->size);
} }

View File

@@ -75,11 +75,11 @@ bool is_hmma_c(ir::value *v){
return result; return result;
} }
const layout_t* layout::get(size_t id) const { layout_t* layout::get(size_t id) {
return layouts_.at(id); return layouts_.at(id);
} }
const layout_t* layout::get(ir::value *v) const { layout_t* layout::get(ir::value *v) {
return layouts_.at(groups_.at(v)); return layouts_.at(groups_.at(v));
} }
@@ -140,8 +140,7 @@ layout_t::layout_t(layout_type_t _type,
const std::vector<int> &_axes, const std::vector<int> &_axes,
const std::vector<unsigned> &_shapes, const std::vector<unsigned> &_shapes,
const std::vector<ir::value *> &_values, ir::type *_ty, const std::vector<ir::value *> &_values, ir::type *_ty,
size_t _id, analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), ty(_ty) {
analysis::align* align): type(_type), axes(_axes), shapes(_shapes), values(_values), id(_id), ty(_ty) {
// io pointer // io pointer
std::set<ir::value*> ptr; std::set<ir::value*> ptr;
for(ir::value* v: values) for(ir::value* v: values)
@@ -159,6 +158,21 @@ layout_t::layout_t(layout_type_t _type,
} }
} }
// downcast
layout_hmma_884_t* layout_t::to_hmma884() {
assert(type == HMMA_884);
return static_cast<layout_hmma_884_t*>(this);
}
layout_scanline_t* layout_t::to_scanline() {
assert(type == SCANLINE);
return static_cast<layout_scanline_t*>(this);
}
layout_shared_t* layout_t::to_shared() {
assert(type == SHARED);
return static_cast<layout_shared_t*>(this);
}
inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) { inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
return std::min(std::max(x, lo), hi); return std::min(std::max(x, lo), hi);
@@ -167,8 +181,8 @@ inline unsigned clamp(unsigned x, unsigned lo, unsigned hi) {
layout_hmma_884_t::layout_hmma_884_t(size_t num_warps, layout_hmma_884_t::layout_hmma_884_t(size_t num_warps,
const std::vector<int>& _axes, const std::vector<int>& _axes,
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &values, ir::type *_ty, size_t _id, const std::vector<ir::value *> &values, ir::type *_ty,
analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, _id, align) { analysis::align* align): layout_t(HMMA_884, _axes, _shapes, values, _ty, align) {
unsigned shape_0 = shapes[0]; unsigned shape_0 = shapes[0];
unsigned shape_1 = shapes[1]; unsigned shape_1 = shapes[1];
/* fragments per warp */ /* fragments per warp */
@@ -210,8 +224,7 @@ layout_scanline_t::layout_scanline_t(size_t num_warps,
const std::vector<int>& _axes, const std::vector<int>& _axes,
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &values, ir::type *_ty, const std::vector<ir::value *> &values, ir::type *_ty,
size_t _id, analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, align){
analysis::align* align): layout_t(SCANLINE, _axes, _shapes, values, _ty, _id, align){
unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies<int>()); unsigned size = std::accumulate(shapes.begin(), shapes.end(), 1, std::multiplies<int>());
unsigned num_threads = num_warps * 32; unsigned num_threads = num_warps * 32;
nts.resize(shapes.size()); nts.resize(shapes.size());
@@ -295,8 +308,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg,
const std::vector<unsigned>& _shapes, const std::vector<unsigned>& _shapes,
const std::vector<ir::value *> &values, const std::vector<ir::value *> &values,
ir::type *ty, ir::type *ty,
size_t _id, analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, align) {
analysis::align* align): layout_t(SHARED, _axes, _shapes, values, ty, _id, align) {
size = 0; size = 0;
@@ -335,7 +347,7 @@ layout_shared_t::layout_shared_t(const layout_t *arg,
// else // else
// order = row; // order = row;
// padding // padding
pad = 0; size_t pad = 0;
if(hmma_dot_a){ if(hmma_dot_a){
bool row = is_trans(hmma_dot_a) ^ order[0] != 0; bool row = is_trans(hmma_dot_a) ^ order[0] != 0;
pad = 24 - shapes[row ? 0 : 1] % 32; pad = 24 - shapes[row ? 0 : 1] % 32;
@@ -375,15 +387,15 @@ void layout::create(size_t id, const std::vector<ir::value*>& values) {
}); });
// type // type
if(it_hmma_c != values.end()) if(it_hmma_c != values.end())
layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_hmma_884_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_);
else if(it_cts != values.end()){ else if(it_cts != values.end()){
ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts; ir::copy_to_shared_inst *cts = (ir::copy_to_shared_inst*)*it_cts;
ir::value *arg = cts->get_operand(0); ir::value *arg = cts->get_operand(0);
create(groups_.at(arg), values_.at(groups_.at(arg))); create(groups_.at(arg), values_.at(groups_.at(arg)));
layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_shared_t(get(arg), axes, shapes, values, largest->get_type()->get_scalar_ty(), align_);
} }
else else
layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_scanline_t(num_warps_, axes, shapes, values, largest->get_type()->get_scalar_ty(), align_);
} }
void layout::run(ir::module &mod) { void layout::run(ir::module &mod) {
@@ -410,18 +422,18 @@ void layout::run(ir::module &mod) {
// shape // shape
auto shapes = arg->get_type()->get_tile_shapes(); auto shapes = arg->get_type()->get_tile_shapes();
unsigned shape_ax = shapes[axis]; unsigned shape_ax = shapes[axis];
const layout_t *layout = get(arg); layout_scanline_t *layout = get(arg)->to_scanline();
unsigned per_thread = layout->nts[axis]; unsigned per_thread = layout->nts[axis];
unsigned depth = shape_ax / per_thread; unsigned depth = shape_ax / per_thread;
shapes[axis] = depth; shapes[axis] = depth;
// create layout // create layout
layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_shared_t(layout, axes_->get(arg), shapes, {red}, red->get_type()->get_scalar_ty(), align_);
tmp_[red] = id; tmp_[red] = id;
} }
if(auto *recoalasce = dynamic_cast<ir::recoalesce_inst*>(i)){ if(auto *recoalasce = dynamic_cast<ir::recoalesce_inst*>(i)){
ir::value *val = recoalasce->get_operand(0); ir::value *val = recoalasce->get_operand(0);
const layout_t* in_layout = get(val); layout_t* in_layout = get(val);
const layout_t* out_layout = get(i); layout_t* out_layout = get(i);
if(in_layout->type != HMMA_884) if(in_layout->type != HMMA_884)
return; return;
id++; id++;
@@ -431,14 +443,14 @@ void layout::run(ir::module &mod) {
shape[ld] = in_shape[ld]; shape[ld] = in_shape[ld];
for(size_t k = 0; k < in_shape.size(); k++) for(size_t k = 0; k < in_shape.size(); k++)
if(k != ld) if(k != ld)
shape[k] = 4*in_layout->fpw[k]*in_layout->wpt[k]; shape[k] = 4*in_layout->to_hmma884()->fpw[k]*in_layout->to_hmma884()->wpt[k];
// create layout // create layout
layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_shared_t(out_layout, axes_->get(val), shape, {recoalasce}, val->get_type()->get_scalar_ty(), align_);
tmp_[recoalasce] = id; tmp_[recoalasce] = id;
} }
if(auto *atom = dynamic_cast<ir::atomic_cas_inst*>(i)){ if(auto *atom = dynamic_cast<ir::atomic_cas_inst*>(i)){
id++; id++;
layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), id, align_); layouts_[id] = new layout_shared_t(nullptr, {}, {1}, {atom}, atom->get_type()->get_scalar_ty(), align_);
tmp_[atom] = id; tmp_[atom] = id;
} }
}); });

View File

@@ -27,9 +27,9 @@ void liveness::run(ir::module &mod) {
// create live intervals // create live intervals
for(auto &x: layouts_->get_all()) { for(auto &x: layouts_->get_all()) {
layout_t* layout = x.second; if(x.second->type != SHARED)
if(layout->type != SHARED)
continue; continue;
layout_shared_t* layout = x.second->to_shared();
// users // users
std::set<ir::user*> users; std::set<ir::user*> users;
for(ir::value *v: layout->values){ for(ir::value *v: layout->values){

View File

@@ -655,13 +655,14 @@ void generator::visit_hmma_dot(ir::dot_inst* dot, shared_tile *TA, shared_tile *
"{$8, $9}, " "{$8, $9}, "
"{$10, $11}, " "{$10, $11}, "
"{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false); "{$0, $1, $2, $3, $4, $5, $6, $7};", "=f,=f,=f,=f,=f,=f,=f,=f,r,r,r,r,0,1,2,3,4,5,6,7", false);
analysis::layout_hmma_884_t* layout = layouts_->get(dot)->to_hmma884();
unsigned fpw_0 = layouts_->get(dot)->fpw.at(0); unsigned fpw_0 = layout->fpw.at(0);
unsigned fpw_1 = layouts_->get(dot)->fpw.at(1); unsigned fpw_1 = layout->fpw.at(1);
unsigned wts_0 = fpw_0 * 8; unsigned wts_0 = fpw_0 * 8;
unsigned wts_1 = fpw_1 * 8; unsigned wts_1 = fpw_1 * 8;
unsigned wpt_0 = layouts_->get(dot)->wpt.at(0); unsigned wpt_0 = layout->wpt.at(0);
unsigned wpt_1 = layouts_->get(dot)->wpt.at(1); unsigned wpt_1 = layout->wpt.at(1);
unsigned stride_rep_i = wpt_0 * wts_0; unsigned stride_rep_i = wpt_0 * wts_0;
unsigned stride_rep_j = wpt_1 * wts_1; unsigned stride_rep_j = wpt_1 * wts_1;
unsigned num_rep_i = shapes[0] / stride_rep_i; unsigned num_rep_i = shapes[0] / stride_rep_i;
@@ -925,8 +926,8 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) {
// pointer to temporary shared memory // pointer to temporary shared memory
Type *ty = llvm_type(rc->get_type()->get_scalar_ty(), *ctx_); Type *ty = llvm_type(rc->get_type()->get_scalar_ty(), *ctx_);
// layouts // layouts
const analysis::layout_t* in_layout = layouts_->get(op); analysis::layout_hmma_884_t* in_layout = layouts_->get(op)->to_hmma884();
const analysis::layout_t* out_layout = layouts_->get(rc); analysis::layout_scanline_t* out_layout = layouts_->get(rc)->to_scanline();
// machine tiles // machine tiles
distributed_tile *in_dt = (distributed_tile*)(tmap_.at(op)); distributed_tile *in_dt = (distributed_tile*)(tmap_.at(op));
distributed_tile *out_dt = (distributed_tile*)(tmap_.at(rc)); distributed_tile *out_dt = (distributed_tile*)(tmap_.at(rc));
@@ -1026,14 +1027,14 @@ void generator::visit_recoalesce_inst(ir::recoalesce_inst* rc) {
void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) { void generator::visit_copy_to_shared_inst(ir::copy_to_shared_inst* cts) {
unsigned vector_size = 1; unsigned vector_size = 1;
auto x_order = layouts_->get(cts)->order;
ir::value *arg = cts->get_operand(0); ir::value *arg = cts->get_operand(0);
auto arg_order = layouts_->get(arg)->order; analysis::layout_shared_t* out_layout = layouts_->get(cts)->to_shared();
analysis::layout_scanline_t* in_layout = layouts_->get(arg)->to_scanline();
auto out_order = out_layout->order;
auto in_order = in_layout->order;
// tiles // tiles
if(x_order == arg_order){ if(out_order == in_order)
size_t ld = arg_order[0]; vector_size = in_layout->nts.at(in_order[0]);
vector_size = layouts_->get(arg)->nts.at(ld);
}
std::map<unsigned, Value*> packets; std::map<unsigned, Value*> packets;
for_each(arg, [&](indices_t idx){ for_each(arg, [&](indices_t idx){

View File

@@ -72,7 +72,7 @@ inline int32_t ceil(int32_t num, int32_t div){
machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc, machine_layout_shared_t::machine_layout_shared_t(Module *mod, Builder *builder, target *tgt, analysis::allocation* alloc,
Value *&sh_mem_ptr, analysis::layout_t *layout, Value *&sh_mem_ptr, analysis::layout_shared_t *layout,
std::map<ir::value *, Value *>& vmap, std::map<ir::value *, Value *>& vmap,
std::map<ir::value *, tile *>& tmap) std::map<ir::value *, tile *>& tmap)
: mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) { : mod_(mod), builder_(builder), tgt_(tgt), alloc_(alloc), sh_mem_ptr_(sh_mem_ptr), layout_(layout), vmap_(vmap), tmap_(tmap) {
@@ -132,7 +132,10 @@ machine_layout_distributed_t::machine_layout_distributed_t(Module *mod, Builder
tile *machine_layout_distributed_t::create(ir::value *v) { tile *machine_layout_distributed_t::create(ir::value *v) {
Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext()); Type *ty = llvm_type(v->get_type()->get_scalar_ty(), builder_->getContext());
const auto &shapes = v->get_type()->get_tile_shapes(); const auto &shapes = v->get_type()->get_tile_shapes();
std::vector<distributed_axis> axes(shapes.size()); size_t rank = shapes.size();
std::vector<distributed_axis> axes(rank);
std::vector<int> order(rank);
// compute axes
for(size_t d = 0; d < shapes.size(); d++){ for(size_t d = 0; d < shapes.size(); d++){
if(shapes[d] > 1){ if(shapes[d] > 1){
unsigned x = a_axes_->get(v, d); unsigned x = a_axes_->get(v, d);
@@ -143,7 +146,22 @@ tile *machine_layout_distributed_t::create(ir::value *v) {
axes[d].values = {builder_->getInt32(0)}; axes[d].values = {builder_->getInt32(0)};
} }
} }
return new distributed_tile(ty, shapes, layout_->order, axes, *builder_); // compute order
std::iota(order.begin(), order.end(), 0);
auto cmp = [&](int x, int y) {
unsigned axx = a_axes_->get(v, x);
unsigned axy = a_axes_->get(v, y);
auto itx = std::find(layout_->axes.begin(), layout_->axes.end(), axx);
auto ity = std::find(layout_->axes.begin(), layout_->axes.end(), axy);
size_t posx = std::distance(layout_->axes.begin(), itx);
size_t posy = std::distance(layout_->axes.begin(), ity);
if(posx < rank && posy < rank)
return layout_->order[posx] < layout_->order[posy];
return false;
};
std::sort(order.begin(), order.end(), cmp);
return new distributed_tile(ty, shapes, order, axes, *builder_);
} }
machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder, machine_layout_hmma_884_t::machine_layout_hmma_884_t(Module *mod, Builder *builder,

View File

@@ -11,13 +11,6 @@ using namespace llvm;
/* Distributed Tile */ /* Distributed Tile */
void distributed_tile::init_indices() { void distributed_tile::init_indices() {
std::vector<size_t> id(axes_.size(), 0); std::vector<size_t> id(axes_.size(), 0);
// create iteration order
std::vector<size_t> order(id.size());
std::iota(order.begin(), order.end(), 0);
auto cmp = [&](int x, int y) {
return order_[x] < order_[y];
};
std::sort(order.begin(), order.end(), cmp);
// build // build
size_t k = 0; size_t k = 0;
while(true) { while(true) {
@@ -28,12 +21,12 @@ void distributed_tile::init_indices() {
indices_[current] = sz; indices_[current] = sz;
values_[current] = nullptr; values_[current] = nullptr;
ordered_indices_.push_back(current); ordered_indices_.push_back(current);
id[order[0]]++; id[order_[0]]++;
while(id[order[k]] == axes_[order[k]].values.size()){ while(id[order_[k]] == axes_[order_[k]].values.size()){
if(k == id.size() - 1) if(k == id.size() - 1)
return; return;
id[order[k++]] = 0; id[order_[k++]] = 0;
id[order[k]]++; id[order_[k]]++;
} }
k = 0; k = 0;
} }

View File

@@ -37,7 +37,7 @@ void membar::add_reference(ir::value *v, interval_vec_t &res){
return; return;
if(alloc_->has_offset(layouts_->get(v))){ if(alloc_->has_offset(layouts_->get(v))){
unsigned offset = alloc_->offset(layouts_->get(v)); unsigned offset = alloc_->offset(layouts_->get(v));
unsigned size = layouts_->get(v)->size; unsigned size = layouts_->get(v)->to_shared()->size;
res.push_back(interval_t(offset, offset + size)); res.push_back(interval_t(offset, offset + size));
} }
} }
@@ -119,12 +119,14 @@ void membar::run(ir::module &mod) {
// without needing synchronization // without needing synchronization
std::set<ir::value*> safe_war; std::set<ir::value*> safe_war;
for(const auto& x: layouts_->get_all()){ for(const auto& x: layouts_->get_all()){
if(x.second->double_buffer){ if(x.second->type != analysis::SHARED)
auto info = *x.second->double_buffer; continue;
for(ir::value *v: x.second->values) analysis::layout_shared_t* layout = x.second->to_shared();
if(v != info.phi) if(!layout->double_buffer)
safe_war.insert(v); continue;
} for(ir::value *v: layout->values)
if(v != layout->double_buffer->phi)
safe_war.insert(v);
} }

View File

@@ -10,11 +10,11 @@ int main() {
typedef std::tuple<std::vector<int>, bool, bool, int, int, int> config_t; typedef std::tuple<std::vector<int>, bool, bool, int, int, int> config_t;
std::vector<config_t> configs; std::vector<config_t> configs;
for(auto ord: std::vector<std::vector<int>>{{1, 0}}) for(auto ord: std::vector<std::vector<int>>{{1, 0}})
for(auto x: std::vector<std::array<bool, 2>>{{false, false}}){ for(auto x: std::vector<std::array<bool, 2>>{{false, false}, {true, false}}){
std::vector<config_t> tmp = { std::vector<config_t> tmp = {
// config_t{ord, x[0], x[1], 512, 512, 512}, config_t{ord, x[0], x[1], 512, 512, 512},
// config_t{ord, x[0], x[1], 1024, 1024, 1024}, config_t{ord, x[0], x[1], 2048, 2048, 2048},
config_t{ord, x[0], x[1], 127008, 768, 576}, // config_t{ord, x[0], x[1], 127008, 768, 576},
// config_t{ord, x[0], x[1], 8192, 8192, 8192} // config_t{ord, x[0], x[1], 8192, 8192, 8192}
// config_t{ord, x[0], x[1], 16, 2048, 2048}, // config_t{ord, x[0], x[1], 16, 2048, 2048},
// config_t{ord, x[0], x[1], 32, 2048, 2048}, // config_t{ord, x[0], x[1], 32, 2048, 2048},
@@ -36,7 +36,7 @@ int main() {
for(const auto& c: configs){ for(const auto& c: configs){
std::tie(ord, AT, BT, M, N, K) = c; std::tie(ord, AT, BT, M, N, K) = c;
std::cout << "// " << c ; std::cout << "// " << c ;
for(auto perf: bench_dot(stream, HALF, AT, BT, M, N, K, ord, ord)) for(auto perf: bench_dot(stream, FLOAT, AT, BT, M, N, K, ord, ord))
std::cout << ", " << perf << std::flush; std::cout << ", " << perf << std::flush;
std::cout << std::endl; std::cout << std::endl;
} }