[code generation] added double-buffering

This commit is contained in:
Philippe Tillet
2019-02-12 11:47:52 -05:00
parent e45d6bbb60
commit 41aad4800c
8 changed files with 204 additions and 47 deletions

View File

@@ -212,7 +212,7 @@ int main() {
manager.run(llvm_module);
std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true));
// std::cout << src << std::endl;
std::cout << src << std::endl;
// compile machine code
CUdevice cu_device;
@@ -222,7 +222,7 @@ int main() {
CUstream cu_stream;
int major, minor;
compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test");
// std::cout << src << std::endl;
std::cout << src << std::endl;
// execute machine code
// Allocate buffers

View File

@@ -23,6 +23,9 @@ public:
allocation(liveness *live, buffer_info_pass *buffer_info)
: liveness_(live), buffer_info_(buffer_info){ }
// utilities
unsigned get_num_bytes(ir::value *x);
// accessors
unsigned get_offset(ir::value *x) const { return offsets_.at(x); }
unsigned get_allocated_size() const { return allocated_size_; }

View File

@@ -0,0 +1,34 @@
#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
#include <set>
#include <map>
namespace tdl {

// Forward declarations: only pointers/references to these IR types are used here.
namespace ir {
  class module;
  class value;
}

namespace codegen {

/// Analysis pass that records, for the tile-typed values of a module, whether
/// they were classified as shared, whether they were classified as
/// double-buffered, and which value (if any) they are attached to.
class buffer_info_pass {
public:
  /// Walks the instructions of `mod` and fills the lookup tables below.
  void run(ir::module &mod);

  // queries
  /// True when `x` is present in the double-buffered set.
  bool is_double(ir::value *x);
  /// True when `x` is present in the shared set.
  bool is_shared(ir::value *x);
  /// Value recorded as the reference of `x`, or a null pointer when none was recorded.
  ir::value *get_reference(ir::value *x);

private:
  /// Values classified as shared.
  std::set<ir::value*> shared_;
  /// Values classified as double-buffered.
  std::set<ir::value*> double_;
  /// Maps a value to the value it is attached to.
  std::map<ir::value*, ir::value*> refs_;
};

}
}
#endif

View File

@@ -0,0 +1,27 @@
#ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H
#define TDL_INCLUDE_CODEGEN_VECTORIZE_H
namespace tdl {

// Forward declaration: the pass only takes the module by reference.
namespace ir {
  class module;
}

namespace codegen {

// Tuning parameters; only held through a pointer in this header.
class tune;

/// Pass object holding a pointer to tuning parameters and exposing a
/// module-level run() entry point.
class vectorize {
public:
  /// Stores `params`; the pointer is not dereferenced by the constructor.
  vectorize(tune *params) : params_(params) { }

  /// Applies the pass to `mod`.
  void run(ir::module &mod);

private:
  /// Not owned: this class never deletes it.
  tune *params_;
};

}
}
#endif

View File

@@ -11,18 +11,18 @@
namespace tdl{
namespace codegen{
// Size reserved for `x`: the tile bit-width reported by its type, multiplied
// by two when the buffer-info pass flags `x` as double-buffered.
// NOTE(review): the value of get_tile_bitwidth() is returned unscaled although
// this function is named get_num_bytes -- confirm the unit callers expect.
unsigned allocation::get_num_bytes(ir::value *x) {
  const unsigned tile_size = x->get_type()->get_tile_bitwidth();
  const unsigned num_copies = buffer_info_->is_double(x) ? 2 : 1;
  return tile_size * num_copies;
}
void allocation::run(){
using std::max;
using std::min;
typedef std::multimap<unsigned, segment> triples_map_type;
auto get_num_bytes = [&](ir::value *x){
unsigned result = x->get_type()->get_tile_bitwidth();
if(buffer_info_->is_double(x))
result *= 2;
return result;
};
std::vector<ir::value *> I;
for(auto x: liveness_->intervals())
I.push_back(x.first);

View File

@@ -0,0 +1,65 @@
#include "codegen/buffer_info.h"
#include "ir/module.h"
#include "ir/function.h"
#include "ir/basic_block.h"
#include "ir/instructions.h"
#include "ir/type.h"
namespace tdl {
namespace codegen{
// Run the pass on a module.
//
// Visits every tile-typed instruction of every function in `mod` and records:
//  - shared_: each copy_to_shared instruction, and each phi node whose
//             incoming values are all copy_to_shared instructions;
//  - double_: each shared phi node that has an incoming edge from its own
//             parent block;
//  - refs_:   for each incoming value of a shared phi node, the phi itself.
//
// Double-buffering and reference information is only recorded for phi nodes
// that were classified as shared. Recording it for other phi nodes would give
// a copy_to_shared value feeding a mixed phi a non-null reference even though
// that phi is not a shared buffer it could be folded into.
void buffer_info_pass::run(ir::module &mod) {
  for(ir::function *fn: mod.get_function_list())
  for(ir::basic_block *block: fn->blocks())
  for(ir::instruction *i: block->get_inst_list()) {
    // only tiles are of interest
    if(!i->get_type()->is_tile_ty())
      continue;
    // handle phi
    if(auto *phi = dynamic_cast<ir::phi_node*>(i)){
      // shared: every incoming value is a shared copy
      // double: at least one incoming edge comes from the phi's own block
      bool is_shared = true;
      bool is_double = false;
      for(unsigned n = 0; n < phi->get_num_incoming(); n++){
        ir::value *inc_val = phi->get_incoming_value(n);
        ir::value *inc_block = phi->get_incoming_block(n);
        is_shared = is_shared && dynamic_cast<ir::copy_to_shared_inst*>(inc_val);
        is_double = is_double || inc_block == phi->get_parent();
      }
      if(is_shared){
        // add to shared
        shared_.insert(phi);
        // add to double-buffered
        if(is_double)
          double_.insert(phi);
        // set references of input
        for(unsigned n = 0; n < phi->get_num_incoming(); n++){
          ir::value *inc_val = phi->get_incoming_value(n);
          // a value is expected to be attached to at most one phi
          assert(refs_[inc_val] == nullptr);
          refs_[inc_val] = phi;
        }
      }
    }
    // handle shared copy
    if(auto *copy = dynamic_cast<ir::copy_to_shared_inst*>(i))
      shared_.insert(copy);
  }
}
// Query double-buffered status: true when `x` is in the double-buffered set.
bool buffer_info_pass::is_double(ir::value *x) {
  return double_.count(x) != 0;
}
// Query shared status: true when `x` is in the shared set.
bool buffer_info_pass::is_shared(ir::value *x) {
  return shared_.count(x) != 0;
}
// Get reference if any: returns the value recorded in refs_ for `x`, or
// nullptr when `x` has no entry.
// Looks the key up with find() instead of operator[] so that querying an
// unknown value does not insert an empty entry into refs_.
ir::value *buffer_info_pass::get_reference(ir::value *x) {
  auto it = refs_.find(x);
  return it == refs_.end() ? nullptr : it->second;
}
}
}

View File

@@ -384,17 +384,42 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
// shared copy
PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace());
if(dynamic_cast<ir::copy_to_shared_inst*>(v)) {
size_t offset = alloc_->get_offset(v);
Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset));
ptr = builder.CreateBitCast(ptr, ptr_ty);
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
if(buffer_info_->get_reference(v) == nullptr){
size_t offset = alloc_->get_offset(v);
Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset));
ptr = builder.CreateBitCast(ptr, ptr_ty);
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
}
}
// phi-node (double-buffering)
else if(auto *phi = dynamic_cast<ir::phi_node*>(v)) {
BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()];
builder.SetInsertPoint(parent);
unsigned id_pre = 0, id_loop = 1;
if(phi->get_incoming_block(0) == phi->get_parent())
std::swap(id_pre, id_loop);
ir::value *pre_value = phi->get_incoming_value(id_pre);
ir::value *loop_value = phi->get_incoming_value(id_loop);
BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)];
BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)];
if(parent->empty())
builder.SetInsertPoint(parent);
else
builder.SetInsertPoint(&*parent->getFirstInsertionPt());
PHINode *ptr = builder.CreatePHI(ptr_ty, 2);
// offset
PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2);
Value *next_offset = builder.CreateNeg(offset);
offset->addIncoming(builder.getInt32(alloc_->get_num_bytes(phi) / 2 / 4), pre_block);
offset->addIncoming(next_offset, loop_block);
// next pointer
Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi)));
pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType());
Value *next_ptr = builder.CreateGEP(ptr, offset);
ptr->addIncoming(pre_ptr, pre_block);
ptr->addIncoming(next_ptr, loop_block);
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)});
tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)});
}
else
throw std::runtime_error("unknown shared memory tile");
@@ -633,46 +658,21 @@ void selection::run(ir::module &src, Module &dst){
init_grids(fn, dst_builder, sh_mem_ptr);
// iterate through block
for(ir::basic_block *block: fn->blocks()) {
dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]);
for(ir::instruction *i: block->get_inst_list())
BasicBlock *parent = (BasicBlock*)vmap_[block];
dst_builder.SetInsertPoint(parent);
for(ir::instruction *i: block->get_inst_list()){
if(dynamic_cast<ir::phi_node*>(i))
dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt());
lower_instruction(i, dst_builder);
if(dynamic_cast<ir::phi_node*>(i))
dst_builder.SetInsertPoint(parent);
}
}
// add phi operands
for(ir::basic_block *block: fn->blocks())
for(ir::instruction *inst: block->get_inst_list())
if(auto *phi = dynamic_cast<ir::phi_node*>(inst)){
if(buffer_info_->is_shared(phi)) {
BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()];
unsigned id_pre = 0, id_loop = 1;
if(phi->get_incoming_block(0) == phi->get_parent())
std::swap(id_pre, id_loop);
ir::value *pre_value = phi->get_incoming_value(id_pre);
ir::value *loop_value = phi->get_incoming_value(id_loop);
BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)];
BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)];
int pre_offset = alloc_->get_offset(pre_value);
int loop_offset = alloc_->get_offset(loop_value);
dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt());
PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer());
// offset
PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2);
dst_builder.SetInsertPoint(parent->getFirstNonPHI());
Value *next_offset = dst_builder.CreateNeg(offset);
offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block);
offset->addIncoming(next_offset, loop_block);
// next pointer
Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset));
pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType());
Value *next_ptr = dst_builder.CreateGEP(ptr, offset);
ptr->addIncoming(pre_ptr, pre_block);
ptr->addIncoming(next_ptr, loop_block);
// barrier
Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0);
dst_builder.SetInsertPoint(pre_block->getTerminator());
dst_builder.CreateCall(barrier, {});
dst_builder.SetInsertPoint(loop_block->getTerminator());
dst_builder.CreateCall(barrier, {});
continue;
}
for(unsigned n = 0; n < phi->get_num_incoming(); n++){

28
lib/codegen/vectorize.cpp Normal file
View File

@@ -0,0 +1,28 @@
#include "codegen/vectorize.h"
#include "codegen/tune.h"
#include "ir/module.h"
#include "ir/function.h"
#include "ir/basic_block.h"
#include "ir/instructions.h"
namespace tdl {
namespace codegen{
// For every copy_to_shared instruction in `mod`, creates a vectorize
// instruction on the copy's first operand (with the module builder positioned
// at the copy), redirects the uses of that operand to the new instruction, and
// copies the tuning parameters of the operand onto it.
void vectorize::run(ir::module &mod) {
  ir::builder &irb = mod.get_builder();
  for(ir::function *fn: mod.get_function_list()){
    for(ir::basic_block *bb: fn->blocks()){
      for(ir::instruction *inst: bb->get_inst_list()){
        // only shared-memory copies are rewritten
        if(!dynamic_cast<ir::copy_to_shared_inst*>(inst))
          continue;
        irb.set_insert_point(inst);
        ir::value *src = inst->get_operand(0);
        ir::instruction *vec = (ir::instruction*)irb.create_vectorize(src);
        src->replace_all_uses_with(vec);
        // presumably the replacement above also rewired vec's own operand,
        // hence it is pointed back at src here -- TODO confirm
        vec->set_operand(0, src);
        params_->copy(vec, src);
      }
    }
  }
}
}
}