[code generation] added double-buffering
This commit is contained in:
@@ -212,7 +212,7 @@ int main() {
|
||||
manager.run(llvm_module);
|
||||
|
||||
std::string src = generate_machine_code(llvm_module, "nvptx64-nvidia-cuda", compute_data_layout(true, true));
|
||||
// std::cout << src << std::endl;
|
||||
std::cout << src << std::endl;
|
||||
|
||||
// compile machine code
|
||||
CUdevice cu_device;
|
||||
@@ -222,7 +222,7 @@ int main() {
|
||||
CUstream cu_stream;
|
||||
int major, minor;
|
||||
compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test");
|
||||
// std::cout << src << std::endl;
|
||||
std::cout << src << std::endl;
|
||||
|
||||
// execute machine code
|
||||
// Allocate buffers
|
||||
|
@@ -23,6 +23,9 @@ public:
|
||||
// Stores the liveness analysis and the buffer-info pass that this allocator
// consults; both pointers are kept as given.
allocation(liveness *live, buffer_info_pass *buffer_info)
  : liveness_(live),
    buffer_info_(buffer_info)
{ }
|
||||
|
||||
// utilities
|
||||
unsigned get_num_bytes(ir::value *x);
|
||||
|
||||
// accessors
|
||||
// Returns the offset stored for `x` in offsets_. The lookup uses at(), so `x`
// is expected to already have an entry.
unsigned get_offset(ir::value *x) const { return offsets_.at(x); }
|
||||
// Returns the current value of allocated_size_.
unsigned get_allocated_size() const { return allocated_size_; }
|
||||
|
34
include/codegen/buffer_info.h
Normal file
34
include/codegen/buffer_info.h
Normal file
@@ -0,0 +1,34 @@
|
||||
#ifndef TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
|
||||
#define TDL_INCLUDE_CODEGEN_BUFFER_INFO_PASS_H
|
||||
|
||||
#include <set>
|
||||
#include <map>
|
||||
|
||||
namespace tdl {
|
||||
|
||||
namespace ir {
|
||||
class module;
|
||||
class value;
|
||||
}
|
||||
|
||||
namespace codegen{
|
||||
|
||||
class buffer_info_pass {
|
||||
public:
|
||||
void run(ir::module &mod);
|
||||
// queries
|
||||
bool is_double(ir::value *x);
|
||||
bool is_shared(ir::value *x);
|
||||
ir::value *get_reference(ir::value *x);
|
||||
|
||||
private:
|
||||
std::set<ir::value*> shared_;
|
||||
std::set<ir::value*> double_;
|
||||
std::map<ir::value*, ir::value*> refs_;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
27
include/codegen/vectorize.h
Normal file
27
include/codegen/vectorize.h
Normal file
@@ -0,0 +1,27 @@
|
||||
#ifndef TDL_INCLUDE_CODEGEN_VECTORIZE_H
|
||||
#define TDL_INCLUDE_CODEGEN_VECTORIZE_H
|
||||
|
||||
namespace tdl {
|
||||
|
||||
namespace ir {
|
||||
class module;
|
||||
}
|
||||
|
||||
namespace codegen{
|
||||
|
||||
class tune;
|
||||
|
||||
class vectorize {
|
||||
public:
|
||||
vectorize(tune *params): params_(params){}
|
||||
void run(ir::module &mod);
|
||||
|
||||
private:
|
||||
tune *params_;
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
@@ -11,18 +11,18 @@
|
||||
namespace tdl{
|
||||
namespace codegen{
|
||||
|
||||
// Size reserved for the tile `x`: the tile width reported by its type,
// doubled when the buffer-info pass marks `x` as double-buffered.
// NOTE(review): the base value comes from get_tile_bitwidth() while this
// function is named get_num_bytes -- confirm the unit is the intended one.
unsigned allocation::get_num_bytes(ir::value *x) {
  const unsigned tile_size = x->get_type()->get_tile_bitwidth();
  return buffer_info_->is_double(x) ? tile_size * 2 : tile_size;
}
|
||||
|
||||
void allocation::run(){
|
||||
using std::max;
|
||||
using std::min;
|
||||
typedef std::multimap<unsigned, segment> triples_map_type;
|
||||
|
||||
auto get_num_bytes = [&](ir::value *x){
|
||||
unsigned result = x->get_type()->get_tile_bitwidth();
|
||||
if(buffer_info_->is_double(x))
|
||||
result *= 2;
|
||||
return result;
|
||||
};
|
||||
|
||||
std::vector<ir::value *> I;
|
||||
for(auto x: liveness_->intervals())
|
||||
I.push_back(x.first);
|
||||
|
65
lib/codegen/buffer_info.cpp
Normal file
65
lib/codegen/buffer_info.cpp
Normal file
@@ -0,0 +1,65 @@
|
||||
#include "codegen/buffer_info.h"
|
||||
#include "ir/module.h"
|
||||
#include "ir/function.h"
|
||||
#include "ir/basic_block.h"
|
||||
#include "ir/instructions.h"
|
||||
#include "ir/type.h"
|
||||
|
||||
namespace tdl {
|
||||
|
||||
namespace codegen{
|
||||
|
||||
|
||||
// run pass on module
|
||||
void buffer_info_pass::run(ir::module &mod) {
|
||||
for(ir::function *fn: mod.get_function_list())
|
||||
for(ir::basic_block *block: fn->blocks())
|
||||
for(ir::instruction *i: block->get_inst_list()) {
|
||||
if(!i->get_type()->is_tile_ty())
|
||||
continue;
|
||||
// handle phi
|
||||
if(auto *phi = dynamic_cast<ir::phi_node*>(i)){
|
||||
// determine if the value is in shared memory
|
||||
bool is_shared = true;
|
||||
bool is_double = false;
|
||||
for(unsigned n = 0; n < phi->get_num_incoming(); n++){
|
||||
ir::value *inc_val = phi->get_incoming_value(n);
|
||||
ir::value *inc_block = phi->get_incoming_block(n);
|
||||
is_shared = is_shared && dynamic_cast<ir::copy_to_shared_inst*>(inc_val);
|
||||
is_double = is_double || inc_block == phi->get_parent();
|
||||
}
|
||||
// add to shared
|
||||
if(is_shared)
|
||||
shared_.insert(phi);
|
||||
// add to double-buffered
|
||||
if(is_double)
|
||||
double_.insert(phi);
|
||||
// set references of input
|
||||
for(unsigned n = 0; n < phi->get_num_incoming(); n++){
|
||||
ir::value *inc_val = phi->get_incoming_value(n);
|
||||
assert(refs_[inc_val] == nullptr);
|
||||
refs_[inc_val] = phi;
|
||||
}
|
||||
}
|
||||
// handle shared copy
|
||||
if(auto *copy = dynamic_cast<ir::copy_to_shared_inst*>(i))
|
||||
shared_.insert(copy);
|
||||
}
|
||||
}
|
||||
|
||||
// query double-buffered status
|
||||
bool buffer_info_pass::is_double(ir::value *x)
|
||||
{ return double_.find(x) != double_.end(); }
|
||||
|
||||
// query shared status
|
||||
bool buffer_info_pass::is_shared(ir::value *x)
|
||||
{ return shared_.find(x) != shared_.end(); }
|
||||
|
||||
// get reference if any
|
||||
ir::value *buffer_info_pass::get_reference(ir::value *x)
|
||||
{ return refs_[x]; }
|
||||
|
||||
|
||||
|
||||
}
|
||||
}
|
@@ -384,17 +384,42 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder,
|
||||
// shared copy
|
||||
PointerType *ptr_ty = ty->getPointerTo(sh_mem_ptr->getType()->getPointerAddressSpace());
|
||||
if(dynamic_cast<ir::copy_to_shared_inst*>(v)) {
|
||||
size_t offset = alloc_->get_offset(v);
|
||||
Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset));
|
||||
ptr = builder.CreateBitCast(ptr, ptr_ty);
|
||||
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
|
||||
if(buffer_info_->get_reference(v) == nullptr){
|
||||
size_t offset = alloc_->get_offset(v);
|
||||
Value *ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(offset));
|
||||
ptr = builder.CreateBitCast(ptr, ptr_ty);
|
||||
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
|
||||
}
|
||||
}
|
||||
// phi-node (double-buffering)
|
||||
else if(auto *phi = dynamic_cast<ir::phi_node*>(v)) {
|
||||
BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()];
|
||||
builder.SetInsertPoint(parent);
|
||||
unsigned id_pre = 0, id_loop = 1;
|
||||
if(phi->get_incoming_block(0) == phi->get_parent())
|
||||
std::swap(id_pre, id_loop);
|
||||
ir::value *pre_value = phi->get_incoming_value(id_pre);
|
||||
ir::value *loop_value = phi->get_incoming_value(id_loop);
|
||||
BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)];
|
||||
BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)];
|
||||
if(parent->empty())
|
||||
builder.SetInsertPoint(parent);
|
||||
else
|
||||
builder.SetInsertPoint(&*parent->getFirstInsertionPt());
|
||||
PHINode *ptr = builder.CreatePHI(ptr_ty, 2);
|
||||
// offset
|
||||
PHINode *offset = builder.CreatePHI(builder.getInt32Ty(), 2);
|
||||
Value *next_offset = builder.CreateNeg(offset);
|
||||
offset->addIncoming(builder.getInt32(alloc_->get_num_bytes(phi) / 2 / 4), pre_block);
|
||||
offset->addIncoming(next_offset, loop_block);
|
||||
// next pointer
|
||||
Value *pre_ptr = builder.CreateGEP(sh_mem_ptr, builder.getInt32(alloc_->get_offset(phi)));
|
||||
pre_ptr = builder.CreateBitCast(pre_ptr, ptr->getType());
|
||||
Value *next_ptr = builder.CreateGEP(ptr, offset);
|
||||
ptr->addIncoming(pre_ptr, pre_block);
|
||||
ptr->addIncoming(next_ptr, loop_block);
|
||||
tmap_.insert({v, new shared_tile(ty, shapes, ptr, builder)});
|
||||
tmap_.insert({pre_value, new shared_tile(ty, shapes, pre_ptr, builder)});
|
||||
tmap_.insert({loop_value, new shared_tile(ty, shapes, next_ptr, builder)});
|
||||
}
|
||||
else
|
||||
throw std::runtime_error("unknown shared memory tile");
|
||||
@@ -633,46 +658,21 @@ void selection::run(ir::module &src, Module &dst){
|
||||
init_grids(fn, dst_builder, sh_mem_ptr);
|
||||
// iterate through block
|
||||
for(ir::basic_block *block: fn->blocks()) {
|
||||
dst_builder.SetInsertPoint((BasicBlock*)vmap_[block]);
|
||||
for(ir::instruction *i: block->get_inst_list())
|
||||
BasicBlock *parent = (BasicBlock*)vmap_[block];
|
||||
dst_builder.SetInsertPoint(parent);
|
||||
for(ir::instruction *i: block->get_inst_list()){
|
||||
if(dynamic_cast<ir::phi_node*>(i))
|
||||
dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt());
|
||||
lower_instruction(i, dst_builder);
|
||||
if(dynamic_cast<ir::phi_node*>(i))
|
||||
dst_builder.SetInsertPoint(parent);
|
||||
}
|
||||
}
|
||||
// add phi operands
|
||||
for(ir::basic_block *block: fn->blocks())
|
||||
for(ir::instruction *inst: block->get_inst_list())
|
||||
if(auto *phi = dynamic_cast<ir::phi_node*>(inst)){
|
||||
if(buffer_info_->is_shared(phi)) {
|
||||
BasicBlock *parent = (BasicBlock*)vmap_[phi->get_parent()];
|
||||
unsigned id_pre = 0, id_loop = 1;
|
||||
if(phi->get_incoming_block(0) == phi->get_parent())
|
||||
std::swap(id_pre, id_loop);
|
||||
ir::value *pre_value = phi->get_incoming_value(id_pre);
|
||||
ir::value *loop_value = phi->get_incoming_value(id_loop);
|
||||
BasicBlock *pre_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_pre)];
|
||||
BasicBlock *loop_block = (BasicBlock*)vmap_[phi->get_incoming_block(id_loop)];
|
||||
int pre_offset = alloc_->get_offset(pre_value);
|
||||
int loop_offset = alloc_->get_offset(loop_value);
|
||||
dst_builder.SetInsertPoint(&*parent->getFirstInsertionPt());
|
||||
PHINode *ptr = (PHINode*)(((shared_tile*)tmap_.at(phi))->get_pointer());
|
||||
// offset
|
||||
PHINode *offset = dst_builder.CreatePHI(dst_builder.getInt32Ty(), 2);
|
||||
dst_builder.SetInsertPoint(parent->getFirstNonPHI());
|
||||
Value *next_offset = dst_builder.CreateNeg(offset);
|
||||
offset->addIncoming(dst_builder.getInt32((loop_offset - pre_offset)/4), pre_block);
|
||||
offset->addIncoming(next_offset, loop_block);
|
||||
// next pointer
|
||||
Value *pre_ptr = dst_builder.CreateGEP(sh_mem_ptr, dst_builder.getInt32(pre_offset));
|
||||
pre_ptr = dst_builder.CreateBitCast(pre_ptr, ptr->getType());
|
||||
Value *next_ptr = dst_builder.CreateGEP(ptr, offset);
|
||||
ptr->addIncoming(pre_ptr, pre_block);
|
||||
ptr->addIncoming(next_ptr, loop_block);
|
||||
// barrier
|
||||
Function *barrier = Intrinsic::getDeclaration(dst_fn->getParent(), Intrinsic::nvvm_barrier0);
|
||||
dst_builder.SetInsertPoint(pre_block->getTerminator());
|
||||
dst_builder.CreateCall(barrier, {});
|
||||
dst_builder.SetInsertPoint(loop_block->getTerminator());
|
||||
dst_builder.CreateCall(barrier, {});
|
||||
|
||||
continue;
|
||||
}
|
||||
for(unsigned n = 0; n < phi->get_num_incoming(); n++){
|
||||
|
28
lib/codegen/vectorize.cpp
Normal file
28
lib/codegen/vectorize.cpp
Normal file
@@ -0,0 +1,28 @@
|
||||
#include "codegen/vectorize.h"
|
||||
#include "codegen/tune.h"
|
||||
#include "ir/module.h"
|
||||
#include "ir/function.h"
|
||||
#include "ir/basic_block.h"
|
||||
#include "ir/instructions.h"
|
||||
|
||||
namespace tdl {
|
||||
|
||||
namespace codegen{
|
||||
|
||||
// For every copy-to-shared instruction in `mod`, creates a vectorize
// instruction on its first operand (inserted at the copy) and redirects all
// uses of that operand to the new instruction.
// NOTE(review): instructions are created while the block's instruction list
// is being iterated -- confirm get_inst_list() iteration tolerates this.
void vectorize::run(ir::module &mod) {
  ir::builder &builder = mod.get_builder();
  for(ir::function *fn: mod.get_function_list())
  for(ir::basic_block *block: fn->blocks())
  for(ir::instruction *inst: block->get_inst_list()) {
    if(!dynamic_cast<ir::copy_to_shared_inst*>(inst))
      continue;
    builder.set_insert_point(inst);
    ir::value *src = inst->get_operand(0);
    ir::instruction *vec = (ir::instruction*)builder.create_vectorize(src);
    // Redirect every use of the source to the vectorized value, then point
    // operand 0 of the new instruction back at the source (presumably the
    // blanket replacement also rewrote that operand -- confirm).
    src->replace_all_uses_with(vec);
    vec->set_operand(0, src);
    // presumably carries the tuning parameters of `src` over to `vec` --
    // verify against tune::copy
    params_->copy(vec, src);
  }
}
|
||||
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user