// ---------------------------------------------------------------------------
// Patch overview (unified git diff touching three files).
//
// NOTE(review): this text appears to have lost its original line breaks and
// its template argument lists (e.g. `std::vector indices_t`, `std::function fn`,
// `dynamic_cast(v)`), presumably during extraction -- TODO confirm against the
// original commit before trying to apply it.
//
// examples/matrix.cpp
//   * Re-enables the previously commented-out `std::cout << src << std::endl;`
//     that follows the compile_machine_code(...) call.
//     NOTE(review): looks like debug output -- confirm it is meant to stay on.
//
// include/codegen/selection.h
//   * distributed_axis gains a `size_t contiguous` member placed before `values`.
//   * tile's constructor now also initializes ty_ from its `ty` argument
//     (previously only shapes_ was initialized).
//   * distributed_tile's constructor takes an extra `llvm::IRBuilder<> &builder`.
//   * New set_vectorized_iteration() / unset_vectorized_iteration() set the new
//     `vectorized_` flag to true / false.
//   * New members: vector_size_, builder_ (a reference), vectorized_.
//   * selection declares a private helper make_vector_ty(ty, vector_size).
//
// lib/codegen/selection.cpp
//   * distributed_tile constructor: stores the builder reference, starts with
//     vectorized_ == true, and sets vector_size_ to 1, or to
//     ty->getVectorNumElements() when `ty` is a vector type. values_ is still
//     filled with one UndefValue per entry of indices_.
//   * set_value: picks the slot values_[value_idx / vector_size_ * vector_size_].
//     If `v` has the slot's type it replaces the slot (asserting value_idx is a
//     multiple of vector_size_); otherwise it inserts `v` into the slot at lane
//     value_idx % vector_size_ via CreateInsertElement.
//   * get_value: returns the whole slot when vectorized_ is set or
//     vector_size_ == 1; otherwise extracts lane value_idx % vector_size_ via
//     CreateExtractElement.
//     NOTE(review): both branches of the if/else return, so the trailing
//     `return result;` is unreachable.
//   * for_each: when vectorized_ is set, only calls `fn` for indices whose
//     mapped position is a multiple of vector_size_; otherwise visits all.
//     NOTE(review): only slots at multiples of vector_size_ are ever addressed
//     by set_value/get_value, so the other entries of values_ stay as the
//     initial UndefValue -- confirm this is intended.
//   * shared_tile::get_value: adds braces around the base_ptr initialization
//     and three commented-out lines (vector pointer bitcast); no active code
//     change is visible in this hunk.
//   * selection::make_vector_ty: returns `ty` unchanged for vector_size == 1,
//     otherwise VectorType::get(ty, vector_size).
//   * selection::init_axes: the stored distributed_axis now carries
//     contiguous[k] in addition to idx_list.
//   * selection::create_tile: the fallback axis gets contiguous = 1; the tile
//     type is built with make_vector_ty(ty, axes[0].contiguous); for the
//     constant-range case, vectorized iteration is switched off while the
//     per-index values are written and switched back on afterwards.
//     NOTE(review): only axes[0].contiguous feeds the vector width -- confirm
//     that axis 0 is always the one intended to be vectorized.
//   * selection::lower_tile_instruction: in one branch (the cast target type is
//     not visible here), vectorized iteration is switched off on in_tile while
//     the result tile is populated, then switched back on.
//     NOTE(review): the flag is toggled manually with no scope guard; an early
//     exit between the two calls would leave the tile in scalar mode.
// ---------------------------------------------------------------------------
diff --git a/examples/matrix.cpp b/examples/matrix.cpp index 5783f6f3d..9f9e2a724 100644 --- a/examples/matrix.cpp +++ b/examples/matrix.cpp @@ -213,7 +213,7 @@ int main() { CUstream cu_stream; int major, minor; compile_machine_code(cu_device, cu_context, cu_module, cu_kernel, cu_stream, major, minor, src, "test"); -// std::cout << src << std::endl; + std::cout << src << std::endl; // execute machine code // Allocate buffers diff --git a/include/codegen/selection.h b/include/codegen/selection.h index 1e413f707..73c72f120 100644 --- a/include/codegen/selection.h +++ b/include/codegen/selection.h @@ -25,6 +25,7 @@ class tune; typedef std::vector indices_t; struct distributed_axis { + size_t contiguous; std::vector values; }; @@ -33,7 +34,7 @@ protected: typedef std::vector shapes_t; public: - tile(llvm::Type *ty, const shapes_t &shapes): shapes_(shapes){ } + tile(llvm::Type *ty, const shapes_t &shapes): ty_(ty), shapes_(shapes){ } virtual void set_value(indices_t idx, llvm::Value *v) = 0; virtual llvm::Value* get_value(indices_t idx) = 0; @@ -69,7 +70,9 @@ private: void init_indices(); public: - distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes); + distributed_tile(llvm::Type *ty, const shapes_t& shapes, const axes_t &axes, llvm::IRBuilder<> &builder); + void set_vectorized_iteration() { vectorized_ = true; } + void unset_vectorized_iteration() { vectorized_ = false; } void set_value(indices_t idx, llvm::Value *v); llvm::Value* get_value(indices_t idx); void for_each(std::function fn); @@ -78,6 +81,9 @@ private: axes_t axes_; indices_map_t indices_; values_t values_; + size_t vector_size_; + llvm::IRBuilder<> &builder_; + bool vectorized_; }; @@ -86,6 +92,9 @@ class selection{ typedef std::map tmap_t; private: + // utils + llvm::Type *make_vector_ty(llvm::Type *ty, size_t vector_size); + // LLVM conversions llvm::Type* llvm_type(ir::type *ty, llvm::LLVMContext &ctx); llvm::Value* llvm_value(ir::value *v, llvm::IRBuilder<> &builder); diff 
--git a/lib/codegen/selection.cpp b/lib/codegen/selection.cpp index 24f8ea482..501e25f49 100644 --- a/lib/codegen/selection.cpp +++ b/lib/codegen/selection.cpp @@ -34,24 +34,52 @@ void distributed_tile::init_indices() { } } -distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes) - : tile(ty, shapes), axes_(axes) { +distributed_tile::distributed_tile(Type *ty, const shapes_t &shapes, const axes_t &axes, llvm::IRBuilder<> &builder) + : tile(ty, shapes), axes_(axes), builder_(builder), vectorized_(true) { init_indices(); for(size_t i = 0; i < indices_.size(); i++) values_.push_back(UndefValue::get(ty)); + // vectorization + vector_size_ = 1; + if(ty->isVectorTy()) + vector_size_ = ty->getVectorNumElements(); } void distributed_tile::set_value(indices_t idx, Value *v) { - values_[indices_[idx]] = v; + unsigned value_idx = indices_[idx]; + Value *&result = values_[value_idx/vector_size_*vector_size_]; + if(v->getType() == result->getType()) { + assert(value_idx % vector_size_ == 0); + result = v; + } + // insert scalar in vector + else { + assert(vector_size_==1 || result->getType()->isVectorTy()); + assert(v->getType()->getScalarType() == result->getType()->getScalarType()); + result = builder_.CreateInsertElement(result, v, value_idx % vector_size_); + } } Value* distributed_tile::get_value(indices_t idx) { - return values_[indices_[idx]]; + unsigned value_idx = indices_[idx]; + Value *&result = values_[value_idx/vector_size_*vector_size_]; + if(vectorized_ || vector_size_ == 1) { + assert(value_idx % vector_size_ == 0); + return result; + } + // extract scalar from vector + else { + assert(result->getType()->isVectorTy()); + return builder_.CreateExtractElement(result, value_idx % vector_size_); + } + return result; } void distributed_tile::for_each(std::function fn) { - for(auto &idx: indices_) - fn(idx.first); + for(auto &idx: indices_) { + if(!vectorized_ || (idx.second % vector_size_ == 0)) + fn(idx.first); + } } /* Shared Tile 
*/ @@ -121,12 +149,23 @@ Value* shared_tile::get_value(indices_t idx) { indices_t non_cst_idx, cst_idx; extract_constant(idx, non_cst_idx, cst_idx); Value *&base_ptr = ptr_cache_[non_cst_idx]; - if(base_ptr == nullptr) + if(base_ptr == nullptr){ base_ptr = builder_.CreateGEP(ptr_, shared_offset(non_cst_idx)); +// Type *vec_ty = VectorType::get(base_ptr->getType()->getPointerElementType(), vec_); +// Type *vec_ptr_ty = PointerType::get(vec_ty, base_ptr->getType()->getPointerElementType()); +// base_ptr = builder_.CreateBitCast(base_ptr, vec_ptr_ty); + } Value *ptr = builder_.CreateGEP(base_ptr, shared_offset(cst_idx)); return builder_.CreateLoad(ptr); } +/* helper to make vector type */ +llvm::Type *selection::make_vector_ty(llvm::Type *ty, size_t vector_size) { + if(vector_size == 1) + return ty; + return VectorType::get(ty, vector_size); +} + /* convert ir::type to Type */ Type *selection::llvm_type(ir::type *ty, LLVMContext &ctx) { // function @@ -299,7 +338,7 @@ void selection::init_axes(ir::value *v, IRBuilder<> &builder, Value *u_thread_id unsigned offset = n / contiguous[k] * per_block + n % contiguous[k]; idx_list[n] = builder.CreateAdd(thread_id, builder.getInt32(offset), "idx_" + str_k + "_" + std::to_string(n)); } - axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{idx_list}; + axes_[params_->get_param(v, "p0.d" + str_k)] = distributed_axis{contiguous[k], idx_list}; } } @@ -378,17 +417,22 @@ void selection::create_tile(ir::value *v, IRBuilder<> &builder, unsigned *x = params_->get_param(v, "p0.d" + std::to_string(d)); axes[d] = axes_.at(x); } - else + else{ + axes[d].contiguous = 1; axes[d].values = {builder.getInt32(0)}; + } } - distributed_tile *T = new distributed_tile(ty, shapes, axes); + distributed_tile *T = new distributed_tile(make_vector_ty(ty, axes[0].contiguous), shapes, axes, builder); tmap_.insert({v, T}); // constant range - if(dynamic_cast(v)) + if(dynamic_cast(v)){ + T->unset_vectorized_iteration(); T->for_each([&](indices_t 
idx){ assert(idx.size() == 1); T->set_value(idx, idx[0]); }); + T->set_vectorized_iteration(); + } } } @@ -454,6 +498,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & else if(dynamic_cast(ins)) { ir::value* in = ins->get_operand(0); distributed_tile *in_tile = (distributed_tile*)tmap_.at(in); + in_tile->unset_vectorized_iteration(); result->for_each([&](indices_t out_idx){ indices_t in_idx; for(size_t k = 0; k < shapes.size(); k++){ @@ -462,6 +507,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> & } result->set_value(out_idx, in_tile->get_value(in_idx)); }); + in_tile->set_vectorized_iteration(); } // splat else if(dynamic_cast(ins)) {