[dnn/shift]: added support for fp16

This commit is contained in:
Philippe Tillet
2019-07-13 21:05:34 -07:00
parent fe42cb7142
commit 3e7a3ed67a
11 changed files with 76 additions and 43 deletions

View File

@@ -376,7 +376,15 @@ Instruction *selection::llvm_inst(ir::instruction *inst, std::function<Value*(ir
// Lowering branch for ir::atomic_add_inst: builds an LLVM call whose callee
// depends on the LLVM type of the value being added, and returns that call.
if(ir::atomic_add_inst* ii = dynamic_cast<ir::atomic_add_inst*>(inst)){
// Operand 0 is used as the destination pointer, operand 1 as the addend.
Value *ptr = value(ii->get_operand(0));
Value *val = value(ii->get_operand(1));
// NOTE(review): this line and the next both declare `atom_f_add` in the same
// scope. This text looks like a diff view with its +/- markers stripped, so
// this is presumably the removed pre-change line -- confirm against the file.
Value *atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()});
// Callee for the atomic add; assigned below according to the type of `val`.
Value *atom_f_add;
// float value: use the nvvm_atomic_load_add_f32 intrinsic, overloaded on the
// pointer type.
if(val->getType()->isFloatTy())
atom_f_add = Intrinsic::getDeclaration(builder.GetInsertBlock()->getModule(), Intrinsic::nvvm_atomic_load_add_f32, {ptr->getType()});
// half value: no intrinsic is used here; the add is emitted as inline asm
// instead.
else if(val->getType()->isHalfTy()){
Type *fp16 = Type::getHalfTy(ctx);
// Signature of the inline-asm callee: half (half*, half).
FunctionType *atom_ty = FunctionType::get(fp16, {fp16->getPointerTo(), fp16}, false);
// Constraint string "=h,l,h": one output ($0) and two inputs ($1 used as the
// address, $2 as the addend). The trailing `true` is presumably
// InlineAsm::get's hasSideEffects flag -- TODO confirm against the LLVM
// version in use.
atom_f_add = InlineAsm::get(atom_ty, " atom.relaxed.global.gpu.add.noftz.f16 $0, [$1], $2;", "=h,l,h", true);
}
// NOTE(review): if `val` is neither float nor half, neither branch above
// assigns `atom_f_add`, and it is used here without a value.
Value *res = builder.CreateCall(atom_f_add, {ptr, val});
// The call result is held as a Value* and cast back to Instruction* for the
// caller.
return (Instruction*)res;
}
@@ -1110,6 +1118,7 @@ void selection::lower_tile_instruction(ir::instruction *ins, llvm::IRBuilder<> &
unsigned max_contiguous = axis_info_->get_max_contiguous(ptr);
unsigned alignment = std::min(starting_multiple, max_contiguous);
unsigned vector_size = std::min<unsigned>(result->axis(0).contiguous, alignment);
vector_size = 1;
// vector_size = result->axis(0).contiguous;
std::map<unsigned, Value*> packets;
distributed_tile *TP = (distributed_tile*)tmap_.at(ld->get_pointer_operand());