Improve ROCm support. (#780)
- updates to support ROCm 5.2 - workarounds in tests where NV tools were used unconditionally - implemented `get_num_blocks()` and `add_memfence()` for AMD GPU - backported from history some atomics - added bf16 support - minor warnings cleanup - added dockerfile to run on a ROCm enabled machine Co-authored-by: B1tway <andrew.shukshov@gmail.com> Co-authored-by: Andrey Shukshov <36711069+B1tway@users.noreply.github.com>
This commit is contained in:
@@ -41,7 +41,8 @@ Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, un
|
||||
}
|
||||
|
||||
Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) {
|
||||
throw std::runtime_error("not implemented on AMD");
|
||||
Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_waitcnt);
|
||||
return builder.CreateIntrinsic(Intrinsic::amdgcn_s_waitcnt, {}, {builder.getInt32(0)});
|
||||
}
|
||||
|
||||
|
||||
@@ -56,7 +57,50 @@ Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigne
|
||||
}
|
||||
|
||||
Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
|
||||
throw std::runtime_error("not implemented on AMD");
|
||||
Function &F = *builder.GetInsertBlock()->getParent();
|
||||
Module *Mod = F.getParent();
|
||||
// We are indexing into this struct, and want to extract the grid_size_*
|
||||
// fields.
|
||||
//
|
||||
// typedef struct hsa_kernel_dispatch_packet_s {
|
||||
// uint16_t header;
|
||||
// uint16_t setup;
|
||||
// uint16_t workgroup_size_x ;
|
||||
// uint16_t workgroup_size_y;
|
||||
// uint16_t workgroup_size_z;
|
||||
// uint16_t reserved0;
|
||||
// uint32_t grid_size_x ;
|
||||
// uint32_t grid_size_y ;
|
||||
// uint32_t grid_size_z;
|
||||
// .....
|
||||
// } hsa_kernel_dispatch_packet_t
|
||||
//
|
||||
Function *DispatchPtrFn =
|
||||
Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
|
||||
|
||||
CallInst *DispatchPtr = builder.CreateCall(DispatchPtrFn, {});
|
||||
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
|
||||
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
|
||||
F.removeFnAttr("amdgpu-no-dispatch-ptr");
|
||||
|
||||
// Size of the dispatch packet struct.
|
||||
DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
|
||||
|
||||
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
|
||||
// TODO: include AMDGPUAS:: declarations.
|
||||
Value *CastDispatchPtr = builder.CreateBitCast(
|
||||
DispatchPtr, PointerType::get(I32Ty, 4 /*AMDGPUAS::CONSTANT_ADDRESS*/));
|
||||
|
||||
// grid_size_x offset is 3*32bit
|
||||
assert(ax < 3);
|
||||
Value *GEP =
|
||||
builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, ax + 3);
|
||||
LoadInst *Load = builder.CreateAlignedLoad(I32Ty, GEP, Align(4));
|
||||
|
||||
MDNode *MD = MDNode::get(Mod->getContext(), None);
|
||||
Load->setMetadata(LLVMContext::MD_invariant_load, MD);
|
||||
|
||||
return Load; // throw std::runtime_error("not implemented on AMD");
|
||||
}
|
||||
|
||||
Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {
|
||||
|
Reference in New Issue
Block a user