Improve ROCm support. (#780)

- updates to support ROCm 5.2
- workarounds in tests where NV tools were used unconditionally
- implemented `get_num_blocks()` and `add_memfence()` for AMD GPU
- backported from history some atomics
- added bf16 support
- minor warnings cleanup
- added dockerfile to run on a ROCm enabled machine

Co-authored-by: B1tway <andrew.shukshov@gmail.com>
Co-authored-by: Andrey Shukshov <36711069+B1tway@users.noreply.github.com>
This commit is contained in:
Daniil Fukalov
2022-10-14 21:33:42 +03:00
committed by GitHub
parent 94d5c2e8b5
commit 406d03bfaf
17 changed files with 435 additions and 155 deletions

View File

@@ -41,7 +41,8 @@ Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, un
}
Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) {
throw std::runtime_error("not implemented on AMD");
Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_waitcnt);
return builder.CreateIntrinsic(Intrinsic::amdgcn_s_waitcnt, {}, {builder.getInt32(0)});
}
@@ -56,7 +57,50 @@ Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigne
}
Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
throw std::runtime_error("not implemented on AMD");
Function &F = *builder.GetInsertBlock()->getParent();
Module *Mod = F.getParent();
// We are indexing into this struct, and want to extract the grid_size_*
// fields.
//
// typedef struct hsa_kernel_dispatch_packet_s {
// uint16_t header;
// uint16_t setup;
// uint16_t workgroup_size_x ;
// uint16_t workgroup_size_y;
// uint16_t workgroup_size_z;
// uint16_t reserved0;
// uint32_t grid_size_x ;
// uint32_t grid_size_y ;
// uint32_t grid_size_z;
// .....
// } hsa_kernel_dispatch_packet_t
//
Function *DispatchPtrFn =
Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
CallInst *DispatchPtr = builder.CreateCall(DispatchPtrFn, {});
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
F.removeFnAttr("amdgpu-no-dispatch-ptr");
// Size of the dispatch packet struct.
DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
Type *I32Ty = Type::getInt32Ty(Mod->getContext());
// TODO: include AMDGPUAS:: declarations.
Value *CastDispatchPtr = builder.CreateBitCast(
DispatchPtr, PointerType::get(I32Ty, 4 /*AMDGPUAS::CONSTANT_ADDRESS*/));
// grid_size_x offset is 3*32bit
assert(ax < 3);
Value *GEP =
builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, ax + 3);
LoadInst *Load = builder.CreateAlignedLoad(I32Ty, GEP, Align(4));
MDNode *MD = MDNode::get(Mod->getContext(), None);
Load->setMetadata(LLVMContext::MD_invariant_load, MD);
return Load; // throw std::runtime_error("not implemented on AMD");
}
Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {