Improve ROCm support. (#780)

- updates to support ROCm 5.2
- workarounds in tests where NV tools were used unconditionally
- implemented `get_num_blocks()` and `add_memfence()` for AMD GPU
- backported some atomics from history
- added bf16 support
- minor warnings cleanup
- added dockerfile to run on a ROCm enabled machine

Co-authored-by: B1tway <andrew.shukshov@gmail.com>
Co-authored-by: Andrey Shukshov <36711069+B1tway@users.noreply.github.com>
2022-10-14 21:33:42 +03:00
parent 94d5c2e8b5
commit 406d03bfaf
17 changed files with 435 additions and 155 deletions
--- a/lib/codegen/target.cc
+++ b/lib/codegen/target.cc
@@ -41,7 +41,8 @@ Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, un
 }

 Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) {
-  throw std::runtime_error("not implemented on AMD");
+  Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_waitcnt); // Memory fence on AMD is an s_waitcnt. NOTE(review): `barrier` is never read below.
+  return builder.CreateIntrinsic(Intrinsic::amdgcn_s_waitcnt, {}, {builder.getInt32(0)}); // Emits amdgcn_s_waitcnt with an i32 0 operand and returns the call; presumably 0 = wait on all counters -- TODO confirm.
 }


@@ -56,7 +57,50 @@ Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigne
 }

 Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
-  throw std::runtime_error("not implemented on AMD");
+  Function &F = *builder.GetInsertBlock()->getParent(); // function that owns the current insertion block
+  Module *Mod = F.getParent(); // NOTE(review): the `module` parameter is unused; Mod is used instead
+  // Returns the grid_size_<ax> field (ax in 0..2) of the HSA kernel dispatch
+  // packet, loaded through the pointer produced by amdgcn_dispatch_ptr.
+  // We index into this struct and only need the grid_size_* fields:
+  //   typedef struct hsa_kernel_dispatch_packet_s {
+  //     uint16_t header;
+  //     uint16_t setup;
+  //     uint16_t workgroup_size_x;
+  //     uint16_t workgroup_size_y;
+  //     uint16_t workgroup_size_z;
+  //     uint16_t reserved0;
+  //     uint32_t grid_size_x;
+  //     uint32_t grid_size_y;
+  //     uint32_t grid_size_z;
+  //     .....
+  //   } hsa_kernel_dispatch_packet_t
+  //
+  Function *DispatchPtrFn =
+      Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+  CallInst *DispatchPtr = builder.CreateCall(DispatchPtrFn, {});
+  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias); // returned pointer: noalias ...
+  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull); // ... and nonnull
+  F.removeFnAttr("amdgpu-no-dispatch-ptr"); // presumably required because F now calls the dispatch-ptr intrinsic -- TODO confirm
+
+  // Size of the dispatch packet struct (64 bytes are marked dereferenceable).
+  DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
+
+  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+  // TODO: include AMDGPUAS:: declarations.
+  Value *CastDispatchPtr = builder.CreateBitCast(
+      DispatchPtr, PointerType::get(I32Ty, 4 /*AMDGPUAS::CONSTANT_ADDRESS*/));
+
+  // grid_size_x sits at byte offset 12 (six uint16_t fields precede it), i.e. i32 index 3; y and z follow at 4 and 5.
+  assert(ax < 3);
+  Value *GEP =
+      builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, ax + 3);
+  LoadInst *Load = builder.CreateAlignedLoad(I32Ty, GEP, Align(4));
+
+  MDNode *MD = MDNode::get(Mod->getContext(), None);
+  Load->setMetadata(LLVMContext::MD_invariant_load, MD); // empty metadata node tags the load as !invariant.load
+
+  return Load; // NOTE(review): the HSA spec appears to count grid_size_* in work-items, not workgroups -- confirm this is the block count callers expect.
 }

 Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {