diff --git a/include/triton/codegen/analysis/layout.h b/include/triton/codegen/analysis/layout.h index 0cf7faf2e..833e96767 100644 --- a/include/triton/codegen/analysis/layout.h +++ b/include/triton/codegen/analysis/layout.h @@ -111,10 +111,15 @@ public: int rep(size_t k) { return rep_.at(k); } private: + // fragment per warp std::vector fpw_; + // shape per warp std::vector spw_; + // warp per tile std::vector wpt_; + // shape per tile std::vector spt_; + // repetitions std::vector rep_; }; @@ -131,7 +136,9 @@ struct scanline_layout: public data_layout { int nts(size_t k) { return nts_.at(k); } public: + // micro tile size. The size of a tile held by a thread block. std::vector mts_; + // nano tile size. The size of a tile held by a thread. std::vector nts_; }; diff --git a/lib/codegen/selection/generator.cc b/lib/codegen/selection/generator.cc index 8098462bb..7f189e4d2 100644 --- a/lib/codegen/selection/generator.cc +++ b/lib/codegen/selection/generator.cc @@ -465,7 +465,7 @@ Value* generator::bf16_to_fp32(Value *in0){ Value* generator::fp32_to_bf16(Value *in0){ if(tgt_->as_nvidia()->sm() >= 80){ - InlineAsm *ptx = InlineAsm::get(FunctionType::get(builder_->getInt16Ty(), {builder_->getFloatTy()}), + InlineAsm *ptx = InlineAsm::get(FunctionType::get(builder_->getInt16Ty(), {builder_->getFloatTy()}, false), "cvt.rn.bf16.f32 $0, $1;", "=h,r", false); return call(ptx, {in0}); } diff --git a/python/src/triton.cc b/python/src/triton.cc index 1e7fe255d..a7c3379b6 100644 --- a/python/src/triton.cc +++ b/python/src/triton.cc @@ -15,6 +15,7 @@ #include #include #include +#include namespace py = pybind11; namespace ir = triton::ir;