[Triton-MLIR] Fix threadsPerWarp derivation in BlockedEncodingAttr (#722)

Example:
```
auto encoding = triton::gpu::BlockedEncodingAttr::get(
    &getContext(), {8, 32}, {2, 2}, {1, 0}, 2);
//shape = [32 x 8], order = [1, 0], sizePerThread=[2, 2], numWarps=2
```
Expected output:
```
//#triton_gpu.blocked_layout<{
//  sizePerThread = {2, 2}
//  threadsPerWarp = {8, 4}
//  warpsPerCTA = {2, 1}
//}>
```
Incorrect output by the current branch:
```
//#triton_gpu.blocked_layout<{
//  sizePerThread = {2, 2}
//  threadsPerWarp = {16, 2}
//  warpsPerCTA = {2, 1}
//}>
```
2022-09-27 16:41:30 -07:00
parent 9ddf0921fb
commit baba98ad69
2 changed files with 3 additions and 2 deletions
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -169,6 +169,7 @@ for
        int dim = order[_dim];
        int maxNumThreads = int(shape[dim]) / sizePerThread[dim];
        warpsPerCTA[dim] = std::clamp(remainingWarps, 1, maxNumThreads);
+        maxNumThreads = maxNumThreads / warpsPerCTA[dim];
        threadsPerWarp[dim] = std::clamp(remainingLanes, 1, maxNumThreads);
        remainingWarps /= warpsPerCTA[dim];
        remainingLanes /= threadsPerWarp[dim];