From 513bcaee50cc0a373b1f82c2c21d81dcd817d6e5 Mon Sep 17 00:00:00 2001
From: Philippe Tillet
Date: Wed, 27 Apr 2022 16:28:27 -0700
Subject: [PATCH] Added some ASCII art for encoding documentation

---
 .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 70 +++++++++++++++----
 1 file changed, 56 insertions(+), 14 deletions(-)

diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
index 38e2afbe4..38a13d81c 100644
--- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
+++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td
@@ -10,15 +10,30 @@ def TritonGPUSharedEncodingAttr : TritonGPU_Attr<"TritonGPUSharedEncoding"> {
   let mnemonic = "shared (memory) encoding";

   let description = [{
-    Example:
+An encoding for tensors whose elements may be simultaneously accessed by different warps in the program, via shared memory.

-    ```mlir
-    #SMEM = #triton_gpu.encoding<{
-      vec = 8,
-      perPhase = 8,
-      maxPhase = 1
-    }>
-    ```
+In order to avoid shared memory bank conflicts, elements may be stored in a swizzled layout.
+For example, a swizzled row-major layout would store data as follows:
+
+A_{0, 0} A_{0, 1} A_{0, 2} A_{0, 3} ...  [phase 0]  \ perPhase = 2
+A_{1, 0} A_{1, 1} A_{1, 2} A_{1, 3} ...  [phase 0]  /
+
+   groups of vec=2 elements
+    are stored contiguously
+_ _ _ _ _ _ _ /\ _ _ _ _ _ _ _
+A_{2, 2} A_{2, 3} A_{2, 0} A_{2, 1} ...  [phase 1]  \ perPhase = 2
+A_{3, 2} A_{3, 3} A_{3, 0} A_{3, 1} ...  [phase 1]  /
+
+And the associated TritonGPU MLIR:
+
+```mlir
+#SMEM = #triton_gpu.encoding<{
+  vec = 2,
+  perPhase = 2,
+  maxPhase = 4
+}>
+```
 }];

   let parameters = (
@@ -31,14 +46,41 @@ def TritonGPUSharedEncodingAttr : TritonGPU_Attr<"TritonGPUSharedEncoding"> {

 def TritonGPUCoalescedEncodingAttr : TritonGPU_Attr<"TritonGPUCoalescedEncoding"> {
   let mnemonic = "coalesced encoding";
-  let description = [{}];
+  let description = [{
+An encoding where each warp owns a contiguous portion of the target tensor. This is typically the kind of data layout
+consumed (and returned) by LoadInst.
+For example, a row-major coalesced layout may distribute a 32x16 tensor over 2 warps (i.e. 64 threads) as follows:
+
+                           thread tile size 2
+                     - - - - - - - /\ - - - - - - -
+block | thread      || A_{0, 0}[T0]   A_{0, 1}[T0]   ... A_{0, 6}[T3]   A_{0, 7}[T3]   A_{0, 8}[T0]   A_{0, 9}[T0]   ... A_{0, 14}[T3]   A_{0, 15}[T3]
+tile  | tile size 2 || A_{1, 0}[T0]   A_{1, 1}[T0]   ... A_{1, 6}[T3]   A_{1, 7}[T3]   A_{1, 8}[T0]   A_{1, 9}[T0]   ... A_{1, 14}[T3]   A_{1, 15}[T3]
+size  }                ....
+16    |                A_{14, 0}[T60] A_{14, 1}[T60] ... A_{14, 6}[T63] A_{14, 7}[T63] A_{14, 8}[T60] A_{14, 9}[T60] ... A_{14, 14}[T63] A_{14, 15}[T63]
+      |                A_{15, 0}[T60] A_{15, 1}[T60] ... A_{15, 6}[T63] A_{15, 7}[T63] A_{15, 8}[T60] A_{15, 9}[T60] ... A_{15, 14}[T63] A_{15, 15}[T63]
+      ---------------------------------/\---------------------------------------
+                                 block tile size 8
+
+A_{16, 0}[T0]  A_{16, 1}[T0]  ... A_{16, 6}[T3]  A_{16, 7}[T3]  A_{16, 8}[T0]  A_{16, 9}[T0]  ... A_{16, 14}[T3]  A_{16, 15}[T3]
+A_{17, 0}[T4]  A_{17, 1}[T4]  ... A_{17, 6}[T7]  A_{17, 7}[T7]  A_{17, 8}[T4]  A_{17, 9}[T4]  ... A_{17, 14}[T7]  A_{17, 15}[T7]
+....
+A_{30, 0}[T56] A_{30, 1}[T56] ... A_{30, 6}[T59] A_{30, 7}[T59] A_{30, 8}[T56] A_{30, 9}[T56] ... A_{30, 14}[T59] A_{30, 15}[T59]
+A_{31, 0}[T60] A_{31, 1}[T60] ... A_{31, 6}[T63] A_{31, 7}[T63] A_{31, 8}[T60] A_{31, 9}[T60] ... A_{31, 14}[T63] A_{31, 15}[T63]
+
+And the associated TritonGPU MLIR:
+
+```mlir
+#coalesced = #triton_gpu.encoding<{
+  threadTileSize = {2, 2},
+  blockTileSize = {16, 8}
+}>
+```
+
+// note to Da: in the current Triton codebase, `nanoTileSize = threadTileSize` and `macro-tile size = blockTileSize / threadTileSize`;
+// the new names are probably clearer since they have simpler semantics (i.e., the size of the tile owned by each thread or each block)
+}];

   let parameters = (
     ins
-    ArrayRefParameter<"unsigned">:$nanoTileSize,
-    ArrayRefParameter<"unsigned">:$microTileSize,
-    ArrayRefParameter<"unsigned">:$shapePerCTA,
-    ArrayRefParameter<"unsigned">:$reptitions
+    ArrayRefParameter<"unsigned">:$threadTileSize,
+    ArrayRefParameter<"unsigned">:$blockTileSize
   );

   // let genVerifyDecl = 1;
@@ -47,7 +89,7 @@ def TritonGPUCoalescedEncodingAttr : TritonGPU_Attr<"TritonGPUCoalescedEncoding"

 def TritonGPUMmaEncodingAttr : TritonGPU_Attr<"TritonGPUMmaEncoding"> {
   let mnemonic = "mma encoding";
-  let description = [{}];
+  let description = [{TODO: I think we may be able to implement this as a special case of the coalesced encoding, with maybe one more warpTileSize attribute!}];

   let parameters = (
     ins
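
For anyone sanity-checking the swizzled shared-memory diagram above, here is a minimal Python sketch of the swizzling rule it illustrates, assuming the phase of a row is `(row / perPhase) % maxPhase` and that `vec`-wide groups within a row are permuted by XOR-ing the group index with the phase. The function name `swizzled_col` and its defaults are illustrative, not part of the patch.

```python
def swizzled_col(row, col, vec=2, per_phase=2, max_phase=4):
    """Column at which element (row, col) is stored in shared memory."""
    phase = (row // per_phase) % max_phase  # rows in the same phase swizzle alike
    group = col // vec                      # index of the vec-wide contiguous group
    return (group ^ phase) * vec + col % vec  # elements stay contiguous inside a group

# Reproduces the ASCII art: phase-0 rows are unswizzled, and in row 2
# (phase 1) the group A_{2,0} A_{2,1} swaps places with A_{2,2} A_{2,3}.
assert [swizzled_col(0, c) for c in range(4)] == [0, 1, 2, 3]
assert [swizzled_col(2, c) for c in range(4)] == [2, 3, 0, 1]
```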
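Similarly, a hedged sketch of the thread-ownership rule behind the coalesced diagram, assuming each thread owns one `threadTileSize` tile, thread tiles are enumerated row-major within a `blockTileSize` block, and the pattern repeats (wraps around) across the rest of the tensor. The exact thread enumeration across the two warps is an assumption here, not something the patch pins down.

```python
def owner_thread(row, col, thread_tile=(2, 2), block_tile=(16, 8)):
    """Thread that owns element (row, col) under the assumed row-major order."""
    r = row % block_tile[0]        # repetitions wrap around the block tile
    c = col % block_tile[1]
    tile_r = r // thread_tile[0]   # which thread tile inside the block tile
    tile_c = c // thread_tile[1]
    tiles_per_row = block_tile[1] // thread_tile[1]
    return tile_r * tiles_per_row + tile_c

# Columns 8..15 are a repetition of columns 0..7, and rows 0 and 1 share
# the same 2-row-tall thread tiles, as in the diagram:
assert owner_thread(0, 8) == owner_thread(0, 0) == 0
assert owner_thread(0, 14) == owner_thread(0, 6) == 3
assert owner_thread(1, 0) == owner_thread(0, 0)
```

Under this ordering, consecutive threads own adjacent column groups of a row-major tensor, so a warp's loads touch contiguous memory, which is the coalescing property the encoding is named for.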