[TritonGPU] Improved documentation and semantics of layout encodings (#30)
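The first three hunks below update an analysis test that checks Contiguity / Divisibility / Constancy; the expected values are unchanged, and the only op-level edit is that the rank-expanding reshape is now spelled tt.view. For example, taken directly from the diff:

    // before
    %2 = tt.reshape %0 : (tensor<128xi32>) -> tensor<128x1xi32>
    // after
    %2 = tt.view %0 : (tensor<128xi32>) -> tensor<128x1xi32>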
@@ -10,7 +10,7 @@ func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {t
 // CHECK-NEXT: Contiguity: [128] ; Divisibility: [65536] ; Constancy: [1]
 %1 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32>
 // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [65536, 1] ; Constancy: [1, 1]
-%2 = tt.reshape %0 : (tensor<128xi32>) -> tensor<128x1xi32>
+%2 = tt.view %0 : (tensor<128xi32>) -> tensor<128x1xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1]
 %3 = tt.splat %arg1 : (i32) -> tensor<128x1xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [1048576, 16] ; Constancy: [1, 1]
@@ -20,7 +20,7 @@ func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {t
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 1]
 %6 = tt.getelementptr %5, %4 : tensor<128x1x!tt.ptr<f32>>
 // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 65536] ; Constancy: [1, 1]
-%7 = tt.reshape %1 : (tensor<128xi32>) -> tensor<1x128xi32>
+%7 = tt.view %1 : (tensor<128xi32>) -> tensor<1x128xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128]
 %8 = tt.broadcast %6 : (tensor<128x1x!tt.ptr<f32>>) -> tensor<128x128x!tt.ptr<f32>>
 // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 65536] ; Constancy: [128, 1]
@@ -28,13 +28,13 @@ func @permute_2d(%arg0: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg1: i32 {t
 // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 16] ; Constancy: [1, 1]
 %10 = tt.getelementptr %8, %9 : tensor<128x128x!tt.ptr<f32>>
 // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [65536, 1] ; Constancy: [1, 1]
-%11 = tt.reshape %0 : (tensor<128xi32>) -> tensor<128x1xi32>
+%11 = tt.view %0 : (tensor<128xi32>) -> tensor<128x1xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [128, 1]
 %12 = tt.splat %arg2 : (!tt.ptr<f32>) -> tensor<128x1x!tt.ptr<f32>>
 // CHECK-NEXT: Contiguity: [128, 1] ; Divisibility: [16, 1] ; Constancy: [1, 1]
 %13 = tt.getelementptr %12, %11 : tensor<128x1x!tt.ptr<f32>>
 // CHECK-NEXT: Contiguity: [1, 128] ; Divisibility: [1, 65536] ; Constancy: [1, 1]
-%14 = tt.reshape %1 : (tensor<128xi32>) -> tensor<1x128xi32>
+%14 = tt.view %1 : (tensor<128xi32>) -> tensor<1x128xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 16] ; Constancy: [1, 128]
 %15 = tt.splat %arg3 : (i32) -> tensor<1x128xi32>
 // CHECK-NEXT: Contiguity: [1, 1] ; Divisibility: [16, 1048576] ; Constancy: [1, 1]
@@ -1,26 +0,0 @@
-// RUN: triton-opt %s -split-input-file -verify-diagnostics
-
-#reg = #triton_gpu.blocked_layout<{
-  threadTileSize = [1, 1],
-  warpTileSize = [32, 1],
-  blockTileSize = [64, 1],
-  order = [0]
-}>
-
-#reg2 = #triton_gpu.blocked_layout<{
-  threadTileSize = [2, 1],
-  warpTileSize = [64, 1],
-  blockTileSize = [128, 1],
-  order = [0]
-}>
-
-func @add(%arg0: tensor<256xi32, #reg>, %arg1: tensor<256xi32, #reg>) {
-  %0 = arith.addi %arg0, %arg1 : tensor<256xi32, #reg>
-  return
-}
-
-func @add(%arg0: tensor<256xi32, #reg>, %arg1: tensor<256xi32, #reg>) { // expected-note {{prior use here}}
-  // expected-error @+1 {{use of value '%arg0' expects different type than prior uses}}
-  %0 = arith.addi %arg0, %arg1 : tensor<256xi32, #reg2>
-  return
-}
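The hunk above deletes a verify-diagnostics test written against the old blocked_layout spelling. The final hunk converts a software-pipelining test to the new compact encoding spellings and wires its RUN line into FileCheck. Side by side (both forms taken verbatim from the hunk below), the blocked encoding changes like this; note that the parameter names change along with the attribute mnemonic:

    // old spelling
    #AL = #triton_gpu.blocked_layout<{
      threadTileSize = [1, 4],
      warpTileSize = [4, 32],
      blockTileSize = [16, 32],
      order = [1, 0]
    }>
    // new spelling
    #AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>

The shared and mma encodings follow the same pattern, with the mma encoding reduced to a version number plus warpsPerCTA.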
@@ -1,45 +1,12 @@
-// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize -tritongpu-verifier
+// RUN: triton-opt %s -split-input-file -tritongpu-pipeline=num-stages=3 -canonicalize -tritongpu-verifier | FileCheck %s
 
 // 4 warps
 // matmul: 128x32 @ 32x128 -> 128x128
-#AL = #triton_gpu.blocked_layout<{
-  threadTileSize = [1, 4],
-  warpTileSize = [4, 32],
-  blockTileSize = [16, 32],
-  order = [1, 0]
-}>
-
-#BL = #triton_gpu.blocked_layout<{
-  threadTileSize = [1, 4],
-  warpTileSize = [1, 128],
-  blockTileSize = [4, 128],
-  order = [1, 0]
-}>
-
-#A = #triton_gpu.shared_layout<{
-  vec = 2,
-  perPhase = 2,
-  maxPhase = 4,
-  order = [1, 0]
-}>
-
-#B = #triton_gpu.shared_layout<{
-  vec = 2,
-  perPhase = 2,
-  maxPhase = 4,
-  order = [1, 0]
-}>
-
-// TODO: check this
-#C = #triton_gpu.mma_layout<{
-  fragmentPerWarp = [1, 1],
-  shapePerWarp = [16, 8],
-  warpPerTile = [2, 2],
-  shapePerTile = [32, 16],
-  repetitions = [4, 4],
-  contigPerThread = [1, 8]
-}>
+#AL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#BL = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#A = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
+#B = #triton_gpu.shared<{vec = 2, perPhase = 2, maxPhase = 4, order = [1, 0]}>
+#C = #triton_gpu.mma<{version = 2, warpsPerCTA = [4, 1]}>
 
+// CHECK: func @matmul_loop
+// CHECK: %[[A0:.*]] = triton_gpu.copy_async