From 7807f64ef38c74476b65e5b07a9695a187675521 Mon Sep 17 00:00:00 2001 From: Yan Da Date: Sun, 5 Jun 2022 16:14:59 +0800 Subject: [PATCH] rename sharded_layout => blocked_layout --- .../Dialect/TritonGPU/IR/TritonGPUAttrDefs.td | 6 +- lib/Dialect/TritonGPU/IR/Dialect.cpp | 6 +- .../Transforms/TritonGPUConversion.cpp | 2 +- lib/Dialect/TritonGPU/Transforms/Verifier.cpp | 2 +- rewrite-test/jit/vecadd/vecadd.mlir | 172 +++++++++--------- test/Triton/vecadd.mlir | 2 + test/TritonGPU/layout.mlir | 4 +- 7 files changed, 98 insertions(+), 96 deletions(-) diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td index b90e1dc61..383271e1a 100644 --- a/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td +++ b/include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td @@ -48,8 +48,8 @@ And the associated TritonGPU MLIR ); } -def TritonGPUShardedEncodingAttr : TritonGPU_Attr<"TritonGPUShardedEncoding"> { - let mnemonic = "sharded_layout"; +def TritonGPUBlockedEncodingAttr : TritonGPU_Attr<"TritonGPUBlockedEncoding"> { + let mnemonic = "blocked_layout"; let description = [{ An encoding where each warp owns a contiguous portion of the target tensor. This is typically the kind of data layout @@ -74,7 +74,7 @@ size } .... A_{63, 0}[T60] A_{63, 1}[T60] ... A_{63, 6}[T63] A_{63, 7}[T63] A_{63, 8}[T60] A_{63, 9}[T60] ... A_{63, 14}[T63] A_{63, 15}[T63] And the associated TritonGPU MLIR -#LAYOUT = #triton_gpu.sharded_layout<{ +#LAYOUT = #triton_gpu.blocked_layout<{ threadTileSize = {2, 2} blockTileSize = {32, 8} }> diff --git a/lib/Dialect/TritonGPU/IR/Dialect.cpp b/lib/Dialect/TritonGPU/IR/Dialect.cpp index a23c07c57..e1cf43c65 100644 --- a/lib/Dialect/TritonGPU/IR/Dialect.cpp +++ b/lib/Dialect/TritonGPU/IR/Dialect.cpp @@ -37,7 +37,7 @@ static LogicalResult parseIntArrayAttr(AsmParser &parser, #include "triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.cpp.inc" Attribute -TritonGPUShardedEncodingAttr::parse(AsmParser &parser, Type type) { +TritonGPUBlockedEncodingAttr::parse(AsmParser &parser, Type type) { if (parser.parseLess().failed()) return {}; // Parse the data as a dictionary @@ -94,14 +94,14 @@ TritonGPUShardedEncodingAttr::parse(AsmParser &parser, Type type) { } } - return parser.getChecked(parser.getContext(), + return parser.getChecked(parser.getContext(), threadTileSize, warpTileSize, blockTileSize, order); } -void TritonGPUShardedEncodingAttr::print(mlir::AsmPrinter &printer) const { +void TritonGPUBlockedEncodingAttr::print(mlir::AsmPrinter &printer) const { printer << "<{" << "threadTileSize = [" << getThreadTileSize() << "]" << ", warpTileSize = [" << getWarpTileSize() << "]" diff --git a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp index 8edd88ad1..d6eb0c329 100644 --- a/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp +++ b/lib/Dialect/TritonGPU/Transforms/TritonGPUConversion.cpp @@ -46,7 +46,7 @@ TritonGPUTypeConverter::TritonGPUTypeConverter(MLIRContext *context, remainingThreads /= blockTileSize[dim]; // TODO: will we need repetition? } - Attribute encoding = triton::gpu::TritonGPUShardedEncodingAttr::get( + Attribute encoding = triton::gpu::TritonGPUBlockedEncodingAttr::get( context, threadTileSize, warpTileSize, blockTileSize, order); return RankedTensorType::get(shape, elementType, encoding); }); diff --git a/lib/Dialect/TritonGPU/Transforms/Verifier.cpp b/lib/Dialect/TritonGPU/Transforms/Verifier.cpp index 619701f85..4a6c4b645 100644 --- a/lib/Dialect/TritonGPU/Transforms/Verifier.cpp +++ b/lib/Dialect/TritonGPU/Transforms/Verifier.cpp @@ -50,7 +50,7 @@ private: if (!encoding) return dotOp.emitError() << name << " should have encoding"; if (!encoding.isa() && - !encoding.isa()) + !encoding.isa()) return dotOp.emitError() << name << " should be of distributed layout"; if (name == 'c') cLayout = encoding; diff --git a/rewrite-test/jit/vecadd/vecadd.mlir b/rewrite-test/jit/vecadd/vecadd.mlir index 1e4434346..4148ec28a 100644 --- a/rewrite-test/jit/vecadd/vecadd.mlir +++ b/rewrite-test/jit/vecadd/vecadd.mlir @@ -40,89 +40,89 @@ module { return } } -module { - func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32) { - %c64 = arith.constant 64 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %c256_i32 = arith.constant 256 : i32 - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = arith.muli %0, %c256_i32 : i32 - %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu<"coalesced encoding">> - %3 = tt.broadcast %1 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %4 = arith.addi %3, %2 : tensor<256xi32, #triton_gpu<"coalesced encoding">> - %5 = tt.broadcast %arg3 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %6 = "triton_gpu.cmpi"(%4, %5) {predicate = 2 : i64} : (tensor<256xi32, #triton_gpu<"coalesced encoding">>, tensor<256xi32, #triton_gpu<"coalesced encoding">>) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %7 = tt.broadcast %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %8 = tt.getelementptr %7, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %9 = tt.broadcast %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %10 = tt.getelementptr %9, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %11 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %12 = arith.index_cast %arg4 : i32 to index - %13 = arith.cmpi slt, %c0, %12 : index - %14 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %15 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %16 = arith.andi %6, %15 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %17 = triton_gpu.copy_async %8, %16, %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %18 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %19 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %20 = arith.andi %6, %19 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %21 = triton_gpu.copy_async %10, %20, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %22 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %23 = tt.getelementptr %8, %22, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %24 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %25 = tt.getelementptr %10, %24, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %26 = arith.cmpi slt, %c32, %12 : index - %27 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %28 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %29 = arith.andi %6, %28 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %30 = triton_gpu.copy_async %23, %29, %27 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %31 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %32 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %33 = arith.andi %6, %32 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %34 = triton_gpu.copy_async %25, %33, %31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %35 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %36 = tt.getelementptr %23, %35, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %37 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %38 = tt.getelementptr %25, %37, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %39 = arith.cmpi slt, %c64, %12 : index - %40 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %41 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %42 = arith.andi %6, %41 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %43 = triton_gpu.copy_async %36, %42, %40 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %44 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %45 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %46 = arith.andi %6, %45 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %47 = triton_gpu.copy_async %38, %46, %44 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %48 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %49 = tt.getelementptr %36, %48, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %50 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %51 = tt.getelementptr %38, %50, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %52:12 = scf.for %arg6 = %c0 to %12 step %c32 iter_args(%arg7 = %11, %arg8 = %8, %arg9 = %10, %arg10 = %17, %arg11 = %30, %arg12 = %43, %arg13 = %21, %arg14 = %34, %arg15 = %47, %arg16 = %51, %arg17 = %49, %arg18 = %c64) -> (tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index) { - %55 = arith.addf %arg10, %arg13 : tensor<256xf32, #triton_gpu<"coalesced encoding">> - %56 = arith.addf %arg7, %55 : tensor<256xf32, #triton_gpu<"coalesced encoding">> - %57 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %58 = tt.getelementptr %arg8, %57, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %59 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %60 = tt.getelementptr %arg9, %59, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %61 = arith.addi %arg18, %c32 : index - %62 = arith.cmpi slt, %61, %12 : index - %63 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %64 = tt.broadcast %62 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> - %65 = arith.andi %64, %6 : tensor<256xi1, #triton_gpu<"coalesced encoding">> - %66 = triton_gpu.copy_async %arg17, %65, %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %67 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %68 = triton_gpu.copy_async %arg16, %65, %67 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> - %69 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %70 = tt.getelementptr %arg17, %69, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %71 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> - %72 = tt.getelementptr %arg16, %71, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - scf.yield %56, %58, %60, %arg11, %arg12, %66, %arg14, %arg15, %68, %72, %70, %61 : tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index - } - %53 = tt.broadcast %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - %54 = tt.getelementptr %53, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> - tt.store %54, %52#0, %6, : tensor<256xf32, #triton_gpu<"coalesced encoding">> - return - } -} +// module { +// func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32) { +// %c64 = arith.constant 64 : index +// %c32 = arith.constant 32 : index +// %c0 = arith.constant 0 : index +// %cst = arith.constant 0.000000e+00 : f32 +// %c256_i32 = arith.constant 256 : i32 +// %0 = tt.get_program_id {axis = 0 : i32} : i32 +// %1 = arith.muli %0, %c256_i32 : i32 +// %2 = tt.make_range {end = 256 : i32, start = 0 : i32} : tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %3 = tt.broadcast %1 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %4 = arith.addi %3, %2 : tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %5 = tt.broadcast %arg3 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %6 = "triton_gpu.cmpi"(%4, %5) {predicate = 2 : i64} : (tensor<256xi32, #triton_gpu<"coalesced encoding">>, tensor<256xi32, #triton_gpu<"coalesced encoding">>) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %7 = tt.broadcast %arg0 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %8 = tt.getelementptr %7, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %9 = tt.broadcast %arg1 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %10 = tt.getelementptr %9, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %11 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %12 = arith.index_cast %arg4 : i32 to index +// %13 = arith.cmpi slt, %c0, %12 : index +// %14 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %15 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %16 = arith.andi %6, %15 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %17 = triton_gpu.copy_async %8, %16, %14 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %18 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %19 = tt.broadcast %13 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %20 = arith.andi %6, %19 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %21 = triton_gpu.copy_async %10, %20, %18 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %22 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %23 = tt.getelementptr %8, %22, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %24 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %25 = tt.getelementptr %10, %24, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %26 = arith.cmpi slt, %c32, %12 : index +// %27 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %28 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %29 = arith.andi %6, %28 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %30 = triton_gpu.copy_async %23, %29, %27 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %31 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %32 = tt.broadcast %26 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %33 = arith.andi %6, %32 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %34 = triton_gpu.copy_async %25, %33, %31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %35 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %36 = tt.getelementptr %23, %35, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %37 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %38 = tt.getelementptr %25, %37, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %39 = arith.cmpi slt, %c64, %12 : index +// %40 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %41 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %42 = arith.andi %6, %41 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %43 = triton_gpu.copy_async %36, %42, %40 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %44 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %45 = tt.broadcast %39 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %46 = arith.andi %6, %45 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %47 = triton_gpu.copy_async %38, %46, %44 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %48 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %49 = tt.getelementptr %36, %48, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %50 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %51 = tt.getelementptr %38, %50, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %52:12 = scf.for %arg6 = %c0 to %12 step %c32 iter_args(%arg7 = %11, %arg8 = %8, %arg9 = %10, %arg10 = %17, %arg11 = %30, %arg12 = %43, %arg13 = %21, %arg14 = %34, %arg15 = %47, %arg16 = %51, %arg17 = %49, %arg18 = %c64) -> (tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index) { +// %55 = arith.addf %arg10, %arg13 : tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %56 = arith.addf %arg7, %55 : tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %57 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %58 = tt.getelementptr %arg8, %57, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %59 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %60 = tt.getelementptr %arg9, %59, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %61 = arith.addi %arg18, %c32 : index +// %62 = arith.cmpi slt, %61, %12 : index +// %63 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %64 = tt.broadcast %62 : (i1) -> tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %65 = arith.andi %64, %6 : tensor<256xi1, #triton_gpu<"coalesced encoding">> +// %66 = triton_gpu.copy_async %arg17, %65, %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %67 = tt.broadcast %cst : (f32) -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %68 = triton_gpu.copy_async %arg16, %65, %67 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> -> tensor<256xf32, #triton_gpu<"coalesced encoding">> +// %69 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %70 = tt.getelementptr %arg17, %69, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %71 = tt.broadcast %arg5 : (i32) -> tensor<256xi32, #triton_gpu<"coalesced encoding">> +// %72 = tt.getelementptr %arg16, %71, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// scf.yield %56, %58, %60, %arg11, %arg12, %66, %arg14, %arg15, %68, %72, %70, %61 : tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256xf32, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">>, index +// } +// %53 = tt.broadcast %arg2 : (!tt.ptr) -> tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// %54 = tt.getelementptr %53, %4, : tensor<256x!tt.ptr, #triton_gpu<"coalesced encoding">> +// tt.store %54, %52#0, %6, : tensor<256xf32, #triton_gpu<"coalesced encoding">> +// return +// } +// } diff --git a/test/Triton/vecadd.mlir b/test/Triton/vecadd.mlir index 4148ec28a..8e4e23b69 100644 --- a/test/Triton/vecadd.mlir +++ b/test/Triton/vecadd.mlir @@ -1,3 +1,5 @@ +// RUN: triton-opt %s -tritongpu-verifier -verify-diagnostics + module { func @add_kernel__Pfp32_Pfp32_Pfp32_i32_i32_i32__(%arg0: !tt.ptr, %arg1: !tt.ptr, %arg2: !tt.ptr, %arg3: i32, %arg4: i32, %arg5: i32) { %0 = tt.get_program_id {axis = 0 : i32} : i32 diff --git a/test/TritonGPU/layout.mlir b/test/TritonGPU/layout.mlir index cccb04f24..e03c018bf 100644 --- a/test/TritonGPU/layout.mlir +++ b/test/TritonGPU/layout.mlir @@ -1,13 +1,13 @@ // RUN: triton-opt %s -split-input-file -verify-diagnostics -#reg = #triton_gpu.sharded_layout<{ +#reg = #triton_gpu.blocked_layout<{ threadTileSize = [1, 1], warpTileSize = [32, 1], blockTileSize = [64, 1], order = [0] }> -#reg2 = #triton_gpu.sharded_layout<{ +#reg2 = #triton_gpu.blocked_layout<{ threadTileSize = [2, 1], warpTileSize = [64, 1], blockTileSize = [128, 1],