From edca91bf8f15ffcec5844f1da55a3dd3de40e66f Mon Sep 17 00:00:00 2001
From: Yan Da
Date: Wed, 27 Apr 2022 19:41:07 +0800
Subject: [PATCH] Update traits (NoSideEffect)

---
 include/triton/Dialect/Triton/IR/Dialect.h    |   4 +-
 include/triton/Dialect/Triton/IR/TritonOps.td |   6 +-
 include/triton/transforms/Passes.h            |  12 -
 lib/Dialect/Triton/IR/Dialect.cpp             |   4 +-
 lib/Dialect/Triton/IR/Ops.cpp                 |   4 +-
 rewrite-test/jit/matmul/matmul.mlir           | 259 +-----------------
 6 files changed, 17 insertions(+), 272 deletions(-)
 delete mode 100644 include/triton/transforms/Passes.h

diff --git a/include/triton/Dialect/Triton/IR/Dialect.h b/include/triton/Dialect/Triton/IR/Dialect.h
index fc25667b9..d5c8d2f43 100644
--- a/include/triton/Dialect/Triton/IR/Dialect.h
+++ b/include/triton/Dialect/Triton/IR/Dialect.h
@@ -9,8 +9,8 @@
 #include "mlir/Dialect/SCF/SCF.h"
 #include "triton/Dialect/Triton/IR/Traits.h"
 
-#include "triton/Dialect/Triton/Dialect.h.inc"
-#include "triton/Dialect/Triton/OpsEnums.h.inc"
+#include "triton/Dialect/Triton/IR/Dialect.h.inc"
+#include "triton/Dialect/Triton/IR/OpsEnums.h.inc"
 
 #define GET_OP_CLASSES
 #include "triton/Dialect/Triton/IR/Ops.h.inc"
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
index f664fba9a..de7e11d43 100644
--- a/include/triton/Dialect/Triton/IR/TritonOps.td
+++ b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -198,7 +198,7 @@ def TT_GEPOp : TT_Op<"getelementptr",
 //
 // Shape Manipulation Ops
 //
-def TT_ReshapeOp : TT_Op<"reshape", [SameOperandsAndResultElementType]> {
+def TT_ReshapeOp : TT_Op<"reshape", [NoSideEffect, SameOperandsAndResultElementType]> {
   let summary = "reshape";
 
   let arguments = (ins TT_Tensor:$src);
@@ -208,7 +208,7 @@ def TT_ReshapeOp : TT_Op<"reshape", [SameOperandsAndResultElementType]> {
   let assemblyFormat = "$src attr-dict `:` functional-type(operands, results)";
 }
 
-def TT_BroadcastOp : TT_Op<"broadcast", [SameOperandsAndResultElementType]> {
+def TT_BroadcastOp : TT_Op<"broadcast", [NoSideEffect, SameOperandsAndResultElementType]> {
   let summary = "broadcast";
 
   let arguments = (ins TT_Type:$src);
@@ -220,7 +220,7 @@ def TT_BroadcastOp : TT_Op<"broadcast", [SameOperandsAndResultElementType]> {
   let hasFolder = 1;
 }
 
-def TT_CatOp : TT_Op<"cat", [SameOperandsAndResultElementType]> {
+def TT_CatOp : TT_Op<"cat", [NoSideEffect, SameOperandsAndResultElementType]> {
   let summary = "concatenate 2 tensors";
 
   let arguments = (ins TT_Tensor:$lhs, TT_Tensor:$rhs);
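Note on the hunks above: NoSideEffect marks tt.reshape, tt.broadcast, and tt.cat as pure, which is what allows MLIR's generic folding, CSE, and dead-code elimination to act on them. A minimal sketch of the mechanism, using only upstream MLIR APIs (the helper name erasableWhenUnused is invented for illustration and is not part of this patch):

    #include "mlir/IR/Operation.h"
    #include "mlir/Interfaces/SideEffectInterfaces.h"

    // A side-effect-free op whose results are all unused is "trivially
    // dead", so canonicalization may erase it. Without NoSideEffect the
    // conservative answer is false and the op is always kept.
    static bool erasableWhenUnused(mlir::Operation *op) {
      return mlir::isOpTriviallyDead(op);
    }
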
diff --git a/include/triton/transforms/Passes.h b/include/triton/transforms/Passes.h
deleted file mode 100644
index 69b1593a3..000000000
--- a/include/triton/transforms/Passes.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef TRITON_TRANSFORMS_PASSES_H_
-#define TRITON_TRANSFORMS_PASSES_H_
-
-#include "mlir/Pass/Pass.h"
-
-std::unique_ptr<mlir::Pass> createCombineOpsPass();
-
-// // Registration
-// #define GEN_PASS_REGISTRATION
-// #include
-
-#endif // TRITON_TRANSFORMS_PASSES_H_
diff --git a/lib/Dialect/Triton/IR/Dialect.cpp b/lib/Dialect/Triton/IR/Dialect.cpp
index e21384d47..0b0adef6d 100644
--- a/lib/Dialect/Triton/IR/Dialect.cpp
+++ b/lib/Dialect/Triton/IR/Dialect.cpp
@@ -8,7 +8,7 @@
 
 #include "mlir/IR/DialectImplementation.h"
 
-#include "triton/Dialect/Triton/Dialect.cpp.inc"
+#include "triton/Dialect/Triton/IR/Dialect.cpp.inc"
 
 using namespace mlir;
 using namespace mlir::triton;
@@ -18,7 +18,7 @@ void TritonDialect::initialize() {
   addOperations<
 #define GET_OP_LIST
-#include "triton/Dialect/Triton/Ops.cpp.inc"
+#include "triton/Dialect/Triton/IR/Ops.cpp.inc"
       >();
 
   // We can also add interface here.
diff --git a/lib/Dialect/Triton/IR/Ops.cpp b/lib/Dialect/Triton/IR/Ops.cpp
index acb4adf6c..a8f9e4880 100644
--- a/lib/Dialect/Triton/IR/Ops.cpp
+++ b/lib/Dialect/Triton/IR/Ops.cpp
@@ -38,10 +38,10 @@ static Type getPointerTypeFromTensor(Type type) {
 }
 
 #define GET_OP_CLASSES
-#include "triton/Dialect/Triton/Ops.cpp.inc"
+#include "triton/Dialect/Triton/IR/Ops.cpp.inc"
 
 // enum attribute definitions
-#include "triton/Dialect/Triton/OpsEnums.cpp.inc"
+#include "triton/Dialect/Triton/IR/OpsEnums.cpp.inc"
 
 namespace mlir {
 namespace triton {
diff --git a/rewrite-test/jit/matmul/matmul.mlir b/rewrite-test/jit/matmul/matmul.mlir
index 8ce9154bf..b73890fdd 100644
--- a/rewrite-test/jit/matmul/matmul.mlir
+++ b/rewrite-test/jit/matmul/matmul.mlir
@@ -1,251 +1,8 @@
-module {
-  func @matmul_kernel__Pfp16_Pfp16_Pfp16_i32_i32_i32_i32_i32_i32__7c1_9c1_11c1_12c64_13c64_14c32_15c8(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-    %0 = tt.get_program_id {axis = 0 : i32} : i32
-    %1 = call @"cdiv__i32__1cconstexpr[64]"(%arg3) : (i32) -> i32
-    %2 = call @"cdiv__i32__1cconstexpr[64]"(%arg4) : (i32) -> i32
-    %c8_i32 = arith.constant 8 : i32
-    %3 = arith.muli %2, %c8_i32 : i32
-    %4 = arith.divsi %0, %3 : i32
-    %c8_i32_0 = arith.constant 8 : i32
-    %5 = arith.muli %4, %c8_i32_0 : i32
-    %6 = arith.subi %1, %5 : i32
-    %7 = call @"minimum__i32__1cconstexpr[8]"(%6) : (i32) -> i32
-    %8 = arith.remsi %0, %7 : i32
-    %9 = arith.addi %5, %8 : i32
-    %10 = arith.remsi %0, %3 : i32
-    %11 = arith.divsi %10, %7 : i32
-    %c64_i32 = arith.constant 64 : i32
-    %12 = arith.muli %9, %c64_i32 : i32
-    %13 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %14 = tt.broadcast %12 : (i32) -> tensor<64xi32>
-    %15 = arith.addi %14, %13 : tensor<64xi32>
-    %c64_i32_1 = arith.constant 64 : i32
-    %16 = arith.muli %11, %c64_i32_1 : i32
-    %17 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %18 = tt.broadcast %16 : (i32) -> tensor<64xi32>
-    %19 = arith.addi %18, %17 : tensor<64xi32>
-    %20 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
-    %21 = tt.reshape %15 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %22 = tt.broadcast %arg6 : (i32) -> tensor<64x1xi32>
-    %23 = arith.muli %21, %22 : tensor<64x1xi32>
-    %24 = tt.reshape %20 : (tensor<32xi32>) -> tensor<1x32xi32>
-    %c1_i32 = arith.constant 1 : i32
-    %25 = tt.broadcast %c1_i32 : (i32) -> tensor<1x32xi32>
-    %26 = arith.muli %24, %25 : tensor<1x32xi32>
-    %27 = tt.broadcast %23 : (tensor<64x1xi32>) -> tensor<64x32xi32>
-    %28 = tt.broadcast %26 : (tensor<1x32xi32>) -> tensor<64x32xi32>
-    %29 = arith.addi %27, %28 : tensor<64x32xi32>
-    %30 = tt.broadcast %arg0 : (!tt.ptr<f16>) -> tensor<64x32x!tt.ptr<f16>>
-    %31 = tt.getelementptr %30, %29, : tensor<64x32x!tt.ptr<f16>>
-    %32 = tt.reshape %20 : (tensor<32xi32>) -> tensor<32x1xi32>
-    %33 = tt.broadcast %arg7 : (i32) -> tensor<32x1xi32>
-    %34 = arith.muli %32, %33 : tensor<32x1xi32>
-    %35 = tt.reshape %19 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %c1_i32_2 = arith.constant 1 : i32
-    %36 = tt.broadcast %c1_i32_2 : (i32) -> tensor<1x64xi32>
-    %37 = arith.muli %35, %36 : tensor<1x64xi32>
-    %38 = tt.broadcast %34 : (tensor<32x1xi32>) -> tensor<32x64xi32>
-    %39 = tt.broadcast %37 : (tensor<1x64xi32>) -> tensor<32x64xi32>
-    %40 = arith.addi %38, %39 : tensor<32x64xi32>
-    %41 = tt.broadcast %arg1 : (!tt.ptr<f16>) -> tensor<32x64x!tt.ptr<f16>>
-    %42 = tt.getelementptr %41, %40, : tensor<32x64x!tt.ptr<f16>>
-    %cst = arith.constant 0.000000e+00 : f32
-    %43 = tt.broadcast %cst : (f32) -> tensor<64x64xf32>
-    %c0_i32 = arith.constant 0 : i32
-    %c32_i32 = arith.constant 32 : i32
-    %44 = arith.index_cast %c0_i32 : i32 to index
-    %45 = arith.index_cast %arg5 : i32 to index
-    %46 = arith.index_cast %c32_i32 : i32 to index
-    %47:3 = scf.for %arg9 = %44 to %45 step %46 iter_args(%arg10 = %43, %arg11 = %31, %arg12 = %42) -> (tensor<64x64xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x64x!tt.ptr<f16>>) {
-      %cst_6 = arith.constant dense<true> : tensor<64x32xi1>
-      %cst_7 = arith.constant dense<0.000000e+00> : tensor<64x32xf16>
-      %77 = tt.load %arg11, %cst_6, %cst_7 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32xf16>
-      %cst_8 = arith.constant dense<true> : tensor<32x64xi1>
-      %cst_9 = arith.constant dense<0.000000e+00> : tensor<32x64xf16>
-      %78 = tt.load %arg12, %cst_8, %cst_9 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x64xf16>
-      %cst_10 = arith.constant 0.000000e+00 : f32
-      %79 = tt.broadcast %cst_10 : (f32) -> tensor<64x64xf32>
-      %80 = tt.dot %77, %78, %79 {allowTF32 = true} : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32>
-      %81 = arith.addf %arg10, %80 : tensor<64x64xf32>
-      %c32_i32_11 = arith.constant 32 : i32
-      %82 = tt.broadcast %c32_i32_11 : (i32) -> tensor<64x32xi32>
-      %83 = tt.getelementptr %arg11, %82, : tensor<64x32x!tt.ptr<f16>>
-      %c32_i32_12 = arith.constant 32 : i32
-      %84 = arith.muli %arg7, %c32_i32_12 : i32
-      %85 = tt.broadcast %84 : (i32) -> tensor<32x64xi32>
-      %86 = tt.getelementptr %arg12, %85, : tensor<32x64x!tt.ptr<f16>>
-      scf.yield %81, %83, %86 : tensor<64x64xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x64x!tt.ptr<f16>>
-    }
-    %48 = arith.truncf %47#0 : tensor<64x64xf32> to tensor<64x64xf16>
-    %c64_i32_3 = arith.constant 64 : i32
-    %49 = arith.muli %9, %c64_i32_3 : i32
-    %50 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %51 = tt.broadcast %49 : (i32) -> tensor<64xi32>
-    %52 = arith.addi %51, %50 : tensor<64xi32>
-    %c64_i32_4 = arith.constant 64 : i32
-    %53 = arith.muli %11, %c64_i32_4 : i32
-    %54 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %55 = tt.broadcast %53 : (i32) -> tensor<64xi32>
-    %56 = arith.addi %55, %54 : tensor<64xi32>
-    %57 = tt.reshape %52 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %58 = tt.broadcast %arg8 : (i32) -> tensor<64x1xi32>
-    %59 = arith.muli %58, %57 : tensor<64x1xi32>
-    %60 = tt.broadcast %arg2 : (!tt.ptr<f16>) -> tensor<64x1x!tt.ptr<f16>>
-    %61 = tt.getelementptr %60, %59, : tensor<64x1x!tt.ptr<f16>>
-    %62 = tt.reshape %56 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %c1_i32_5 = arith.constant 1 : i32
-    %63 = tt.broadcast %c1_i32_5 : (i32) -> tensor<1x64xi32>
-    %64 = arith.muli %62, %63 : tensor<1x64xi32>
-    %65 = tt.broadcast %61 : (tensor<64x1x!tt.ptr<f16>>) -> tensor<64x64x!tt.ptr<f16>>
-    %66 = tt.broadcast %64 : (tensor<1x64xi32>) -> tensor<64x64xi32>
-    %67 = tt.getelementptr %65, %66, : tensor<64x64x!tt.ptr<f16>>
-    %68 = tt.reshape %52 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %69 = tt.broadcast %arg3 : (i32) -> tensor<64x1xi32>
-    %70 = arith.cmpi slt, %68, %69 : tensor<64x1xi32>
-    %71 = tt.reshape %56 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %72 = tt.broadcast %arg4 : (i32) -> tensor<1x64xi32>
-    %73 = arith.cmpi slt, %71, %72 : tensor<1x64xi32>
-    %74 = tt.broadcast %70 : (tensor<64x1xi1>) -> tensor<64x64xi1>
-    %75 = tt.broadcast %73 : (tensor<1x64xi1>) -> tensor<64x64xi1>
-    %76 = arith.andi %74, %75 : tensor<64x64xi1>
-    tt.store %67, %48, %76, : tensor<64x64xf16>
-    return
-  }
-  func @"cdiv__i32__1cconstexpr[64]"(%arg0: i32) -> i32 {
-    %c64_i32 = arith.constant 64 : i32
-    %0 = arith.addi %arg0, %c64_i32 : i32
-    %c1_i32 = arith.constant 1 : i32
-    %1 = arith.subi %0, %c1_i32 : i32
-    %c64_i32_0 = arith.constant 64 : i32
-    %2 = arith.divsi %1, %c64_i32_0 : i32
-    return %2 : i32
-  }
-  func @"minimum__i32__1cconstexpr[8]"(%arg0: i32) -> i32 {
-    %c8_i32 = arith.constant 8 : i32
-    %0 = arith.cmpi slt, %arg0, %c8_i32 : i32
-    %c8_i32_0 = arith.constant 8 : i32
-    %1 = select %0, %arg0, %c8_i32_0 : i32
-    return %1 : i32
-  }
-}
-module {
-  func @matmul_kernel__Pfp16_Pfp16_Pfp16_i32_i32_i32_i32_i32_i32__7c1_9c1_11c1_12c64_13c64_14c32_15c8(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-    %c1_i32 = arith.constant 1 : i32
-    %c32_i32 = arith.constant 32 : i32
-    %cst = arith.constant dense<0.000000e+00> : tensor<32x64xf16>
-    %cst_0 = arith.constant dense<true> : tensor<32x64xi1>
-    %cst_1 = arith.constant dense<0.000000e+00> : tensor<64x32xf16>
-    %cst_2 = arith.constant dense<true> : tensor<64x32xi1>
-    %c32 = arith.constant 32 : index
-    %c0 = arith.constant 0 : index
-    %cst_3 = arith.constant 0.000000e+00 : f32
-    %c64_i32 = arith.constant 64 : i32
-    %c63_i32 = arith.constant 63 : i32
-    %c8_i32 = arith.constant 8 : i32
-    %0 = tt.get_program_id {axis = 0 : i32} : i32
-    %1 = arith.addi %arg3, %c63_i32 : i32
-    %2 = arith.divsi %1, %c64_i32 : i32
-    %3 = arith.addi %arg4, %c63_i32 : i32
-    %4 = arith.divsi %3, %c64_i32 : i32
-    %5 = arith.muli %4, %c8_i32 : i32
-    %6 = arith.divsi %0, %5 : i32
-    %7 = arith.muli %6, %c8_i32 : i32
-    %8 = arith.subi %2, %7 : i32
-    %9 = arith.cmpi slt, %8, %c8_i32 : i32
-    %10 = select %9, %8, %c8_i32 : i32
-    %11 = arith.remsi %0, %10 : i32
-    %12 = arith.addi %7, %11 : i32
-    %13 = arith.remsi %0, %5 : i32
-    %14 = arith.divsi %13, %10 : i32
-    %15 = arith.muli %12, %c64_i32 : i32
-    %16 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %17 = tt.broadcast %15 : (i32) -> tensor<64xi32>
-    %18 = arith.addi %17, %16 : tensor<64xi32>
-    %19 = arith.muli %14, %c64_i32 : i32
-    %20 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %21 = tt.broadcast %19 : (i32) -> tensor<64xi32>
-    %22 = arith.addi %21, %20 : tensor<64xi32>
-    %23 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32>
-    %24 = tt.reshape %18 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %25 = tt.broadcast %arg6 : (i32) -> tensor<64x1xi32>
-    %26 = arith.muli %24, %25 : tensor<64x1xi32>
-    %27 = tt.reshape %23 : (tensor<32xi32>) -> tensor<1x32xi32>
-    %28 = tt.broadcast %c1_i32 : (i32) -> tensor<1x32xi32>
-    %29 = arith.muli %27, %28 : tensor<1x32xi32>
-    %30 = tt.broadcast %26 : (tensor<64x1xi32>) -> tensor<64x32xi32>
-    %31 = tt.broadcast %29 : (tensor<1x32xi32>) -> tensor<64x32xi32>
-    %32 = arith.addi %30, %31 : tensor<64x32xi32>
-    %33 = tt.broadcast %arg0 : (!tt.ptr<f16>) -> tensor<64x32x!tt.ptr<f16>>
-    %34 = tt.getelementptr %33, %32, : tensor<64x32x!tt.ptr<f16>>
-    %35 = tt.reshape %23 : (tensor<32xi32>) -> tensor<32x1xi32>
-    %36 = tt.broadcast %arg7 : (i32) -> tensor<32x1xi32>
-    %37 = arith.muli %35, %36 : tensor<32x1xi32>
-    %38 = tt.reshape %22 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %39 = tt.broadcast %c1_i32 : (i32) -> tensor<1x64xi32>
-    %40 = arith.muli %38, %39 : tensor<1x64xi32>
-    %41 = tt.broadcast %37 : (tensor<32x1xi32>) -> tensor<32x64xi32>
-    %42 = tt.broadcast %40 : (tensor<1x64xi32>) -> tensor<32x64xi32>
-    %43 = arith.addi %41, %42 : tensor<32x64xi32>
-    %44 = tt.broadcast %arg1 : (!tt.ptr<f16>) -> tensor<32x64x!tt.ptr<f16>>
-    %45 = tt.getelementptr %44, %43, : tensor<32x64x!tt.ptr<f16>>
-    %46 = tt.broadcast %cst_3 : (f32) -> tensor<64x64xf32>
-    %47 = arith.index_cast %arg5 : i32 to index
-    %48:3 = scf.for %arg9 = %c0 to %47 step %c32 iter_args(%arg10 = %46, %arg11 = %34, %arg12 = %45) -> (tensor<64x64xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x64x!tt.ptr<f16>>) {
-      %78 = tt.load %arg11, %cst_2, %cst_1 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x32xf16>
-      %79 = tt.load %arg12, %cst_0, %cst {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<32x64xf16>
-      %80 = tt.broadcast %cst_3 : (f32) -> tensor<64x64xf32>
-      %81 = tt.dot %78, %79, %80 {allowTF32 = true} : tensor<64x32xf16> * tensor<32x64xf16> -> tensor<64x64xf32>
-      %82 = arith.addf %arg10, %81 : tensor<64x64xf32>
-      %83 = tt.broadcast %c32_i32 : (i32) -> tensor<64x32xi32>
-      %84 = tt.getelementptr %arg11, %83, : tensor<64x32x!tt.ptr<f16>>
-      %85 = arith.muli %arg7, %c32_i32 : i32
-      %86 = tt.broadcast %85 : (i32) -> tensor<32x64xi32>
-      %87 = tt.getelementptr %arg12, %86, : tensor<32x64x!tt.ptr<f16>>
-      scf.yield %82, %84, %87 : tensor<64x64xf32>, tensor<64x32x!tt.ptr<f16>>, tensor<32x64x!tt.ptr<f16>>
-    }
-    %49 = arith.truncf %48#0 : tensor<64x64xf32> to tensor<64x64xf16>
-    %50 = arith.muli %12, %c64_i32 : i32
-    %51 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %52 = tt.broadcast %50 : (i32) -> tensor<64xi32>
-    %53 = arith.addi %52, %51 : tensor<64xi32>
-    %54 = arith.muli %14, %c64_i32 : i32
-    %55 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32>
-    %56 = tt.broadcast %54 : (i32) -> tensor<64xi32>
-    %57 = arith.addi %56, %55 : tensor<64xi32>
-    %58 = tt.reshape %53 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %59 = tt.broadcast %arg8 : (i32) -> tensor<64x1xi32>
-    %60 = arith.muli %59, %58 : tensor<64x1xi32>
-    %61 = tt.broadcast %arg2 : (!tt.ptr<f16>) -> tensor<64x1x!tt.ptr<f16>>
-    %62 = tt.getelementptr %61, %60, : tensor<64x1x!tt.ptr<f16>>
-    %63 = tt.reshape %57 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %64 = tt.broadcast %c1_i32 : (i32) -> tensor<1x64xi32>
-    %65 = arith.muli %63, %64 : tensor<1x64xi32>
-    %66 = tt.broadcast %62 : (tensor<64x1x!tt.ptr<f16>>) -> tensor<64x64x!tt.ptr<f16>>
-    %67 = tt.broadcast %65 : (tensor<1x64xi32>) -> tensor<64x64xi32>
-    %68 = tt.getelementptr %66, %67, : tensor<64x64x!tt.ptr<f16>>
-    %69 = tt.reshape %53 : (tensor<64xi32>) -> tensor<64x1xi32>
-    %70 = tt.broadcast %arg3 : (i32) -> tensor<64x1xi32>
-    %71 = arith.cmpi slt, %69, %70 : tensor<64x1xi32>
-    %72 = tt.reshape %57 : (tensor<64xi32>) -> tensor<1x64xi32>
-    %73 = tt.broadcast %arg4 : (i32) -> tensor<1x64xi32>
-    %74 = arith.cmpi slt, %72, %73 : tensor<1x64xi32>
-    %75 = tt.broadcast %71 : (tensor<64x1xi1>) -> tensor<64x64xi1>
-    %76 = tt.broadcast %74 : (tensor<1x64xi1>) -> tensor<64x64xi1>
-    %77 = arith.andi %75, %76 : tensor<64x64xi1>
-    tt.store %68, %49, %77, : tensor<64x64xf16>
-    return
-  }
-  func @"cdiv__i32__1cconstexpr[64]"(%arg0: i32) -> i32 {
-    %c63_i32 = arith.constant 63 : i32
-    %c64_i32 = arith.constant 64 : i32
-    %0 = arith.addi %arg0, %c63_i32 : i32
-    %1 = arith.divsi %0, %c64_i32 : i32
-    return %1 : i32
-  }
-  func @"minimum__i32__1cconstexpr[8]"(%arg0: i32) -> i32 {
-    %c8_i32 = arith.constant 8 : i32
-    %0 = arith.cmpi slt, %arg0, %c8_i32 : i32
-    %1 = select %0, %arg0, %c8_i32 : i32
-    return %1 : i32
-  }
-}
+Traceback (most recent call last):
+  File "matmul.py", line 1, in <module>
+    import triton
+  File "/home/da/miniconda3/envs/torch-src/lib/python3.7/site-packages/triton-2.0.0-py3.7-linux-x86_64.egg/triton/__init__.py", line 9, in <module>
+    from .code_gen import cdiv, next_power_of_2, jit, autotune, heuristics, \
+  File "/home/da/miniconda3/envs/torch-src/lib/python3.7/site-packages/triton-2.0.0-py3.7-linux-x86_64.egg/triton/code_gen.py", line 23, in <module>
+    import triton._C.libtriton.triton as _triton
+ImportError: /home/da/miniconda3/envs/torch-src/lib/python3.7/site-packages/triton-2.0.0-py3.7-linux-x86_64.egg/triton/_C/libtriton.so: undefined symbol: _ZN4mlir6triton5CatOp10getEffectsERN4llvm15SmallVectorImplINS_11SideEffects14EffectInstanceINS_13MemoryEffects6EffectEEEEE