From 993bc1731173b137cc09c1f45dc01b29551ad377 Mon Sep 17 00:00:00 2001 From: Phil Tillet Date: Mon, 9 Jan 2023 22:35:00 -0800 Subject: [PATCH] cleanup --- python/being-optimized.ttgir | 169 --- python/bwd.ptx | 2764 ---------------------------------- python/bwd.ttgir | 169 --- python/flash-attention.ttgir | 159 -- python/slow.ttgir | 168 --- python/unoptimized.ttgir | 178 --- 6 files changed, 3607 deletions(-) delete mode 100644 python/being-optimized.ttgir delete mode 100644 python/bwd.ptx delete mode 100644 python/bwd.ttgir delete mode 100644 python/flash-attention.ttgir delete mode 100644 python/slow.ttgir delete mode 100644 python/unoptimized.ttgir diff --git a/python/being-optimized.ttgir b/python/being-optimized.ttgir deleted file mode 100644 index 887ff1246..000000000 --- a/python/being-optimized.ttgir +++ /dev/null @@ -1,169 +0,0 @@ -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma0 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1]}> -#mma1 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2]}> -#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1]}> -module attributes {"triton_gpu.num-warps" = 8 : i32} { - func public @_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32 {tt.divisibility = 16 : i32}, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32, %arg22: i32 {tt.divisibility = 16 : i32}, %arg23: i32 {tt.divisibility = 16 : i32}, %arg24: i32) { - %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma0> - %cst_0 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #mma0> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %cst_10 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %c128 = arith.constant 128 : index - %c128_i32 = arith.constant 128 : i32 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = arith.divsi %0, %arg22 : i32 - %2 = arith.remsi %0, %arg22 : i32 - %3 = arith.muli %1, %arg12 : i32 - %4 = arith.muli %2, %arg13 : i32 - %5 = arith.addi %3, %4 : i32 - %6 = tt.addptr %arg0, %5 : !tt.ptr, i32 - %7 = tt.addptr %arg1, %5 : !tt.ptr, i32 - %8 = tt.addptr %arg2, %5 : !tt.ptr, i32 - %9 = tt.addptr %arg5, %5 : !tt.ptr, i32 - %10 = tt.addptr %arg6, %5 : !tt.ptr, i32 - %11 = tt.addptr %arg7, %5 : !tt.ptr, i32 - %12 = tt.addptr %arg8, %5 : !tt.ptr, i32 - %13 = arith.index_cast %arg24 : i32 to index - %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %19 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> - %20 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked2> - %21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %22 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %23 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %25 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2> - %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %27 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %28 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1> - %29 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %31 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked2> - %33 = arith.muli %0, %arg23 : i32 - %34 = tt.addptr %arg11, %33 : !tt.ptr, i32 - %35 = tt.addptr %arg10, %33 : !tt.ptr, i32 - %36 = arith.muli %arg24, %c128_i32 : i32 - %37 = arith.index_cast %36 : i32 to index - %38 = tt.splat %35 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %39 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma0> - %40 = tt.splat %34 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %41 = arith.muli %arg14, %c128_i32 : i32 - %42 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked1> - %43 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked2> - %44 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %45 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - scf.for %arg25 = %c0 to %13 step %c1 { - %46 = arith.index_cast %arg25 : index to i32 - %47 = arith.muli %46, %c128_i32 : i32 - %48 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %49 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %50 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %51 = arith.addi %48, %15 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %52 = arith.addi %49, %16 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %53 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %54 = tt.expand_dims %52 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<128x1xi32, #blocked2> - %55 = arith.muli %53, %28 : tensor<128x1xi32, #blocked1> - %56 = tt.broadcast %55 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %57 = arith.addi %56, %24 : tensor<128x64xi32, #blocked1> - %58 = tt.addptr %29, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %59 = tt.load %58 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %60 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %61 = arith.muli %53, %19 : tensor<128x1xi32, #blocked1> - %62 = tt.broadcast %61 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %63 = arith.addi %62, %24 : tensor<128x64xi32, #blocked1> - %64 = tt.addptr %30, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %65 = tt.load %64 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %66 = triton_gpu.convert_layout %65 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %67 = arith.index_cast %47 : i32 to index - %68 = arith.addi %50, %17 : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %69 = tt.expand_dims %68 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>>) -> tensor<1x128xi32, #mma0> - %70 = tt.broadcast %69 : (tensor<1x128xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %71 = arith.muli %54, %20 : tensor<128x1xi32, #blocked2> - %72 = tt.broadcast %71 : (tensor<128x1xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %73 = arith.addi %72, %26 : tensor<128x64xi32, #blocked2> - %74 = tt.addptr %32, %73 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %75 = tt.addptr %27, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %76 = tt.addptr %31, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %77:5 = scf.for %arg26 = %67 to %37 step %c128 iter_args(%arg27 = %cst_1, %arg28 = %cst_1, %arg29 = %74, %arg30 = %75, %arg31 = %76) -> (tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) { - %84 = arith.index_cast %arg26 : index to i32 - %85 = tt.splat %84 : (i32) -> tensor<128xi32, #blocked0> - %86 = tt.splat %84 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %87 = arith.addi %85, %14 : tensor<128xi32, #blocked0> - %88 = tt.load %arg30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %89 = triton_gpu.convert_layout %88 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %90 = tt.trans %60 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %91 = triton_gpu.convert_layout %89 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %92 = triton_gpu.convert_layout %90 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %93 = tt.dot %91, %92, %cst {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %94 = arith.addi %86, %18 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %95 = tt.expand_dims %94 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xi32, #mma0> - %96 = tt.broadcast %95 : (tensor<128x1xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %97 = "triton_gpu.cmpi"(%96, %70) {predicate = 5 : i64} : (tensor<128x128xi32, #mma0>, tensor<128x128xi32, #mma0>) -> tensor<128x128xi1, #mma0> - %98 = "triton_gpu.select"(%97, %93, %cst_0) : (tensor<128x128xi1, #mma0>, tensor<128x128xf32, #mma0>, tensor<128x128xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %99 = tt.addptr %38, %87 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %100 = tt.load %99 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %101 = triton_gpu.convert_layout %100 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %102 = arith.mulf %98, %39 : tensor<128x128xf32, #mma0> - %103 = tt.expand_dims %101 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %104 = tt.broadcast %103 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %105 = arith.subf %102, %104 : tensor<128x128xf32, #mma0> - %106 = math.exp %105 : tensor<128x128xf32, #mma0> - %107 = tt.load %arg31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %108 = triton_gpu.convert_layout %107 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %109 = arith.truncf %106 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %110 = triton_gpu.convert_layout %109 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %111 = tt.trans %110 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %112 = triton_gpu.convert_layout %111 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %113 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %114 = tt.dot %112, %113, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %115 = tt.addptr %40, %87 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %116 = tt.load %115 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %117 = triton_gpu.convert_layout %116 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %118 = tt.expand_dims %117 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %119 = tt.broadcast %118 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %120 = arith.subf %cst, %119 : tensor<128x128xf32, #mma0> - %121 = tt.trans %66 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %122 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %123 = triton_gpu.convert_layout %121 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %124 = tt.dot %122, %123, %120 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %125 = arith.mulf %106, %124 : tensor<128x128xf32, #mma0> - %126 = arith.mulf %125, %39 : tensor<128x128xf32, #mma0> - %127 = arith.truncf %126 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %128 = triton_gpu.convert_layout %127 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %129 = tt.trans %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %130 = triton_gpu.convert_layout %129 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %131 = triton_gpu.convert_layout %89 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %132 = tt.dot %130, %131, %arg28 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %133 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf32, #blocked2> - %134 = triton_gpu.convert_layout %133 : (tensor<128x64xf32, #blocked2>) -> tensor<128x64xf32, #mma1> - %135 = triton_gpu.convert_layout %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %136 = triton_gpu.convert_layout %60 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %137 = tt.dot %135, %136, %134 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %138 = triton_gpu.convert_layout %137 : (tensor<128x64xf32, #mma1>) -> tensor<128x64xf32, #blocked2> - tt.store %arg29, %138 : tensor<128x64xf32, #blocked2> - %139 = tt.addptr %arg29, %43 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %140 = tt.addptr %arg30, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %141 = tt.addptr %arg31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %114, %132, %139, %140, %141 : tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1> - } - %78 = arith.truncf %77#0 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %79 = tt.addptr %44, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %80 = triton_gpu.convert_layout %78 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %79, %80 : tensor<128x64xf16, #blocked1> - %81 = arith.truncf %77#1 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %82 = tt.addptr %45, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %83 = triton_gpu.convert_layout %81 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %82, %83 : tensor<128x64xf16, #blocked1> - } - return - } -} \ No newline at end of file diff --git a/python/bwd.ptx b/python/bwd.ptx deleted file mode 100644 index dc40696f9..000000000 --- a/python/bwd.ptx +++ /dev/null @@ -1,2764 +0,0 @@ -// -// Generated by LLVM NVPTX Back-End -// - -.version 7.4 -.target sm_86 -.address_size 64 - - // .globl _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27 -.extern .shared .align 1 .b8 global_smem[]; - -.visible .entry _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27( - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2, - .param .f32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_4, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_9, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10, - .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_15, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_16, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_18, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_19, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_20, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_21, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23, - .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24 -) -.maxntid 256, 1, 1 -{ - .reg .pred %p<111>; - .reg .b16 %h<193>; - .reg .b32 %r<6177>; - .reg .b32 %hh<65>; - .reg .f32 %f<973>; - .reg .b64 %rd<139>; - - ld.param.u32 %r380, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24]; - setp.lt.s32 %p1, %r380, 1; - @%p1 bra LBB0_6; - ld.param.u32 %r379, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17]; - ld.param.u32 %r378, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14]; - ld.param.u64 %rd55, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5]; - ld.param.f32 %f195, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3]; - ld.param.u64 %rd54, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0]; - mov.u32 %r1, %tid.x; - ld.param.u64 %rd56, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1]; - ld.param.u64 %rd57, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2]; - bfe.u32 %r2, %r1, 5, 2; - and.b32 %r3, %r1, 127; - bfe.u32 %r4, %r1, 3, 2; - ld.param.u64 %rd58, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6]; - shr.u32 %r381, %r1, 3; - ld.param.u64 %rd59, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7]; - and.b32 %r382, %r381, 124; - ld.param.u64 %rd60, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8]; - or.b32 %r5, %r382, %r4; - add.s32 %r6, %r5, 32; - ld.param.u64 %rd61, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10]; - add.s32 %r7, %r5, 64; - ld.param.u64 %rd62, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11]; - add.s32 %r8, %r5, 96; - ld.param.u32 %r383, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12]; - shl.b32 %r384, %r1, 1; - ld.param.u32 %r385, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13]; - and.b32 %r9, %r384, 6; - bfe.u32 %r10, %r1, 4, 1; - shr.u32 %r386, %r1, 4; - and.b32 %r387, %r386, 126; - or.b32 %r11, %r387, %r10; - add.s32 %r12, %r11, 16; - add.s32 %r13, %r11, 32; - add.s32 %r14, %r11, 48; - shr.u32 %r388, %r1, 1; - ld.param.u32 %r389, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22]; - and.b32 %r15, %r388, 112; - ld.param.u32 %r390, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23]; - bfe.u32 %r16, %r1, 2, 3; - or.b32 %r17, %r15, %r16; - or.b32 %r18, %r17, 8; - and.b32 %r19, %r1, 7; - shl.b32 %r20, %r19, 3; - shl.b32 %r391, %r1, 2; - and.b32 %r21, %r391, 60; - mov.u32 %r392, %ctaid.x; - div.s32 %r395, %r392, %r389; - mul.lo.s32 %r396, %r395, %r389; - sub.s32 %r397, %r392, %r396; - mul.lo.s32 %r398, %r395, %r383; - mad.lo.s32 %r399, %r397, %r385, %r398; - cvt.s64.s32 %rd1, %r399; - mul.wide.s32 %rd63, %r399, 2; - add.s64 %rd2, %rd56, %rd63; - add.s64 %rd3, %rd57, %rd63; - mul.wide.s32 %rd64, %r399, 4; - add.s64 %rd4, %rd58, %rd64; - add.s64 %rd5, %rd59, %rd63; - add.s64 %rd6, %rd60, %rd63; - mul.lo.s32 %r400, %r392, %r390; - mul.wide.s32 %rd65, %r400, 4; - add.s64 %rd7, %rd62, %rd65; - add.s64 %rd8, %rd61, %rd65; - shl.b32 %r22, %r380, 7; - shl.b32 %r23, %r378, 7; - and.b32 %r402, %r5, 7; - xor.b32 %r403, %r402, %r19; - shl.b32 %r404, %r5, 7; - shl.b32 %r405, %r403, 4; - or.b32 %r406, %r405, %r404; - mov.u32 %r407, global_smem; - add.s32 %r24, %r407, %r406; - shl.b32 %r408, %r6, 7; - or.b32 %r409, %r408, %r405; - add.s32 %r25, %r407, %r409; - shl.b32 %r410, %r7, 7; - or.b32 %r411, %r410, %r405; - add.s32 %r26, %r407, %r411; - shl.b32 %r412, %r8, 7; - or.b32 %r413, %r412, %r405; - add.s32 %r27, %r407, %r413; - add.s32 %r414, %r407, 16384; - add.s32 %r28, %r414, %r406; - add.s32 %r29, %r414, %r409; - add.s32 %r30, %r414, %r411; - add.s32 %r31, %r414, %r413; - add.s32 %r415, %r407, 32768; - add.s32 %r32, %r415, %r406; - add.s32 %r33, %r415, %r409; - add.s32 %r34, %r415, %r411; - add.s32 %r35, %r415, %r413; - and.b32 %r416, %r4, 1; - shl.b32 %r417, %r416, 3; - or.b32 %r418, %r417, %r15; - or.b32 %r419, %r418, %r19; - xor.b32 %r420, %r10, %r19; - shl.b32 %r421, %r419, 6; - shl.b32 %r422, %r420, 3; - or.b32 %r423, %r421, %r422; - shl.b32 %r424, %r423, 1; - add.s32 %r704, %r415, %r424; - or.b32 %r425, %r10, 2; - xor.b32 %r426, %r425, %r19; - shl.b32 %r427, %r426, 3; - or.b32 %r428, %r421, %r427; - shl.b32 %r429, %r428, 1; - add.s32 %r709, %r415, %r429; - or.b32 %r430, %r10, 4; - xor.b32 %r431, %r430, %r19; - shl.b32 %r432, %r431, 3; - or.b32 %r433, %r421, %r432; - shl.b32 %r434, %r433, 1; - add.s32 %r714, %r415, %r434; - or.b32 %r435, %r10, 6; - xor.b32 %r436, %r435, %r19; - shl.b32 %r437, %r436, 3; - or.b32 %r438, %r421, %r437; - shl.b32 %r439, %r438, 1; - add.s32 %r719, %r415, %r439; - shl.b32 %r440, %r10, 3; - or.b32 %r441, %r440, %r19; - xor.b32 %r442, %r416, %r19; - shl.b32 %r443, %r442, 4; - shl.b32 %r444, %r441, 7; - or.b32 %r445, %r443, %r444; - add.s32 %r724, %r407, %r445; - or.b32 %r446, %r4, 2; - xor.b32 %r447, %r446, %r19; - shl.b32 %r448, %r447, 4; - or.b32 %r449, %r448, %r444; - add.s32 %r729, %r407, %r449; - or.b32 %r450, %r416, 4; - xor.b32 %r451, %r450, %r19; - shl.b32 %r452, %r451, 4; - or.b32 %r453, %r452, %r444; - add.s32 %r734, %r407, %r453; - or.b32 %r454, %r4, 6; - xor.b32 %r455, %r454, %r19; - shl.b32 %r456, %r455, 4; - or.b32 %r457, %r456, %r444; - add.s32 %r739, %r407, %r457; - add.s32 %r744, %r724, 2048; - add.s32 %r749, %r729, 2048; - add.s32 %r754, %r734, 2048; - add.s32 %r759, %r739, 2048; - add.s32 %r764, %r724, 4096; - add.s32 %r769, %r729, 4096; - add.s32 %r774, %r734, 4096; - add.s32 %r779, %r739, 4096; - add.s32 %r784, %r724, 6144; - add.s32 %r789, %r729, 6144; - add.s32 %r794, %r734, 6144; - add.s32 %r799, %r739, 6144; - add.s32 %r804, %r724, 8192; - add.s32 %r809, %r729, 8192; - add.s32 %r814, %r734, 8192; - add.s32 %r819, %r739, 8192; - add.s32 %r824, %r724, 10240; - add.s32 %r829, %r729, 10240; - add.s32 %r834, %r734, 10240; - add.s32 %r839, %r739, 10240; - add.s32 %r844, %r724, 12288; - add.s32 %r849, %r729, 12288; - add.s32 %r854, %r734, 12288; - add.s32 %r859, %r739, 12288; - add.s32 %r864, %r724, 14336; - add.s32 %r869, %r729, 14336; - add.s32 %r874, %r734, 14336; - add.s32 %r879, %r739, 14336; - shl.b32 %r458, %r3, 2; - add.s32 %r459, %r407, 49152; - add.s32 %r72, %r459, %r458; - shl.b32 %r460, %r17, 2; - add.s32 %r73, %r459, %r460; - add.s32 %r461, %r16, %r15; - shl.b32 %r462, %r461, 2; - add.s32 %r74, %r459, %r462; - add.s32 %r75, %r459, %r406; - add.s32 %r76, %r459, %r409; - add.s32 %r77, %r459, %r411; - add.s32 %r78, %r459, %r413; - shl.b32 %r463, %r17, 7; - shl.b32 %r464, %r16, 3; - or.b32 %r465, %r464, %r9; - or.b32 %r466, %r463, %r465; - shl.b32 %r467, %r466, 1; - add.s32 %r468, %r407, 65536; - add.s32 %r79, %r468, %r467; - shl.b32 %r469, %r18, 7; - or.b32 %r470, %r469, %r465; - shl.b32 %r471, %r470, 1; - add.s32 %r80, %r468, %r471; - xor.b32 %r472, %r466, 8; - shl.b32 %r473, %r472, 1; - add.s32 %r81, %r468, %r473; - xor.b32 %r474, %r470, 8; - shl.b32 %r475, %r474, 1; - add.s32 %r82, %r468, %r475; - xor.b32 %r476, %r466, 16; - shl.b32 %r477, %r476, 1; - add.s32 %r83, %r468, %r477; - xor.b32 %r478, %r470, 16; - shl.b32 %r479, %r478, 1; - add.s32 %r84, %r468, %r479; - xor.b32 %r480, %r466, 24; - shl.b32 %r481, %r480, 1; - add.s32 %r85, %r468, %r481; - xor.b32 %r482, %r470, 24; - shl.b32 %r483, %r482, 1; - add.s32 %r86, %r468, %r483; - xor.b32 %r484, %r466, 32; - shl.b32 %r485, %r484, 1; - add.s32 %r87, %r468, %r485; - xor.b32 %r486, %r470, 32; - shl.b32 %r487, %r486, 1; - add.s32 %r88, %r468, %r487; - xor.b32 %r488, %r466, 40; - shl.b32 %r489, %r488, 1; - add.s32 %r89, %r468, %r489; - xor.b32 %r490, %r470, 40; - shl.b32 %r491, %r490, 1; - add.s32 %r90, %r468, %r491; - xor.b32 %r492, %r466, 48; - shl.b32 %r493, %r492, 1; - add.s32 %r91, %r468, %r493; - xor.b32 %r494, %r470, 48; - shl.b32 %r495, %r494, 1; - add.s32 %r92, %r468, %r495; - xor.b32 %r496, %r466, 56; - shl.b32 %r497, %r496, 1; - add.s32 %r93, %r468, %r497; - xor.b32 %r498, %r470, 56; - shl.b32 %r499, %r498, 1; - add.s32 %r94, %r468, %r499; - xor.b32 %r500, %r466, 72; - shl.b32 %r501, %r500, 1; - add.s32 %r97, %r468, %r501; - xor.b32 %r502, %r470, 72; - shl.b32 %r503, %r502, 1; - add.s32 %r98, %r468, %r503; - xor.b32 %r504, %r466, 80; - shl.b32 %r505, %r504, 1; - add.s32 %r99, %r468, %r505; - xor.b32 %r506, %r470, 80; - shl.b32 %r507, %r506, 1; - add.s32 %r100, %r468, %r507; - xor.b32 %r508, %r466, 88; - shl.b32 %r509, %r508, 1; - add.s32 %r101, %r468, %r509; - xor.b32 %r510, %r470, 88; - shl.b32 %r511, %r510, 1; - add.s32 %r102, %r468, %r511; - xor.b32 %r512, %r466, 96; - shl.b32 %r513, %r512, 1; - add.s32 %r103, %r468, %r513; - xor.b32 %r514, %r470, 96; - shl.b32 %r515, %r514, 1; - add.s32 %r104, %r468, %r515; - xor.b32 %r516, %r466, 104; - shl.b32 %r517, %r516, 1; - add.s32 %r105, %r468, %r517; - xor.b32 %r518, %r470, 104; - shl.b32 %r519, %r518, 1; - add.s32 %r106, %r468, %r519; - xor.b32 %r520, %r466, 112; - shl.b32 %r521, %r520, 1; - add.s32 %r107, %r468, %r521; - xor.b32 %r522, %r470, 112; - shl.b32 %r523, %r522, 1; - add.s32 %r108, %r468, %r523; - xor.b32 %r524, %r466, 120; - shl.b32 %r525, %r524, 1; - add.s32 %r109, %r468, %r525; - xor.b32 %r526, %r470, 120; - shl.b32 %r527, %r526, 1; - add.s32 %r110, %r468, %r527; - shl.b32 %r528, %r2, 1; - or.b32 %r529, %r528, %r416; - xor.b32 %r530, %r529, %r19; - shl.b32 %r531, %r530, 4; - shl.b32 %r532, %r441, 8; - or.b32 %r533, %r531, %r532; - add.s32 %r1797, %r468, %r533; - add.s32 %r1802, %r1797, 4096; - add.s32 %r1807, %r1797, 8192; - add.s32 %r1812, %r1797, 12288; - add.s32 %r1817, %r1797, 16384; - add.s32 %r1822, %r1797, 20480; - add.s32 %r1827, %r1797, 24576; - add.s32 %r1832, %r1797, 28672; - or.b32 %r534, %r529, 8; - xor.b32 %r535, %r534, %r19; - shl.b32 %r536, %r535, 4; - or.b32 %r537, %r536, %r532; - add.s32 %r1837, %r468, %r537; - add.s32 %r1842, %r1837, 4096; - add.s32 %r1847, %r1837, 8192; - add.s32 %r1852, %r1837, 12288; - add.s32 %r1857, %r1837, 16384; - add.s32 %r1862, %r1837, 20480; - add.s32 %r1867, %r1837, 24576; - add.s32 %r1872, %r1837, 28672; - bfe.u32 %r538, %r1, 7, 1; - shl.b32 %r539, %r10, 1; - or.b32 %r540, %r539, %r538; - xor.b32 %r541, %r540, %r19; - shl.b32 %r542, %r416, 9; - shl.b32 %r543, %r19, 6; - or.b32 %r544, %r542, %r543; - shl.b32 %r545, %r541, 4; - shl.b32 %r546, %r544, 1; - or.b32 %r547, %r545, %r546; - add.s32 %r1877, %r459, %r547; - add.s32 %r1882, %r1877, 2048; - add.s32 %r1887, %r1877, 4096; - add.s32 %r1892, %r1877, 6144; - add.s32 %r1897, %r1877, 8192; - add.s32 %r1902, %r1877, 10240; - add.s32 %r1907, %r1877, 12288; - add.s32 %r1912, %r1877, 14336; - or.b32 %r548, %r540, 4; - xor.b32 %r549, %r548, %r19; - shl.b32 %r550, %r549, 4; - or.b32 %r551, %r550, %r546; - add.s32 %r1917, %r459, %r551; - add.s32 %r1922, %r1917, 2048; - add.s32 %r1927, %r1917, 4096; - add.s32 %r1932, %r1917, 6144; - add.s32 %r1937, %r1917, 8192; - add.s32 %r1942, %r1917, 10240; - add.s32 %r1947, %r1917, 12288; - add.s32 %r1952, %r1917, 14336; - add.s32 %r143, %r468, %r458; - add.s32 %r144, %r468, %r460; - add.s32 %r145, %r468, %r462; - add.s32 %r2854, %r459, %r424; - add.s32 %r2859, %r459, %r429; - add.s32 %r2864, %r459, %r434; - add.s32 %r2869, %r459, %r439; - add.s32 %r2874, %r414, %r445; - add.s32 %r2879, %r414, %r449; - add.s32 %r2884, %r414, %r453; - add.s32 %r2889, %r414, %r457; - add.s32 %r2894, %r2874, 2048; - add.s32 %r2899, %r2879, 2048; - add.s32 %r2904, %r2884, 2048; - add.s32 %r2909, %r2889, 2048; - add.s32 %r2914, %r2874, 4096; - add.s32 %r2919, %r2879, 4096; - add.s32 %r2924, %r2884, 4096; - add.s32 %r2929, %r2889, 4096; - add.s32 %r2934, %r2874, 6144; - add.s32 %r2939, %r2879, 6144; - add.s32 %r2944, %r2884, 6144; - add.s32 %r2949, %r2889, 6144; - add.s32 %r2954, %r2874, 8192; - add.s32 %r2959, %r2879, 8192; - add.s32 %r2964, %r2884, 8192; - add.s32 %r2969, %r2889, 8192; - add.s32 %r2974, %r2874, 10240; - add.s32 %r2979, %r2879, 10240; - add.s32 %r2984, %r2884, 10240; - add.s32 %r2989, %r2889, 10240; - add.s32 %r2994, %r2874, 12288; - add.s32 %r2999, %r2879, 12288; - add.s32 %r3004, %r2884, 12288; - add.s32 %r3009, %r2889, 12288; - add.s32 %r3014, %r2874, 14336; - add.s32 %r3019, %r2879, 14336; - add.s32 %r3024, %r2884, 14336; - add.s32 %r3029, %r2889, 14336; - add.s32 %r552, %r407, 50176; - add.s32 %r182, %r552, %r467; - add.s32 %r183, %r552, %r471; - add.s32 %r184, %r552, %r473; - add.s32 %r185, %r552, %r475; - add.s32 %r186, %r552, %r477; - add.s32 %r187, %r552, %r479; - add.s32 %r188, %r552, %r481; - add.s32 %r189, %r552, %r483; - add.s32 %r190, %r552, %r485; - add.s32 %r191, %r552, %r487; - add.s32 %r192, %r552, %r489; - add.s32 %r193, %r552, %r491; - add.s32 %r194, %r552, %r493; - add.s32 %r195, %r552, %r495; - add.s32 %r196, %r552, %r497; - add.s32 %r197, %r552, %r499; - add.s32 %r200, %r552, %r501; - add.s32 %r201, %r552, %r503; - add.s32 %r202, %r552, %r505; - add.s32 %r203, %r552, %r507; - add.s32 %r204, %r552, %r509; - add.s32 %r205, %r552, %r511; - add.s32 %r206, %r552, %r513; - add.s32 %r207, %r552, %r515; - add.s32 %r208, %r552, %r517; - add.s32 %r209, %r552, %r519; - add.s32 %r210, %r552, %r521; - add.s32 %r211, %r552, %r523; - add.s32 %r212, %r552, %r525; - add.s32 %r213, %r552, %r527; - add.s32 %r3930, %r552, %r533; - add.s32 %r3935, %r3930, 4096; - add.s32 %r3940, %r3930, 8192; - add.s32 %r3945, %r3930, 12288; - add.s32 %r3950, %r3930, 16384; - add.s32 %r3955, %r3930, 20480; - add.s32 %r3960, %r3930, 24576; - add.s32 %r3965, %r3930, 28672; - add.s32 %r3970, %r552, %r537; - add.s32 %r3975, %r3970, 4096; - add.s32 %r3980, %r3970, 8192; - add.s32 %r3985, %r3970, 12288; - add.s32 %r3990, %r3970, 16384; - add.s32 %r3995, %r3970, 20480; - add.s32 %r4000, %r3970, 24576; - add.s32 %r4005, %r3970, 28672; - add.s32 %r4010, %r415, %r547; - add.s32 %r4015, %r4010, 2048; - add.s32 %r4020, %r4010, 4096; - add.s32 %r4025, %r4010, 6144; - add.s32 %r4030, %r4010, 8192; - add.s32 %r4035, %r4010, 10240; - add.s32 %r4040, %r4010, 12288; - add.s32 %r4045, %r4010, 14336; - add.s32 %r4050, %r415, %r551; - add.s32 %r4055, %r4050, 2048; - add.s32 %r4060, %r4050, 4096; - add.s32 %r4065, %r4050, 6144; - add.s32 %r4070, %r4050, 8192; - add.s32 %r4075, %r4050, 10240; - add.s32 %r4080, %r4050, 12288; - add.s32 %r4085, %r4050, 14336; - mad.lo.s32 %r553, %r11, 68, %r21; - shl.b32 %r554, %r553, 2; - add.s32 %r246, %r415, %r554; - shl.b32 %r555, %r2, 4; - or.b32 %r556, %r555, %r16; - and.b32 %r558, %r386, 56; - or.b32 %r559, %r9, %r558; - mad.lo.s32 %r560, %r556, 68, %r559; - shl.b32 %r561, %r560, 2; - add.s32 %r247, %r415, %r561; - or.b32 %r562, %r556, 8; - mad.lo.s32 %r563, %r562, 68, %r559; - shl.b32 %r564, %r563, 2; - add.s32 %r248, %r415, %r564; - shl.b32 %r565, %r529, 10; - shl.b32 %r566, %r19, 7; - or.b32 %r567, %r565, %r566; - or.b32 %r568, %r567, %r422; - shl.b32 %r569, %r568, 1; - add.s32 %r5018, %r552, %r569; - or.b32 %r570, %r567, %r427; - shl.b32 %r571, %r570, 1; - add.s32 %r5023, %r552, %r571; - or.b32 %r572, %r567, %r432; - shl.b32 %r573, %r572, 1; - add.s32 %r5028, %r552, %r573; - or.b32 %r574, %r567, %r437; - shl.b32 %r575, %r574, 1; - add.s32 %r5033, %r552, %r575; - or.b32 %r576, %r10, 8; - xor.b32 %r577, %r576, %r19; - shl.b32 %r578, %r577, 4; - shl.b32 %r579, %r567, 1; - or.b32 %r580, %r578, %r579; - add.s32 %r5038, %r552, %r580; - or.b32 %r581, %r10, 10; - xor.b32 %r582, %r581, %r19; - shl.b32 %r583, %r582, 4; - or.b32 %r584, %r583, %r579; - add.s32 %r5043, %r552, %r584; - or.b32 %r585, %r10, 12; - xor.b32 %r586, %r585, %r19; - shl.b32 %r587, %r586, 4; - or.b32 %r588, %r587, %r579; - add.s32 %r5048, %r552, %r588; - or.b32 %r589, %r10, 14; - xor.b32 %r590, %r589, %r19; - shl.b32 %r591, %r590, 4; - or.b32 %r592, %r591, %r579; - add.s32 %r5053, %r552, %r592; - add.s32 %r5058, %r5018, 16384; - add.s32 %r5063, %r5023, 16384; - add.s32 %r5068, %r5028, 16384; - add.s32 %r5073, %r5033, 16384; - add.s32 %r5078, %r5038, 16384; - add.s32 %r5083, %r5043, 16384; - add.s32 %r5088, %r5048, 16384; - add.s32 %r5093, %r5053, 16384; - add.s32 %r5098, %r407, %r547; - add.s32 %r5103, %r5098, 2048; - add.s32 %r5108, %r5098, 4096; - add.s32 %r5113, %r5098, 6144; - add.s32 %r5118, %r5098, 8192; - add.s32 %r5123, %r5098, 10240; - add.s32 %r5128, %r5098, 12288; - add.s32 %r5133, %r5098, 14336; - add.s32 %r5138, %r407, %r551; - add.s32 %r5143, %r5138, 2048; - add.s32 %r5148, %r5138, 4096; - add.s32 %r5153, %r5138, 6144; - add.s32 %r5158, %r5138, 8192; - add.s32 %r5163, %r5138, 10240; - add.s32 %r5168, %r5138, 12288; - add.s32 %r5173, %r5138, 14336; - mad.lo.s32 %r593, %r556, 72, %r559; - shl.b32 %r594, %r593, 1; - add.s32 %r281, %r407, %r594; - add.s32 %r282, %r281, 1152; - mad.lo.s32 %r595, %r5, 72, %r20; - shl.b32 %r596, %r595, 1; - add.s32 %r283, %r407, %r596; - shl.b64 %rd9, %rd1, 1; - mad.lo.s32 %r6174, %r378, %r8, %r20; - mul.wide.s32 %rd10, %r23, 2; - mad.lo.s32 %r6173, %r378, %r7, %r20; - mad.lo.s32 %r6172, %r378, %r6, %r20; - mul.wide.s32 %rd11, %r23, 4; - mad.lo.s32 %r6171, %r378, %r14, %r21; - mad.lo.s32 %r6170, %r378, %r13, %r21; - mad.lo.s32 %r6169, %r378, %r12, %r21; - mov.u32 %r6175, 0; - mov.pred %p102, -1; - mov.f32 %f227, 0f00000000; - bra.uni LBB0_2; -LBB0_5: - add.s64 %rd113, %rd6, %rd129; - shl.b64 %rd122, %rd26, 1; - add.s64 %rd114, %rd6, %rd122; - shl.b64 %rd123, %rd27, 1; - add.s64 %rd115, %rd6, %rd123; - shl.b64 %rd124, %rd28, 1; - add.s64 %rd116, %rd6, %rd124; - cvt.rn.f16.f32 %h129, %f909; - cvt.rn.f16.f32 %h130, %f910; - cvt.rn.f16.f32 %h131, %f911; - cvt.rn.f16.f32 %h132, %f912; - cvt.rn.f16.f32 %h133, %f913; - cvt.rn.f16.f32 %h134, %f914; - cvt.rn.f16.f32 %h135, %f915; - cvt.rn.f16.f32 %h136, %f916; - cvt.rn.f16.f32 %h137, %f917; - cvt.rn.f16.f32 %h138, %f918; - cvt.rn.f16.f32 %h139, %f919; - cvt.rn.f16.f32 %h140, %f920; - cvt.rn.f16.f32 %h141, %f921; - cvt.rn.f16.f32 %h142, %f922; - cvt.rn.f16.f32 %h143, %f923; - cvt.rn.f16.f32 %h144, %f924; - cvt.rn.f16.f32 %h145, %f925; - cvt.rn.f16.f32 %h146, %f926; - cvt.rn.f16.f32 %h147, %f927; - cvt.rn.f16.f32 %h148, %f928; - cvt.rn.f16.f32 %h149, %f929; - cvt.rn.f16.f32 %h150, %f930; - cvt.rn.f16.f32 %h151, %f931; - cvt.rn.f16.f32 %h152, %f932; - cvt.rn.f16.f32 %h153, %f933; - cvt.rn.f16.f32 %h154, %f934; - cvt.rn.f16.f32 %h155, %f935; - cvt.rn.f16.f32 %h156, %f936; - cvt.rn.f16.f32 %h157, %f937; - cvt.rn.f16.f32 %h158, %f938; - cvt.rn.f16.f32 %h159, %f939; - cvt.rn.f16.f32 %h160, %f940; - st.shared.v2.b16 [%r281], {%h129, %h130}; - st.shared.v2.b16 [%r282], {%h131, %h132}; - st.shared.v2.b16 [%r281+32], {%h133, %h134}; - st.shared.v2.b16 [%r282+32], {%h135, %h136}; - st.shared.v2.b16 [%r281+64], {%h137, %h138}; - st.shared.v2.b16 [%r282+64], {%h139, %h140}; - st.shared.v2.b16 [%r281+96], {%h141, %h142}; - st.shared.v2.b16 [%r282+96], {%h143, %h144}; - bar.sync 0; - ld.shared.v4.u32 {%r6137, %r6138, %r6139, %r6140}, [%r283]; - ld.shared.v4.u32 {%r6141, %r6142, %r6143, %r6144}, [%r283+4608]; - bar.sync 0; - st.shared.v2.b16 [%r281], {%h145, %h146}; - st.shared.v2.b16 [%r282], {%h147, %h148}; - st.shared.v2.b16 [%r281+32], {%h149, %h150}; - st.shared.v2.b16 [%r282+32], {%h151, %h152}; - st.shared.v2.b16 [%r281+64], {%h153, %h154}; - st.shared.v2.b16 [%r282+64], {%h155, %h156}; - st.shared.v2.b16 [%r281+96], {%h157, %h158}; - st.shared.v2.b16 [%r282+96], {%h159, %h160}; - bar.sync 0; - ld.shared.v4.u32 {%r6145, %r6146, %r6147, %r6148}, [%r283]; - ld.shared.v4.u32 {%r6149, %r6150, %r6151, %r6152}, [%r283+4608]; - @%p102 st.global.v4.b32 [ %rd113 + 0 ], { %r6137, %r6138, %r6139, %r6140 }; - @%p102 st.global.v4.b32 [ %rd114 + 0 ], { %r6141, %r6142, %r6143, %r6144 }; - @%p102 st.global.v4.b32 [ %rd115 + 0 ], { %r6145, %r6146, %r6147, %r6148 }; - @%p102 st.global.v4.b32 [ %rd116 + 0 ], { %r6149, %r6150, %r6151, %r6152 }; - shl.b64 %rd125, %rd21, 1; - add.s64 %rd117, %rd5, %rd125; - shl.b64 %rd126, %rd22, 1; - add.s64 %rd118, %rd5, %rd126; - shl.b64 %rd127, %rd23, 1; - add.s64 %rd119, %rd5, %rd127; - shl.b64 %rd128, %rd24, 1; - add.s64 %rd120, %rd5, %rd128; - cvt.rn.f16.f32 %h161, %f941; - cvt.rn.f16.f32 %h162, %f942; - cvt.rn.f16.f32 %h163, %f943; - cvt.rn.f16.f32 %h164, %f944; - cvt.rn.f16.f32 %h165, %f945; - cvt.rn.f16.f32 %h166, %f946; - cvt.rn.f16.f32 %h167, %f947; - cvt.rn.f16.f32 %h168, %f948; - cvt.rn.f16.f32 %h169, %f949; - cvt.rn.f16.f32 %h170, %f950; - cvt.rn.f16.f32 %h171, %f951; - cvt.rn.f16.f32 %h172, %f952; - cvt.rn.f16.f32 %h173, %f953; - cvt.rn.f16.f32 %h174, %f954; - cvt.rn.f16.f32 %h175, %f955; - cvt.rn.f16.f32 %h176, %f956; - cvt.rn.f16.f32 %h177, %f957; - cvt.rn.f16.f32 %h178, %f958; - cvt.rn.f16.f32 %h179, %f959; - cvt.rn.f16.f32 %h180, %f960; - cvt.rn.f16.f32 %h181, %f961; - cvt.rn.f16.f32 %h182, %f962; - cvt.rn.f16.f32 %h183, %f963; - cvt.rn.f16.f32 %h184, %f964; - cvt.rn.f16.f32 %h185, %f965; - cvt.rn.f16.f32 %h186, %f966; - cvt.rn.f16.f32 %h187, %f967; - cvt.rn.f16.f32 %h188, %f968; - cvt.rn.f16.f32 %h189, %f969; - cvt.rn.f16.f32 %h190, %f970; - cvt.rn.f16.f32 %h191, %f971; - cvt.rn.f16.f32 %h192, %f972; - bar.sync 0; - st.shared.v2.b16 [%r281], {%h161, %h162}; - st.shared.v2.b16 [%r282], {%h163, %h164}; - st.shared.v2.b16 [%r281+32], {%h165, %h166}; - st.shared.v2.b16 [%r282+32], {%h167, %h168}; - st.shared.v2.b16 [%r281+64], {%h169, %h170}; - st.shared.v2.b16 [%r282+64], {%h171, %h172}; - st.shared.v2.b16 [%r281+96], {%h173, %h174}; - st.shared.v2.b16 [%r282+96], {%h175, %h176}; - bar.sync 0; - ld.shared.v4.u32 {%r6153, %r6154, %r6155, %r6156}, [%r283]; - ld.shared.v4.u32 {%r6157, %r6158, %r6159, %r6160}, [%r283+4608]; - bar.sync 0; - st.shared.v2.b16 [%r281], {%h177, %h178}; - st.shared.v2.b16 [%r282], {%h179, %h180}; - st.shared.v2.b16 [%r281+32], {%h181, %h182}; - st.shared.v2.b16 [%r282+32], {%h183, %h184}; - st.shared.v2.b16 [%r281+64], {%h185, %h186}; - st.shared.v2.b16 [%r282+64], {%h187, %h188}; - st.shared.v2.b16 [%r281+96], {%h189, %h190}; - st.shared.v2.b16 [%r282+96], {%h191, %h192}; - bar.sync 0; - ld.shared.v4.u32 {%r6161, %r6162, %r6163, %r6164}, [%r283]; - ld.shared.v4.u32 {%r6165, %r6166, %r6167, %r6168}, [%r283+4608]; - @%p102 st.global.v4.b32 [ %rd117 + 0 ], { %r6153, %r6154, %r6155, %r6156 }; - @%p102 st.global.v4.b32 [ %rd118 + 0 ], { %r6157, %r6158, %r6159, %r6160 }; - @%p102 st.global.v4.b32 [ %rd119 + 0 ], { %r6161, %r6162, %r6163, %r6164 }; - @%p102 st.global.v4.b32 [ %rd120 + 0 ], { %r6165, %r6166, %r6167, %r6168 }; - add.s32 %r6175, %r6175, 1; - add.s32 %r6174, %r6174, %r23; - add.s32 %r6173, %r6173, %r23; - add.s32 %r6172, %r6172, %r23; - add.s32 %r6171, %r6171, %r23; - add.s32 %r6170, %r6170, %r23; - add.s32 %r6169, %r6169, %r23; - setp.lt.s32 %p110, %r6175, %r380; - @%p110 bra LBB0_2; - bra.uni LBB0_6; -LBB0_2: - shl.b32 %r6176, %r6175, 7; - or.b32 %r629, %r6176, %r5; - add.s32 %r630, %r6176, %r6; - add.s32 %r631, %r6176, %r7; - add.s32 %r632, %r6176, %r8; - mad.lo.s32 %r633, %r629, %r379, %r20; - mad.lo.s32 %r634, %r630, %r379, %r20; - mad.lo.s32 %r635, %r631, %r379, %r20; - mad.lo.s32 %r636, %r632, %r379, %r20; - cvt.s64.s32 %rd21, %r633; - mul.wide.s32 %rd77, %r633, 2; - add.s64 %rd66, %rd2, %rd77; - cvt.s64.s32 %rd22, %r634; - mul.wide.s32 %rd78, %r634, 2; - add.s64 %rd67, %rd2, %rd78; - cvt.s64.s32 %rd23, %r635; - mul.wide.s32 %rd79, %r635, 2; - add.s64 %rd68, %rd2, %rd79; - cvt.s64.s32 %rd24, %r636; - mul.wide.s32 %rd80, %r636, 2; - add.s64 %rd69, %rd2, %rd80; - @%p102 ld.global.v4.b32 { %r641, %r642, %r643, %r644 }, [ %rd66 + 0 ]; - mov.b32 %hh1, %r641; - mov.b32 %hh2, %r642; - mov.b32 %hh3, %r643; - mov.b32 %hh4, %r644; - @%p102 ld.global.v4.b32 { %r645, %r646, %r647, %r648 }, [ %rd67 + 0 ]; - mov.b32 %hh5, %r645; - mov.b32 %hh6, %r646; - mov.b32 %hh7, %r647; - mov.b32 %hh8, %r648; - @%p102 ld.global.v4.b32 { %r649, %r650, %r651, %r652 }, [ %rd68 + 0 ]; - mov.b32 %hh9, %r649; - mov.b32 %hh10, %r650; - mov.b32 %hh11, %r651; - mov.b32 %hh12, %r652; - @%p102 ld.global.v4.b32 { %r653, %r654, %r655, %r656 }, [ %rd69 + 0 ]; - mov.b32 %hh13, %r653; - mov.b32 %hh14, %r654; - mov.b32 %hh15, %r655; - mov.b32 %hh16, %r656; - mad.lo.s32 %r637, %r629, %r378, %r20; - mad.lo.s32 %r638, %r630, %r378, %r20; - mad.lo.s32 %r639, %r631, %r378, %r20; - mad.lo.s32 %r640, %r632, %r378, %r20; - cvt.s64.s32 %rd25, %r637; - mul.wide.s32 %rd81, %r637, 2; - add.s64 %rd70, %rd3, %rd81; - cvt.s64.s32 %rd26, %r638; - mul.wide.s32 %rd82, %r638, 2; - add.s64 %rd71, %rd3, %rd82; - cvt.s64.s32 %rd27, %r639; - mul.wide.s32 %rd83, %r639, 2; - add.s64 %rd72, %rd3, %rd83; - cvt.s64.s32 %rd28, %r640; - mul.wide.s32 %rd84, %r640, 2; - add.s64 %rd73, %rd3, %rd84; - @%p102 ld.global.v4.b32 { %r657, %r658, %r659, %r660 }, [ %rd70 + 0 ]; - mov.b32 %hh17, %r657; - mov.b32 %hh18, %r658; - mov.b32 %hh19, %r659; - mov.b32 %hh20, %r660; - @%p102 ld.global.v4.b32 { %r661, %r662, %r663, %r664 }, [ %rd71 + 0 ]; - mov.b32 %hh21, %r661; - mov.b32 %hh22, %r662; - mov.b32 %hh23, %r663; - mov.b32 %hh24, %r664; - @%p102 ld.global.v4.b32 { %r665, %r666, %r667, %r668 }, [ %rd72 + 0 ]; - mov.b32 %hh25, %r665; - mov.b32 %hh26, %r666; - mov.b32 %hh27, %r667; - mov.b32 %hh28, %r668; - @%p102 ld.global.v4.b32 { %r669, %r670, %r671, %r672 }, [ %rd73 + 0 ]; - mov.b32 %hh29, %r669; - mov.b32 %hh30, %r670; - mov.b32 %hh31, %r671; - mov.b32 %hh32, %r672; - bar.sync 0; - st.shared.v4.b32 [%r24], {%r641, %r642, %r643, %r644}; - st.shared.v4.b32 [%r25], {%r645, %r646, %r647, %r648}; - st.shared.v4.b32 [%r26], {%r649, %r650, %r651, %r652}; - st.shared.v4.b32 [%r27], {%r653, %r654, %r655, %r656}; - bar.sync 0; - st.shared.v4.b32 [%r28], {%r657, %r658, %r659, %r660}; - st.shared.v4.b32 [%r29], {%r661, %r662, %r663, %r664}; - st.shared.v4.b32 [%r30], {%r665, %r666, %r667, %r668}; - st.shared.v4.b32 [%r31], {%r669, %r670, %r671, %r672}; - bar.sync 0; - setp.ge.s32 %p10, %r6176, %r22; - shl.b64 %rd129, %rd25, 1; - mov.f32 %f909, %f227; - mov.f32 %f910, %f227; - mov.f32 %f911, %f227; - mov.f32 %f912, %f227; - mov.f32 %f913, %f227; - mov.f32 %f914, %f227; - mov.f32 %f915, %f227; - mov.f32 %f916, %f227; - mov.f32 %f917, %f227; - mov.f32 %f918, %f227; - mov.f32 %f919, %f227; - mov.f32 %f920, %f227; - mov.f32 %f921, %f227; - mov.f32 %f922, %f227; - mov.f32 %f923, %f227; - mov.f32 %f924, %f227; - mov.f32 %f925, %f227; - mov.f32 %f926, %f227; - mov.f32 %f927, %f227; - mov.f32 %f928, %f227; - mov.f32 %f929, %f227; - mov.f32 %f930, %f227; - mov.f32 %f931, %f227; - mov.f32 %f932, %f227; - mov.f32 %f933, %f227; - mov.f32 %f934, %f227; - mov.f32 %f935, %f227; - mov.f32 %f936, %f227; - mov.f32 %f937, %f227; - mov.f32 %f938, %f227; - mov.f32 %f939, %f227; - mov.f32 %f940, %f227; - mov.f32 %f941, %f227; - mov.f32 %f942, %f227; - mov.f32 %f943, %f227; - mov.f32 %f944, %f227; - mov.f32 %f945, %f227; - mov.f32 %f946, %f227; - mov.f32 %f947, %f227; - mov.f32 %f948, %f227; - mov.f32 %f949, %f227; - mov.f32 %f950, %f227; - mov.f32 %f951, %f227; - mov.f32 %f952, %f227; - mov.f32 %f953, %f227; - mov.f32 %f954, %f227; - mov.f32 %f955, %f227; - mov.f32 %f956, %f227; - mov.f32 %f957, %f227; - mov.f32 %f958, %f227; - mov.f32 %f959, %f227; - mov.f32 %f960, %f227; - mov.f32 %f961, %f227; - mov.f32 %f962, %f227; - mov.f32 %f963, %f227; - mov.f32 %f964, %f227; - mov.f32 %f965, %f227; - mov.f32 %f966, %f227; - mov.f32 %f967, %f227; - mov.f32 %f968, %f227; - mov.f32 %f969, %f227; - mov.f32 %f970, %f227; - mov.f32 %f971, %f227; - mov.f32 %f972, %f227; - @%p10 bra LBB0_5; - mul.wide.s32 %rd74, %r6174, 2; - add.s64 %rd138, %rd55, %rd74; - mul.wide.s32 %rd75, %r6173, 2; - add.s64 %rd137, %rd55, %rd75; - mul.wide.s32 %rd76, %r6172, 2; - add.s64 %rd136, %rd55, %rd76; - add.s64 %rd134, %rd54, %rd74; - add.s64 %rd133, %rd54, %rd75; - add.s64 %rd132, %rd54, %rd76; - mul.wide.s32 %rd18, %r6171, 4; - mul.wide.s32 %rd19, %r6170, 4; - mul.wide.s32 %rd20, %r6169, 4; - or.b32 %r300, %r6176, %r11; - or.b32 %r301, %r6176, %r9; - or.b32 %r302, %r301, 1; - or.b32 %r303, %r301, 8; - or.b32 %r304, %r301, 9; - or.b32 %r336, %r301, 64; - or.b32 %r335, %r301, 65; - or.b32 %r332, %r301, 72; - or.b32 %r331, %r301, 73; - or.b32 %r328, %r301, 80; - or.b32 %r327, %r301, 81; - or.b32 %r324, %r301, 88; - or.b32 %r323, %r301, 89; - or.b32 %r320, %r301, 96; - or.b32 %r319, %r301, 97; - or.b32 %r316, %r301, 104; - or.b32 %r315, %r301, 105; - or.b32 %r312, %r301, 112; - or.b32 %r311, %r301, 113; - or.b32 %r308, %r301, 120; - or.b32 %r307, %r301, 121; - or.b32 %r352, %r301, 32; - or.b32 %r351, %r301, 33; - or.b32 %r348, %r301, 40; - or.b32 %r347, %r301, 41; - or.b32 %r344, %r301, 48; - or.b32 %r343, %r301, 49; - or.b32 %r340, %r301, 56; - or.b32 %r339, %r301, 57; - or.b32 %r360, %r301, 16; - or.b32 %r359, %r301, 17; - or.b32 %r356, %r301, 24; - or.b32 %r355, %r301, 25; - add.s32 %r673, %r300, 112; - mul.lo.s32 %r674, %r673, %r378; - add.s32 %r675, %r674, %r21; - shl.b32 %r676, %r378, 4; - sub.s32 %r677, %r674, %r676; - add.s32 %r678, %r677, %r21; - sub.s32 %r679, %r677, %r676; - add.s32 %r680, %r679, %r21; - sub.s32 %r681, %r679, %r676; - add.s32 %r682, %r681, %r21; - mad.lo.s32 %r683, %r300, %r378, %r21; - add.s64 %rd135, %rd55, %rd129; - add.s64 %rd131, %rd54, %rd129; - mul.wide.s32 %rd31, %r675, 4; - mul.wide.s32 %rd32, %r678, 4; - mul.wide.s32 %rd33, %r680, 4; - mul.wide.s32 %rd34, %r682, 4; - mul.wide.s32 %rd35, %r683, 4; - mov.f32 %f259, 0f00000000; - mov.u64 %rd130, %rd4; - mov.f32 %f941, %f259; - mov.f32 %f942, %f259; - mov.f32 %f943, %f259; - mov.f32 %f944, %f259; - mov.f32 %f945, %f259; - mov.f32 %f946, %f259; - mov.f32 %f947, %f259; - mov.f32 %f948, %f259; - mov.f32 %f949, %f259; - mov.f32 %f950, %f259; - mov.f32 %f951, %f259; - mov.f32 %f952, %f259; - mov.f32 %f953, %f259; - mov.f32 %f954, %f259; - mov.f32 %f955, %f259; - mov.f32 %f956, %f259; - mov.f32 %f957, %f259; - mov.f32 %f958, %f259; - mov.f32 %f959, %f259; - mov.f32 %f960, %f259; - mov.f32 %f961, %f259; - mov.f32 %f962, %f259; - mov.f32 %f963, %f259; - mov.f32 %f964, %f259; - mov.f32 %f965, %f259; - mov.f32 %f966, %f259; - mov.f32 %f967, %f259; - mov.f32 %f968, %f259; - mov.f32 %f969, %f259; - mov.f32 %f970, %f259; - mov.f32 %f971, %f259; - mov.f32 %f972, %f259; - mov.f32 %f909, %f259; - mov.f32 %f910, %f259; - mov.f32 %f911, %f259; - mov.f32 %f912, %f259; - mov.f32 %f913, %f259; - mov.f32 %f914, %f259; - mov.f32 %f915, %f259; - mov.f32 %f916, %f259; - mov.f32 %f917, %f259; - mov.f32 %f918, %f259; - mov.f32 %f919, %f259; - mov.f32 %f920, %f259; - mov.f32 %f921, %f259; - mov.f32 %f922, %f259; - mov.f32 %f923, %f259; - mov.f32 %f924, %f259; - mov.f32 %f925, %f259; - mov.f32 %f926, %f259; - mov.f32 %f927, %f259; - mov.f32 %f928, %f259; - mov.f32 %f929, %f259; - mov.f32 %f930, %f259; - mov.f32 %f931, %f259; - mov.f32 %f932, %f259; - mov.f32 %f933, %f259; - mov.f32 %f934, %f259; - mov.f32 %f935, %f259; - mov.f32 %f936, %f259; - mov.f32 %f937, %f259; - mov.f32 %f938, %f259; - mov.f32 %f939, %f259; - mov.f32 %f940, %f259; -LBB0_4: - add.s64 %rd94, %rd138, %rd9; - add.s64 %rd93, %rd137, %rd9; - add.s64 %rd92, %rd136, %rd9; - add.s64 %rd91, %rd135, %rd9; - add.s64 %rd89, %rd134, %rd9; - add.s64 %rd88, %rd133, %rd9; - add.s64 %rd87, %rd132, %rd9; - add.s64 %rd86, %rd131, %rd9; - add.s64 %rd103, %rd130, %rd31; - add.s64 %rd102, %rd130, %rd32; - add.s64 %rd101, %rd130, %rd33; - add.s64 %rd100, %rd130, %rd34; - add.s64 %rd99, %rd130, %rd18; - add.s64 %rd98, %rd130, %rd19; - add.s64 %rd97, %rd130, %rd20; - add.s64 %rd96, %rd130, %rd35; - or.b32 %r6102, %r6176, %r3; - @%p102 ld.global.v4.b32 { %r6103, %r6104, %r6105, %r6106 }, [ %rd86 + 0 ]; - mov.b32 %hh33, %r6103; - mov.b32 %hh34, %r6104; - mov.b32 %hh35, %r6105; - mov.b32 %hh36, %r6106; - @%p102 ld.global.v4.b32 { %r6107, %r6108, %r6109, %r6110 }, [ %rd87 + 0 ]; - mov.b32 %hh37, %r6107; - mov.b32 %hh38, %r6108; - mov.b32 %hh39, %r6109; - mov.b32 %hh40, %r6110; - @%p102 ld.global.v4.b32 { %r6111, %r6112, %r6113, %r6114 }, [ %rd88 + 0 ]; - mov.b32 %hh41, %r6111; - mov.b32 %hh42, %r6112; - mov.b32 %hh43, %r6113; - mov.b32 %hh44, %r6114; - @%p102 ld.global.v4.b32 { %r6115, %r6116, %r6117, %r6118 }, [ %rd89 + 0 ]; - mov.b32 %hh45, %r6115; - mov.b32 %hh46, %r6116; - mov.b32 %hh47, %r6117; - mov.b32 %hh48, %r6118; - bar.sync 0; - st.shared.v4.b32 [%r32], {%r6103, %r6104, %r6105, %r6106}; - st.shared.v4.b32 [%r33], {%r6107, %r6108, %r6109, %r6110}; - st.shared.v4.b32 [%r34], {%r6111, %r6112, %r6113, %r6114}; - st.shared.v4.b32 [%r35], {%r6115, %r6116, %r6117, %r6118}; - bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r884, %r885, %r886, %r887 }, [ %r704 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1108, %r1109, %r1110, %r1111 }, [ %r709 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1332, %r1333, %r1334, %r1335 }, [ %r714 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1556, %r1557, %r1558, %r1559 }, [ %r719 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r720, %r721, %r722, %r723 }, [ %r724 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r725, %r726, %r727, %r728 }, [ %r729 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r730, %r731, %r732, %r733 }, [ %r734 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r735, %r736, %r737, %r738 }, [ %r739 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r740, %r741, %r742, %r743 }, [ %r744 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r745, %r746, %r747, %r748 }, [ %r749 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r750, %r751, %r752, %r753 }, [ %r754 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r755, %r756, %r757, %r758 }, [ %r759 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r760, %r761, %r762, %r763 }, [ %r764 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r765, %r766, %r767, %r768 }, [ %r769 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r770, %r771, %r772, %r773 }, [ %r774 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r775, %r776, %r777, %r778 }, [ %r779 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r780, %r781, %r782, %r783 }, [ %r784 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r785, %r786, %r787, %r788 }, [ %r789 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r790, %r791, %r792, %r793 }, [ %r794 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r795, %r796, %r797, %r798 }, [ %r799 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r800, %r801, %r802, %r803 }, [ %r804 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r805, %r806, %r807, %r808 }, [ %r809 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r810, %r811, %r812, %r813 }, [ %r814 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r815, %r816, %r817, %r818 }, [ %r819 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r820, %r821, %r822, %r823 }, [ %r824 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r825, %r826, %r827, %r828 }, [ %r829 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r830, %r831, %r832, %r833 }, [ %r834 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r835, %r836, %r837, %r838 }, [ %r839 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r840, %r841, %r842, %r843 }, [ %r844 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r845, %r846, %r847, %r848 }, [ %r849 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r850, %r851, %r852, %r853 }, [ %r854 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r855, %r856, %r857, %r858 }, [ %r859 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r860, %r861, %r862, %r863 }, [ %r864 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r865, %r866, %r867, %r868 }, [ %r869 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r870, %r871, %r872, %r873 }, [ %r874 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r875, %r876, %r877, %r878 }, [ %r879 + 0 ]; - mov.u32 %r1317, 0; - mov.u32 %r1104, %r1317; - mov.u32 %r1105, %r1317; - mov.u32 %r1106, %r1317; - mov.u32 %r1107, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r884, %r885, %r886, %r887 }, { %r720, %r721 }, { %r1104, %r1105, %r1106, %r1107 }; - mov.u32 %r1118, %r1317; - mov.u32 %r1119, %r1317; - mov.u32 %r1120, %r1317; - mov.u32 %r1121, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r884, %r885, %r886, %r887 }, { %r722, %r723 }, { %r1118, %r1119, %r1120, %r1121 }; - mov.u32 %r1132, %r1317; - mov.u32 %r1133, %r1317; - mov.u32 %r1134, %r1317; - mov.u32 %r1135, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r884, %r885, %r886, %r887 }, { %r740, %r741 }, { %r1132, %r1133, %r1134, %r1135 }; - mov.u32 %r1146, %r1317; - mov.u32 %r1147, %r1317; - mov.u32 %r1148, %r1317; - mov.u32 %r1149, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r884, %r885, %r886, %r887 }, { %r742, %r743 }, { %r1146, %r1147, %r1148, %r1149 }; - mov.u32 %r1160, %r1317; - mov.u32 %r1161, %r1317; - mov.u32 %r1162, %r1317; - mov.u32 %r1163, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r884, %r885, %r886, %r887 }, { %r760, %r761 }, { %r1160, %r1161, %r1162, %r1163 }; - mov.u32 %r1174, %r1317; - mov.u32 %r1175, %r1317; - mov.u32 %r1176, %r1317; - mov.u32 %r1177, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r884, %r885, %r886, %r887 }, { %r762, %r763 }, { %r1174, %r1175, %r1176, %r1177 }; - mov.u32 %r1188, %r1317; - mov.u32 %r1189, %r1317; - mov.u32 %r1190, %r1317; - mov.u32 %r1191, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r884, %r885, %r886, %r887 }, { %r780, %r781 }, { %r1188, %r1189, %r1190, %r1191 }; - mov.u32 %r1202, %r1317; - mov.u32 %r1203, %r1317; - mov.u32 %r1204, %r1317; - mov.u32 %r1205, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r884, %r885, %r886, %r887 }, { %r782, %r783 }, { %r1202, %r1203, %r1204, %r1205 }; - mov.u32 %r1216, %r1317; - mov.u32 %r1217, %r1317; - mov.u32 %r1218, %r1317; - mov.u32 %r1219, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r884, %r885, %r886, %r887 }, { %r800, %r801 }, { %r1216, %r1217, %r1218, %r1219 }; - mov.u32 %r1230, %r1317; - mov.u32 %r1231, %r1317; - mov.u32 %r1232, %r1317; - mov.u32 %r1233, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r884, %r885, %r886, %r887 }, { %r802, %r803 }, { %r1230, %r1231, %r1232, %r1233 }; - mov.u32 %r1244, %r1317; - mov.u32 %r1245, %r1317; - mov.u32 %r1246, %r1317; - mov.u32 %r1247, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r884, %r885, %r886, %r887 }, { %r820, %r821 }, { %r1244, %r1245, %r1246, %r1247 }; - mov.u32 %r1258, %r1317; - mov.u32 %r1259, %r1317; - mov.u32 %r1260, %r1317; - mov.u32 %r1261, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r884, %r885, %r886, %r887 }, { %r822, %r823 }, { %r1258, %r1259, %r1260, %r1261 }; - mov.u32 %r1272, %r1317; - mov.u32 %r1273, %r1317; - mov.u32 %r1274, %r1317; - mov.u32 %r1275, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r884, %r885, %r886, %r887 }, { %r840, %r841 }, { %r1272, %r1273, %r1274, %r1275 }; - mov.u32 %r1286, %r1317; - mov.u32 %r1287, %r1317; - mov.u32 %r1288, %r1317; - mov.u32 %r1289, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r884, %r885, %r886, %r887 }, { %r842, %r843 }, { %r1286, %r1287, %r1288, %r1289 }; - mov.u32 %r1300, %r1317; - mov.u32 %r1301, %r1317; - mov.u32 %r1302, %r1317; - mov.u32 %r1303, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r884, %r885, %r886, %r887 }, { %r860, %r861 }, { %r1300, %r1301, %r1302, %r1303 }; - mov.u32 %r1314, %r1317; - mov.u32 %r1315, %r1317; - mov.u32 %r1316, %r1317; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r884, %r885, %r886, %r887 }, { %r862, %r863 }, { %r1314, %r1315, %r1316, %r1317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1108, %r1109, %r1110, %r1111 }, { %r725, %r726 }, { %r1104, %r1105, %r1106, %r1107 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1108, %r1109, %r1110, %r1111 }, { %r727, %r728 }, { %r1118, %r1119, %r1120, %r1121 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1108, %r1109, %r1110, %r1111 }, { %r745, %r746 }, { %r1132, %r1133, %r1134, %r1135 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1108, %r1109, %r1110, %r1111 }, { %r747, %r748 }, { %r1146, %r1147, %r1148, %r1149 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1108, %r1109, %r1110, %r1111 }, { %r765, %r766 }, { %r1160, %r1161, %r1162, %r1163 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1108, %r1109, %r1110, %r1111 }, { %r767, %r768 }, { %r1174, %r1175, %r1176, %r1177 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1108, %r1109, %r1110, %r1111 }, { %r785, %r786 }, { %r1188, %r1189, %r1190, %r1191 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1108, %r1109, %r1110, %r1111 }, { %r787, %r788 }, { %r1202, %r1203, %r1204, %r1205 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1108, %r1109, %r1110, %r1111 }, { %r805, %r806 }, { %r1216, %r1217, %r1218, %r1219 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1108, %r1109, %r1110, %r1111 }, { %r807, %r808 }, { %r1230, %r1231, %r1232, %r1233 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1108, %r1109, %r1110, %r1111 }, { %r825, %r826 }, { %r1244, %r1245, %r1246, %r1247 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1108, %r1109, %r1110, %r1111 }, { %r827, %r828 }, { %r1258, %r1259, %r1260, %r1261 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1108, %r1109, %r1110, %r1111 }, { %r845, %r846 }, { %r1272, %r1273, %r1274, %r1275 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1108, %r1109, %r1110, %r1111 }, { %r847, %r848 }, { %r1286, %r1287, %r1288, %r1289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1108, %r1109, %r1110, %r1111 }, { %r865, %r866 }, { %r1300, %r1301, %r1302, %r1303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1108, %r1109, %r1110, %r1111 }, { %r867, %r868 }, { %r1314, %r1315, %r1316, %r1317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1332, %r1333, %r1334, %r1335 }, { %r730, %r731 }, { %r1104, %r1105, %r1106, %r1107 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1332, %r1333, %r1334, %r1335 }, { %r732, %r733 }, { %r1118, %r1119, %r1120, %r1121 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1332, %r1333, %r1334, %r1335 }, { %r750, %r751 }, { %r1132, %r1133, %r1134, %r1135 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1332, %r1333, %r1334, %r1335 }, { %r752, %r753 }, { %r1146, %r1147, %r1148, %r1149 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1332, %r1333, %r1334, %r1335 }, { %r770, %r771 }, { %r1160, %r1161, %r1162, %r1163 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1332, %r1333, %r1334, %r1335 }, { %r772, %r773 }, { %r1174, %r1175, %r1176, %r1177 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1332, %r1333, %r1334, %r1335 }, { %r790, %r791 }, { %r1188, %r1189, %r1190, %r1191 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1332, %r1333, %r1334, %r1335 }, { %r792, %r793 }, { %r1202, %r1203, %r1204, %r1205 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1332, %r1333, %r1334, %r1335 }, { %r810, %r811 }, { %r1216, %r1217, %r1218, %r1219 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1332, %r1333, %r1334, %r1335 }, { %r812, %r813 }, { %r1230, %r1231, %r1232, %r1233 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1332, %r1333, %r1334, %r1335 }, { %r830, %r831 }, { %r1244, %r1245, %r1246, %r1247 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1332, %r1333, %r1334, %r1335 }, { %r832, %r833 }, { %r1258, %r1259, %r1260, %r1261 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1332, %r1333, %r1334, %r1335 }, { %r850, %r851 }, { %r1272, %r1273, %r1274, %r1275 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1332, %r1333, %r1334, %r1335 }, { %r852, %r853 }, { %r1286, %r1287, %r1288, %r1289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1332, %r1333, %r1334, %r1335 }, { %r870, %r871 }, { %r1300, %r1301, %r1302, %r1303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1332, %r1333, %r1334, %r1335 }, { %r872, %r873 }, { %r1314, %r1315, %r1316, %r1317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1556, %r1557, %r1558, %r1559 }, { %r735, %r736 }, { %r1104, %r1105, %r1106, %r1107 }; - mov.b32 %f388, %r1107; - mov.b32 %f389, %r1106; - mov.b32 %f390, %r1105; - mov.b32 %f391, %r1104; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1556, %r1557, %r1558, %r1559 }, { %r737, %r738 }, { %r1118, %r1119, %r1120, %r1121 }; - mov.b32 %f392, %r1121; - mov.b32 %f393, %r1120; - mov.b32 %f394, %r1119; - mov.b32 %f395, %r1118; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1556, %r1557, %r1558, %r1559 }, { %r755, %r756 }, { %r1132, %r1133, %r1134, %r1135 }; - mov.b32 %f396, %r1135; - mov.b32 %f397, %r1134; - mov.b32 %f398, %r1133; - mov.b32 %f399, %r1132; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1556, %r1557, %r1558, %r1559 }, { %r757, %r758 }, { %r1146, %r1147, %r1148, %r1149 }; - mov.b32 %f400, %r1149; - mov.b32 %f401, %r1148; - mov.b32 %f402, %r1147; - mov.b32 %f403, %r1146; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1556, %r1557, %r1558, %r1559 }, { %r775, %r776 }, { %r1160, %r1161, %r1162, %r1163 }; - mov.b32 %f404, %r1163; - mov.b32 %f405, %r1162; - mov.b32 %f406, %r1161; - mov.b32 %f407, %r1160; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1556, %r1557, %r1558, %r1559 }, { %r777, %r778 }, { %r1174, %r1175, %r1176, %r1177 }; - mov.b32 %f408, %r1177; - mov.b32 %f409, %r1176; - mov.b32 %f410, %r1175; - mov.b32 %f411, %r1174; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1556, %r1557, %r1558, %r1559 }, { %r795, %r796 }, { %r1188, %r1189, %r1190, %r1191 }; - mov.b32 %f412, %r1191; - mov.b32 %f413, %r1190; - mov.b32 %f414, %r1189; - mov.b32 %f415, %r1188; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1556, %r1557, %r1558, %r1559 }, { %r797, %r798 }, { %r1202, %r1203, %r1204, %r1205 }; - mov.b32 %f416, %r1205; - mov.b32 %f417, %r1204; - mov.b32 %f418, %r1203; - mov.b32 %f419, %r1202; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1556, %r1557, %r1558, %r1559 }, { %r815, %r816 }, { %r1216, %r1217, %r1218, %r1219 }; - mov.b32 %f420, %r1219; - mov.b32 %f421, %r1218; - mov.b32 %f422, %r1217; - mov.b32 %f423, %r1216; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1556, %r1557, %r1558, %r1559 }, { %r817, %r818 }, { %r1230, %r1231, %r1232, %r1233 }; - mov.b32 %f424, %r1233; - mov.b32 %f425, %r1232; - mov.b32 %f426, %r1231; - mov.b32 %f427, %r1230; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1556, %r1557, %r1558, %r1559 }, { %r835, %r836 }, { %r1244, %r1245, %r1246, %r1247 }; - mov.b32 %f428, %r1247; - mov.b32 %f429, %r1246; - mov.b32 %f430, %r1245; - mov.b32 %f431, %r1244; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1556, %r1557, %r1558, %r1559 }, { %r837, %r838 }, { %r1258, %r1259, %r1260, %r1261 }; - mov.b32 %f432, %r1261; - mov.b32 %f433, %r1260; - mov.b32 %f434, %r1259; - mov.b32 %f435, %r1258; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1556, %r1557, %r1558, %r1559 }, { %r855, %r856 }, { %r1272, %r1273, %r1274, %r1275 }; - mov.b32 %f436, %r1275; - mov.b32 %f437, %r1274; - mov.b32 %f438, %r1273; - mov.b32 %f439, %r1272; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1556, %r1557, %r1558, %r1559 }, { %r857, %r858 }, { %r1286, %r1287, %r1288, %r1289 }; - mov.b32 %f440, %r1289; - mov.b32 %f441, %r1288; - mov.b32 %f442, %r1287; - mov.b32 %f443, %r1286; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1556, %r1557, %r1558, %r1559 }, { %r875, %r876 }, { %r1300, %r1301, %r1302, %r1303 }; - mov.b32 %f444, %r1303; - mov.b32 %f445, %r1302; - mov.b32 %f446, %r1301; - mov.b32 %f447, %r1300; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1556, %r1557, %r1558, %r1559 }, { %r877, %r878 }, { %r1314, %r1315, %r1316, %r1317 }; - mov.b32 %f448, %r1317; - mov.b32 %f449, %r1316; - mov.b32 %f450, %r1315; - mov.b32 %f451, %r1314; - or.b32 %r6119, %r6176, %r17; - or.b32 %r6120, %r6176, %r18; - setp.lt.s32 %p37, %r6120, %r307; - setp.lt.s32 %p38, %r6120, %r308; - setp.lt.s32 %p39, %r6119, %r307; - setp.lt.s32 %p40, %r6119, %r308; - setp.lt.s32 %p41, %r6120, %r311; - setp.lt.s32 %p42, %r6120, %r312; - setp.lt.s32 %p43, %r6119, %r311; - setp.lt.s32 %p44, %r6119, %r312; - setp.lt.s32 %p45, %r6120, %r315; - setp.lt.s32 %p46, %r6120, %r316; - setp.lt.s32 %p47, %r6119, %r315; - setp.lt.s32 %p48, %r6119, %r316; - setp.lt.s32 %p49, %r6120, %r319; - setp.lt.s32 %p50, %r6120, %r320; - setp.lt.s32 %p51, %r6119, %r319; - setp.lt.s32 %p52, %r6119, %r320; - setp.lt.s32 %p53, %r6120, %r323; - setp.lt.s32 %p54, %r6120, %r324; - setp.lt.s32 %p55, %r6119, %r323; - setp.lt.s32 %p56, %r6119, %r324; - setp.lt.s32 %p57, %r6120, %r327; - setp.lt.s32 %p58, %r6120, %r328; - setp.lt.s32 %p59, %r6119, %r327; - setp.lt.s32 %p60, %r6119, %r328; - setp.lt.s32 %p61, %r6120, %r331; - setp.lt.s32 %p62, %r6120, %r332; - setp.lt.s32 %p63, %r6119, %r331; - setp.lt.s32 %p64, %r6119, %r332; - setp.lt.s32 %p65, %r6120, %r335; - setp.lt.s32 %p66, %r6120, %r336; - setp.lt.s32 %p67, %r6119, %r335; - setp.lt.s32 %p68, %r6119, %r336; - setp.lt.s32 %p69, %r6120, %r339; - setp.lt.s32 %p70, %r6120, %r340; - setp.lt.s32 %p71, %r6119, %r339; - setp.lt.s32 %p72, %r6119, %r340; - setp.lt.s32 %p73, %r6120, %r343; - setp.lt.s32 %p74, %r6120, %r344; - setp.lt.s32 %p75, %r6119, %r343; - setp.lt.s32 %p76, %r6119, %r344; - setp.lt.s32 %p77, %r6120, %r347; - setp.lt.s32 %p78, %r6120, %r348; - setp.lt.s32 %p79, %r6119, %r347; - setp.lt.s32 %p80, %r6119, %r348; - setp.lt.s32 %p81, %r6120, %r351; - setp.lt.s32 %p82, %r6120, %r352; - setp.lt.s32 %p83, %r6119, %r351; - setp.lt.s32 %p84, %r6119, %r352; - setp.lt.s32 %p85, %r6120, %r355; - setp.lt.s32 %p86, %r6120, %r356; - setp.lt.s32 %p87, %r6119, %r355; - setp.lt.s32 %p88, %r6119, %r356; - setp.lt.s32 %p89, %r6120, %r359; - setp.lt.s32 %p90, %r6120, %r360; - setp.lt.s32 %p91, %r6119, %r359; - setp.lt.s32 %p92, %r6119, %r360; - setp.lt.s32 %p93, %r6120, %r304; - setp.lt.s32 %p94, %r6120, %r303; - setp.lt.s32 %p95, %r6119, %r304; - setp.lt.s32 %p96, %r6119, %r303; - setp.lt.s32 %p97, %r6120, %r302; - setp.lt.s32 %p98, %r6120, %r301; - setp.lt.s32 %p99, %r6119, %r302; - setp.lt.s32 %p100, %r6119, %r301; - selp.f32 %f452, 0fFF800000, %f391, %p100; - selp.f32 %f453, 0fFF800000, %f390, %p99; - selp.f32 %f454, 0fFF800000, %f389, %p98; - selp.f32 %f455, 0fFF800000, %f388, %p97; - selp.f32 %f456, 0fFF800000, %f395, %p96; - selp.f32 %f457, 0fFF800000, %f394, %p95; - selp.f32 %f458, 0fFF800000, %f393, %p94; - selp.f32 %f459, 0fFF800000, %f392, %p93; - selp.f32 %f460, 0fFF800000, %f399, %p92; - selp.f32 %f461, 0fFF800000, %f398, %p91; - selp.f32 %f462, 0fFF800000, %f397, %p90; - selp.f32 %f463, 0fFF800000, %f396, %p89; - selp.f32 %f464, 0fFF800000, %f403, %p88; - selp.f32 %f465, 0fFF800000, %f402, %p87; - selp.f32 %f466, 0fFF800000, %f401, %p86; - selp.f32 %f467, 0fFF800000, %f400, %p85; - selp.f32 %f468, 0fFF800000, %f407, %p84; - selp.f32 %f469, 0fFF800000, %f406, %p83; - selp.f32 %f470, 0fFF800000, %f405, %p82; - selp.f32 %f471, 0fFF800000, %f404, %p81; - selp.f32 %f472, 0fFF800000, %f411, %p80; - selp.f32 %f473, 0fFF800000, %f410, %p79; - selp.f32 %f474, 0fFF800000, %f409, %p78; - selp.f32 %f475, 0fFF800000, %f408, %p77; - selp.f32 %f476, 0fFF800000, %f415, %p76; - selp.f32 %f477, 0fFF800000, %f414, %p75; - selp.f32 %f478, 0fFF800000, %f413, %p74; - selp.f32 %f479, 0fFF800000, %f412, %p73; - selp.f32 %f480, 0fFF800000, %f419, %p72; - selp.f32 %f481, 0fFF800000, %f418, %p71; - selp.f32 %f482, 0fFF800000, %f417, %p70; - selp.f32 %f483, 0fFF800000, %f416, %p69; - selp.f32 %f484, 0fFF800000, %f423, %p68; - selp.f32 %f485, 0fFF800000, %f422, %p67; - selp.f32 %f486, 0fFF800000, %f421, %p66; - selp.f32 %f487, 0fFF800000, %f420, %p65; - selp.f32 %f488, 0fFF800000, %f427, %p64; - selp.f32 %f489, 0fFF800000, %f426, %p63; - selp.f32 %f490, 0fFF800000, %f425, %p62; - selp.f32 %f491, 0fFF800000, %f424, %p61; - selp.f32 %f492, 0fFF800000, %f431, %p60; - selp.f32 %f493, 0fFF800000, %f430, %p59; - selp.f32 %f494, 0fFF800000, %f429, %p58; - selp.f32 %f495, 0fFF800000, %f428, %p57; - selp.f32 %f496, 0fFF800000, %f435, %p56; - selp.f32 %f497, 0fFF800000, %f434, %p55; - selp.f32 %f498, 0fFF800000, %f433, %p54; - selp.f32 %f499, 0fFF800000, %f432, %p53; - selp.f32 %f500, 0fFF800000, %f439, %p52; - selp.f32 %f501, 0fFF800000, %f438, %p51; - selp.f32 %f502, 0fFF800000, %f437, %p50; - selp.f32 %f503, 0fFF800000, %f436, %p49; - selp.f32 %f504, 0fFF800000, %f443, %p48; - selp.f32 %f505, 0fFF800000, %f442, %p47; - selp.f32 %f506, 0fFF800000, %f441, %p46; - selp.f32 %f507, 0fFF800000, %f440, %p45; - selp.f32 %f508, 0fFF800000, %f447, %p44; - selp.f32 %f509, 0fFF800000, %f446, %p43; - selp.f32 %f510, 0fFF800000, %f445, %p42; - selp.f32 %f511, 0fFF800000, %f444, %p41; - selp.f32 %f512, 0fFF800000, %f451, %p40; - selp.f32 %f513, 0fFF800000, %f450, %p39; - selp.f32 %f514, 0fFF800000, %f449, %p38; - selp.f32 %f515, 0fFF800000, %f448, %p37; - mul.wide.s32 %rd112, %r6102, 4; - add.s64 %rd90, %rd8, %rd112; - @%p102 ld.global.b32 { %r1776 }, [ %rd90 + 0 ]; - st.shared.u32 [%r72], %r1776; - bar.sync 0; - ld.shared.f32 %f516, [%r73]; - ld.shared.f32 %f517, [%r74+32]; - neg.f32 %f518, %f516; - fma.rn.f32 %f519, %f452, %f195, %f518; - fma.rn.f32 %f520, %f453, %f195, %f518; - neg.f32 %f521, %f517; - fma.rn.f32 %f522, %f454, %f195, %f521; - fma.rn.f32 %f523, %f455, %f195, %f521; - fma.rn.f32 %f524, %f456, %f195, %f518; - fma.rn.f32 %f525, %f457, %f195, %f518; - fma.rn.f32 %f526, %f458, %f195, %f521; - fma.rn.f32 %f527, %f459, %f195, %f521; - fma.rn.f32 %f528, %f460, %f195, %f518; - fma.rn.f32 %f529, %f461, %f195, %f518; - fma.rn.f32 %f530, %f462, %f195, %f521; - fma.rn.f32 %f531, %f463, %f195, %f521; - fma.rn.f32 %f532, %f464, %f195, %f518; - fma.rn.f32 %f533, %f465, %f195, %f518; - fma.rn.f32 %f534, %f466, %f195, %f521; - fma.rn.f32 %f535, %f467, %f195, %f521; - fma.rn.f32 %f536, %f468, %f195, %f518; - fma.rn.f32 %f537, %f469, %f195, %f518; - fma.rn.f32 %f538, %f470, %f195, %f521; - fma.rn.f32 %f539, %f471, %f195, %f521; - fma.rn.f32 %f540, %f472, %f195, %f518; - fma.rn.f32 %f541, %f473, %f195, %f518; - fma.rn.f32 %f542, %f474, %f195, %f521; - fma.rn.f32 %f543, %f475, %f195, %f521; - fma.rn.f32 %f544, %f476, %f195, %f518; - fma.rn.f32 %f545, %f477, %f195, %f518; - fma.rn.f32 %f546, %f478, %f195, %f521; - fma.rn.f32 %f547, %f479, %f195, %f521; - fma.rn.f32 %f548, %f480, %f195, %f518; - fma.rn.f32 %f549, %f481, %f195, %f518; - fma.rn.f32 %f550, %f482, %f195, %f521; - fma.rn.f32 %f551, %f483, %f195, %f521; - fma.rn.f32 %f552, %f484, %f195, %f518; - fma.rn.f32 %f553, %f485, %f195, %f518; - fma.rn.f32 %f554, %f486, %f195, %f521; - fma.rn.f32 %f555, %f487, %f195, %f521; - fma.rn.f32 %f556, %f488, %f195, %f518; - fma.rn.f32 %f557, %f489, %f195, %f518; - fma.rn.f32 %f558, %f490, %f195, %f521; - fma.rn.f32 %f559, %f491, %f195, %f521; - fma.rn.f32 %f560, %f492, %f195, %f518; - fma.rn.f32 %f561, %f493, %f195, %f518; - fma.rn.f32 %f562, %f494, %f195, %f521; - fma.rn.f32 %f563, %f495, %f195, %f521; - fma.rn.f32 %f564, %f496, %f195, %f518; - fma.rn.f32 %f565, %f497, %f195, %f518; - fma.rn.f32 %f566, %f498, %f195, %f521; - fma.rn.f32 %f567, %f499, %f195, %f521; - fma.rn.f32 %f568, %f500, %f195, %f518; - fma.rn.f32 %f569, %f501, %f195, %f518; - fma.rn.f32 %f570, %f502, %f195, %f521; - fma.rn.f32 %f571, %f503, %f195, %f521; - fma.rn.f32 %f572, %f504, %f195, %f518; - fma.rn.f32 %f573, %f505, %f195, %f518; - fma.rn.f32 %f574, %f506, %f195, %f521; - fma.rn.f32 %f575, %f507, %f195, %f521; - fma.rn.f32 %f576, %f508, %f195, %f518; - fma.rn.f32 %f577, %f509, %f195, %f518; - fma.rn.f32 %f578, %f510, %f195, %f521; - fma.rn.f32 %f579, %f511, %f195, %f521; - fma.rn.f32 %f580, %f512, %f195, %f518; - fma.rn.f32 %f581, %f513, %f195, %f518; - fma.rn.f32 %f582, %f514, %f195, %f521; - fma.rn.f32 %f583, %f515, %f195, %f521; - mul.f32 %f261, %f519, 0f3FB8AA3B; - ex2.approx.f32 %f260, %f261; - mul.f32 %f263, %f520, 0f3FB8AA3B; - ex2.approx.f32 %f262, %f263; - mul.f32 %f265, %f522, 0f3FB8AA3B; - ex2.approx.f32 %f264, %f265; - mul.f32 %f267, %f523, 0f3FB8AA3B; - ex2.approx.f32 %f266, %f267; - mul.f32 %f269, %f524, 0f3FB8AA3B; - ex2.approx.f32 %f268, %f269; - mul.f32 %f271, %f525, 0f3FB8AA3B; - ex2.approx.f32 %f270, %f271; - mul.f32 %f273, %f526, 0f3FB8AA3B; - ex2.approx.f32 %f272, %f273; - mul.f32 %f275, %f527, 0f3FB8AA3B; - ex2.approx.f32 %f274, %f275; - mul.f32 %f277, %f528, 0f3FB8AA3B; - ex2.approx.f32 %f276, %f277; - mul.f32 %f279, %f529, 0f3FB8AA3B; - ex2.approx.f32 %f278, %f279; - mul.f32 %f281, %f530, 0f3FB8AA3B; - ex2.approx.f32 %f280, %f281; - mul.f32 %f283, %f531, 0f3FB8AA3B; - ex2.approx.f32 %f282, %f283; - mul.f32 %f285, %f532, 0f3FB8AA3B; - ex2.approx.f32 %f284, %f285; - mul.f32 %f287, %f533, 0f3FB8AA3B; - ex2.approx.f32 %f286, %f287; - mul.f32 %f289, %f534, 0f3FB8AA3B; - ex2.approx.f32 %f288, %f289; - mul.f32 %f291, %f535, 0f3FB8AA3B; - ex2.approx.f32 %f290, %f291; - mul.f32 %f293, %f536, 0f3FB8AA3B; - ex2.approx.f32 %f292, %f293; - mul.f32 %f295, %f537, 0f3FB8AA3B; - ex2.approx.f32 %f294, %f295; - mul.f32 %f297, %f538, 0f3FB8AA3B; - ex2.approx.f32 %f296, %f297; - mul.f32 %f299, %f539, 0f3FB8AA3B; - ex2.approx.f32 %f298, %f299; - mul.f32 %f301, %f540, 0f3FB8AA3B; - ex2.approx.f32 %f300, %f301; - mul.f32 %f303, %f541, 0f3FB8AA3B; - ex2.approx.f32 %f302, %f303; - mul.f32 %f305, %f542, 0f3FB8AA3B; - ex2.approx.f32 %f304, %f305; - mul.f32 %f307, %f543, 0f3FB8AA3B; - ex2.approx.f32 %f306, %f307; - mul.f32 %f309, %f544, 0f3FB8AA3B; - ex2.approx.f32 %f308, %f309; - mul.f32 %f311, %f545, 0f3FB8AA3B; - ex2.approx.f32 %f310, %f311; - mul.f32 %f313, %f546, 0f3FB8AA3B; - ex2.approx.f32 %f312, %f313; - mul.f32 %f315, %f547, 0f3FB8AA3B; - ex2.approx.f32 %f314, %f315; - mul.f32 %f317, %f548, 0f3FB8AA3B; - ex2.approx.f32 %f316, %f317; - mul.f32 %f319, %f549, 0f3FB8AA3B; - ex2.approx.f32 %f318, %f319; - mul.f32 %f321, %f550, 0f3FB8AA3B; - ex2.approx.f32 %f320, %f321; - mul.f32 %f323, %f551, 0f3FB8AA3B; - ex2.approx.f32 %f322, %f323; - mul.f32 %f325, %f552, 0f3FB8AA3B; - ex2.approx.f32 %f324, %f325; - mul.f32 %f327, %f553, 0f3FB8AA3B; - ex2.approx.f32 %f326, %f327; - mul.f32 %f329, %f554, 0f3FB8AA3B; - ex2.approx.f32 %f328, %f329; - mul.f32 %f331, %f555, 0f3FB8AA3B; - ex2.approx.f32 %f330, %f331; - mul.f32 %f333, %f556, 0f3FB8AA3B; - ex2.approx.f32 %f332, %f333; - mul.f32 %f335, %f557, 0f3FB8AA3B; - ex2.approx.f32 %f334, %f335; - mul.f32 %f337, %f558, 0f3FB8AA3B; - ex2.approx.f32 %f336, %f337; - mul.f32 %f339, %f559, 0f3FB8AA3B; - ex2.approx.f32 %f338, %f339; - mul.f32 %f341, %f560, 0f3FB8AA3B; - ex2.approx.f32 %f340, %f341; - mul.f32 %f343, %f561, 0f3FB8AA3B; - ex2.approx.f32 %f342, %f343; - mul.f32 %f345, %f562, 0f3FB8AA3B; - ex2.approx.f32 %f344, %f345; - mul.f32 %f347, %f563, 0f3FB8AA3B; - ex2.approx.f32 %f346, %f347; - mul.f32 %f349, %f564, 0f3FB8AA3B; - ex2.approx.f32 %f348, %f349; - mul.f32 %f351, %f565, 0f3FB8AA3B; - ex2.approx.f32 %f350, %f351; - mul.f32 %f353, %f566, 0f3FB8AA3B; - ex2.approx.f32 %f352, %f353; - mul.f32 %f355, %f567, 0f3FB8AA3B; - ex2.approx.f32 %f354, %f355; - mul.f32 %f357, %f568, 0f3FB8AA3B; - ex2.approx.f32 %f356, %f357; - mul.f32 %f359, %f569, 0f3FB8AA3B; - ex2.approx.f32 %f358, %f359; - mul.f32 %f361, %f570, 0f3FB8AA3B; - ex2.approx.f32 %f360, %f361; - mul.f32 %f363, %f571, 0f3FB8AA3B; - ex2.approx.f32 %f362, %f363; - mul.f32 %f365, %f572, 0f3FB8AA3B; - ex2.approx.f32 %f364, %f365; - mul.f32 %f367, %f573, 0f3FB8AA3B; - ex2.approx.f32 %f366, %f367; - mul.f32 %f369, %f574, 0f3FB8AA3B; - ex2.approx.f32 %f368, %f369; - mul.f32 %f371, %f575, 0f3FB8AA3B; - ex2.approx.f32 %f370, %f371; - mul.f32 %f373, %f576, 0f3FB8AA3B; - ex2.approx.f32 %f372, %f373; - mul.f32 %f375, %f577, 0f3FB8AA3B; - ex2.approx.f32 %f374, %f375; - mul.f32 %f377, %f578, 0f3FB8AA3B; - ex2.approx.f32 %f376, %f377; - mul.f32 %f379, %f579, 0f3FB8AA3B; - ex2.approx.f32 %f378, %f379; - mul.f32 %f381, %f580, 0f3FB8AA3B; - ex2.approx.f32 %f380, %f381; - mul.f32 %f383, %f581, 0f3FB8AA3B; - ex2.approx.f32 %f382, %f383; - mul.f32 %f385, %f582, 0f3FB8AA3B; - ex2.approx.f32 %f384, %f385; - mul.f32 %f387, %f583, 0f3FB8AA3B; - ex2.approx.f32 %f386, %f387; - @%p102 ld.global.v4.b32 { %r6121, %r6122, %r6123, %r6124 }, [ %rd91 + 0 ]; - mov.b32 %hh49, %r6121; - mov.b32 %hh50, %r6122; - mov.b32 %hh51, %r6123; - mov.b32 %hh52, %r6124; - @%p102 ld.global.v4.b32 { %r6125, %r6126, %r6127, %r6128 }, [ %rd92 + 0 ]; - mov.b32 %hh53, %r6125; - mov.b32 %hh54, %r6126; - mov.b32 %hh55, %r6127; - mov.b32 %hh56, %r6128; - @%p102 ld.global.v4.b32 { %r6129, %r6130, %r6131, %r6132 }, [ %rd93 + 0 ]; - mov.b32 %hh57, %r6129; - mov.b32 %hh58, %r6130; - mov.b32 %hh59, %r6131; - mov.b32 %hh60, %r6132; - @%p102 ld.global.v4.b32 { %r6133, %r6134, %r6135, %r6136 }, [ %rd94 + 0 ]; - mov.b32 %hh61, %r6133; - mov.b32 %hh62, %r6134; - mov.b32 %hh63, %r6135; - mov.b32 %hh64, %r6136; - bar.sync 0; - st.shared.v4.b32 [%r75], {%r6121, %r6122, %r6123, %r6124}; - st.shared.v4.b32 [%r76], {%r6125, %r6126, %r6127, %r6128}; - st.shared.v4.b32 [%r77], {%r6129, %r6130, %r6131, %r6132}; - st.shared.v4.b32 [%r78], {%r6133, %r6134, %r6135, %r6136}; - cvt.rn.f16.f32 %h1, %f262; - cvt.rn.f16.f32 %h2, %f260; - cvt.rn.f16.f32 %h3, %f266; - cvt.rn.f16.f32 %h4, %f264; - cvt.rn.f16.f32 %h5, %f270; - cvt.rn.f16.f32 %h6, %f268; - cvt.rn.f16.f32 %h7, %f274; - cvt.rn.f16.f32 %h8, %f272; - cvt.rn.f16.f32 %h9, %f278; - cvt.rn.f16.f32 %h10, %f276; - cvt.rn.f16.f32 %h11, %f282; - cvt.rn.f16.f32 %h12, %f280; - cvt.rn.f16.f32 %h13, %f286; - cvt.rn.f16.f32 %h14, %f284; - cvt.rn.f16.f32 %h15, %f290; - cvt.rn.f16.f32 %h16, %f288; - cvt.rn.f16.f32 %h17, %f294; - cvt.rn.f16.f32 %h18, %f292; - cvt.rn.f16.f32 %h19, %f298; - cvt.rn.f16.f32 %h20, %f296; - cvt.rn.f16.f32 %h21, %f302; - cvt.rn.f16.f32 %h22, %f300; - cvt.rn.f16.f32 %h23, %f306; - cvt.rn.f16.f32 %h24, %f304; - cvt.rn.f16.f32 %h25, %f310; - cvt.rn.f16.f32 %h26, %f308; - cvt.rn.f16.f32 %h27, %f314; - cvt.rn.f16.f32 %h28, %f312; - cvt.rn.f16.f32 %h29, %f318; - cvt.rn.f16.f32 %h30, %f316; - cvt.rn.f16.f32 %h31, %f322; - cvt.rn.f16.f32 %h32, %f320; - cvt.rn.f16.f32 %h33, %f326; - cvt.rn.f16.f32 %h34, %f324; - cvt.rn.f16.f32 %h35, %f330; - cvt.rn.f16.f32 %h36, %f328; - cvt.rn.f16.f32 %h37, %f334; - cvt.rn.f16.f32 %h38, %f332; - cvt.rn.f16.f32 %h39, %f338; - cvt.rn.f16.f32 %h40, %f336; - cvt.rn.f16.f32 %h41, %f342; - cvt.rn.f16.f32 %h42, %f340; - cvt.rn.f16.f32 %h43, %f346; - cvt.rn.f16.f32 %h44, %f344; - cvt.rn.f16.f32 %h45, %f350; - cvt.rn.f16.f32 %h46, %f348; - cvt.rn.f16.f32 %h47, %f354; - cvt.rn.f16.f32 %h48, %f352; - cvt.rn.f16.f32 %h49, %f358; - cvt.rn.f16.f32 %h50, %f356; - cvt.rn.f16.f32 %h51, %f362; - cvt.rn.f16.f32 %h52, %f360; - cvt.rn.f16.f32 %h53, %f366; - cvt.rn.f16.f32 %h54, %f364; - cvt.rn.f16.f32 %h55, %f370; - cvt.rn.f16.f32 %h56, %f368; - cvt.rn.f16.f32 %h57, %f374; - cvt.rn.f16.f32 %h58, %f372; - cvt.rn.f16.f32 %h59, %f378; - cvt.rn.f16.f32 %h60, %f376; - cvt.rn.f16.f32 %h61, %f382; - cvt.rn.f16.f32 %h62, %f380; - cvt.rn.f16.f32 %h63, %f386; - cvt.rn.f16.f32 %h64, %f384; - st.shared.v2.b16 [%r79], {%h2, %h1}; - st.shared.v2.b16 [%r80], {%h4, %h3}; - st.shared.v2.b16 [%r81], {%h6, %h5}; - st.shared.v2.b16 [%r82], {%h8, %h7}; - st.shared.v2.b16 [%r83], {%h10, %h9}; - st.shared.v2.b16 [%r84], {%h12, %h11}; - st.shared.v2.b16 [%r85], {%h14, %h13}; - st.shared.v2.b16 [%r86], {%h16, %h15}; - st.shared.v2.b16 [%r87], {%h18, %h17}; - st.shared.v2.b16 [%r88], {%h20, %h19}; - st.shared.v2.b16 [%r89], {%h22, %h21}; - st.shared.v2.b16 [%r90], {%h24, %h23}; - st.shared.v2.b16 [%r91], {%h26, %h25}; - st.shared.v2.b16 [%r92], {%h28, %h27}; - st.shared.v2.b16 [%r93], {%h30, %h29}; - st.shared.v2.b16 [%r94], {%h32, %h31}; - st.shared.v2.b16 [%r79+128], {%h34, %h33}; - st.shared.v2.b16 [%r80+128], {%h36, %h35}; - st.shared.v2.b16 [%r97], {%h38, %h37}; - st.shared.v2.b16 [%r98], {%h40, %h39}; - st.shared.v2.b16 [%r99], {%h42, %h41}; - st.shared.v2.b16 [%r100], {%h44, %h43}; - st.shared.v2.b16 [%r101], {%h46, %h45}; - st.shared.v2.b16 [%r102], {%h48, %h47}; - st.shared.v2.b16 [%r103], {%h50, %h49}; - st.shared.v2.b16 [%r104], {%h52, %h51}; - st.shared.v2.b16 [%r105], {%h54, %h53}; - st.shared.v2.b16 [%r106], {%h56, %h55}; - st.shared.v2.b16 [%r107], {%h58, %h57}; - st.shared.v2.b16 [%r108], {%h60, %h59}; - st.shared.v2.b16 [%r109], {%h62, %h61}; - st.shared.v2.b16 [%r110], {%h64, %h63}; - bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1957, %r1958, %r1959, %r1960 }, [ %r1797 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2069, %r2070, %r2071, %r2072 }, [ %r1802 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2181, %r2182, %r2183, %r2184 }, [ %r1807 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2293, %r2294, %r2295, %r2296 }, [ %r1812 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2405, %r2406, %r2407, %r2408 }, [ %r1817 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2517, %r2518, %r2519, %r2520 }, [ %r1822 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2629, %r2630, %r2631, %r2632 }, [ %r1827 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2741, %r2742, %r2743, %r2744 }, [ %r1832 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2013, %r2014, %r2015, %r2016 }, [ %r1837 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2125, %r2126, %r2127, %r2128 }, [ %r1842 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2237, %r2238, %r2239, %r2240 }, [ %r1847 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2349, %r2350, %r2351, %r2352 }, [ %r1852 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2461, %r2462, %r2463, %r2464 }, [ %r1857 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2573, %r2574, %r2575, %r2576 }, [ %r1862 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2685, %r2686, %r2687, %r2688 }, [ %r1867 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2797, %r2798, %r2799, %r2800 }, [ %r1872 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1961, %r1962, %r1975, %r1976 }, [ %r1877 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2073, %r2074, %r2087, %r2088 }, [ %r1882 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2185, %r2186, %r2199, %r2200 }, [ %r1887 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2297, %r2298, %r2311, %r2312 }, [ %r1892 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2409, %r2410, %r2423, %r2424 }, [ %r1897 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2521, %r2522, %r2535, %r2536 }, [ %r1902 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2633, %r2634, %r2647, %r2648 }, [ %r1907 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2745, %r2746, %r2759, %r2760 }, [ %r1912 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1989, %r1990, %r2003, %r2004 }, [ %r1917 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2101, %r2102, %r2115, %r2116 }, [ %r1922 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2213, %r2214, %r2227, %r2228 }, [ %r1927 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2325, %r2326, %r2339, %r2340 }, [ %r1932 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2437, %r2438, %r2451, %r2452 }, [ %r1937 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2549, %r2550, %r2563, %r2564 }, [ %r1942 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2661, %r2662, %r2675, %r2676 }, [ %r1947 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2773, %r2774, %r2787, %r2788 }, [ %r1952 + 0 ]; - mov.b32 %r2065, %f909; - mov.b32 %r2066, %f910; - mov.b32 %r2067, %f911; - mov.b32 %r2068, %f912; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1961, %r1962 }, { %r2065, %r2066, %r2067, %r2068 }; - mov.b32 %r2079, %f913; - mov.b32 %r2080, %f914; - mov.b32 %r2081, %f915; - mov.b32 %r2082, %f916; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1975, %r1976 }, { %r2079, %r2080, %r2081, %r2082 }; - mov.b32 %r2093, %f917; - mov.b32 %r2094, %f918; - mov.b32 %r2095, %f919; - mov.b32 %r2096, %f920; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1989, %r1990 }, { %r2093, %r2094, %r2095, %r2096 }; - mov.b32 %r2107, %f921; - mov.b32 %r2108, %f922; - mov.b32 %r2109, %f923; - mov.b32 %r2110, %f924; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r1957, %r1958, %r1959, %r1960 }, { %r2003, %r2004 }, { %r2107, %r2108, %r2109, %r2110 }; - mov.b32 %r2121, %f925; - mov.b32 %r2122, %f926; - mov.b32 %r2123, %f927; - mov.b32 %r2124, %f928; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1961, %r1962 }, { %r2121, %r2122, %r2123, %r2124 }; - mov.b32 %r2135, %f929; - mov.b32 %r2136, %f930; - mov.b32 %r2137, %f931; - mov.b32 %r2138, %f932; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1975, %r1976 }, { %r2135, %r2136, %r2137, %r2138 }; - mov.b32 %r2149, %f933; - mov.b32 %r2150, %f934; - mov.b32 %r2151, %f935; - mov.b32 %r2152, %f936; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1989, %r1990 }, { %r2149, %r2150, %r2151, %r2152 }; - mov.b32 %r2163, %f937; - mov.b32 %r2164, %f938; - mov.b32 %r2165, %f939; - mov.b32 %r2166, %f940; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2013, %r2014, %r2015, %r2016 }, { %r2003, %r2004 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2073, %r2074 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2087, %r2088 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2101, %r2102 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2115, %r2116 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2073, %r2074 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2087, %r2088 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2101, %r2102 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2115, %r2116 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2185, %r2186 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2199, %r2200 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2213, %r2214 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2227, %r2228 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2185, %r2186 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2199, %r2200 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2213, %r2214 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2227, %r2228 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2297, %r2298 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2311, %r2312 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2325, %r2326 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2339, %r2340 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2297, %r2298 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2311, %r2312 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2325, %r2326 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2339, %r2340 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2409, %r2410 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2423, %r2424 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2437, %r2438 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2451, %r2452 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2409, %r2410 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2423, %r2424 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2437, %r2438 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2451, %r2452 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2521, %r2522 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2535, %r2536 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2549, %r2550 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2563, %r2564 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2521, %r2522 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2535, %r2536 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2549, %r2550 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2563, %r2564 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2633, %r2634 }, { %r2065, %r2066, %r2067, %r2068 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2647, %r2648 }, { %r2079, %r2080, %r2081, %r2082 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2661, %r2662 }, { %r2093, %r2094, %r2095, %r2096 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2675, %r2676 }, { %r2107, %r2108, %r2109, %r2110 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2633, %r2634 }, { %r2121, %r2122, %r2123, %r2124 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2647, %r2648 }, { %r2135, %r2136, %r2137, %r2138 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2661, %r2662 }, { %r2149, %r2150, %r2151, %r2152 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2675, %r2676 }, { %r2163, %r2164, %r2165, %r2166 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2745, %r2746 }, { %r2065, %r2066, %r2067, %r2068 }; - mov.b32 %f912, %r2068; - mov.b32 %f911, %r2067; - mov.b32 %f910, %r2066; - mov.b32 %f909, %r2065; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2759, %r2760 }, { %r2079, %r2080, %r2081, %r2082 }; - mov.b32 %f916, %r2082; - mov.b32 %f915, %r2081; - mov.b32 %f914, %r2080; - mov.b32 %f913, %r2079; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2773, %r2774 }, { %r2093, %r2094, %r2095, %r2096 }; - mov.b32 %f920, %r2096; - mov.b32 %f919, %r2095; - mov.b32 %f918, %r2094; - mov.b32 %f917, %r2093; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2787, %r2788 }, { %r2107, %r2108, %r2109, %r2110 }; - mov.b32 %f924, %r2110; - mov.b32 %f923, %r2109; - mov.b32 %f922, %r2108; - mov.b32 %f921, %r2107; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2745, %r2746 }, { %r2121, %r2122, %r2123, %r2124 }; - mov.b32 %f928, %r2124; - mov.b32 %f927, %r2123; - mov.b32 %f926, %r2122; - mov.b32 %f925, %r2121; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2759, %r2760 }, { %r2135, %r2136, %r2137, %r2138 }; - mov.b32 %f932, %r2138; - mov.b32 %f931, %r2137; - mov.b32 %f930, %r2136; - mov.b32 %f929, %r2135; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2773, %r2774 }, { %r2149, %r2150, %r2151, %r2152 }; - mov.b32 %f936, %r2152; - mov.b32 %f935, %r2151; - mov.b32 %f934, %r2150; - mov.b32 %f933, %r2149; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2787, %r2788 }, { %r2163, %r2164, %r2165, %r2166 }; - mov.b32 %f940, %r2166; - mov.b32 %f939, %r2165; - mov.b32 %f938, %r2164; - mov.b32 %f937, %r2163; - add.s64 %rd95, %rd7, %rd112; - @%p102 ld.global.b32 { %r2849 }, [ %rd95 + 0 ]; - bar.sync 0; - st.shared.u32 [%r143], %r2849; - bar.sync 0; - ld.shared.f32 %f584, [%r144]; - ld.shared.f32 %f585, [%r145+32]; - sub.f32 %f587, %f259, %f584; - sub.f32 %f588, %f259, %f585; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3034, %r3035, %r3036, %r3037 }, [ %r2854 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3258, %r3259, %r3260, %r3261 }, [ %r2859 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3482, %r3483, %r3484, %r3485 }, [ %r2864 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3706, %r3707, %r3708, %r3709 }, [ %r2869 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2870, %r2871, %r2872, %r2873 }, [ %r2874 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2875, %r2876, %r2877, %r2878 }, [ %r2879 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2880, %r2881, %r2882, %r2883 }, [ %r2884 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2885, %r2886, %r2887, %r2888 }, [ %r2889 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2890, %r2891, %r2892, %r2893 }, [ %r2894 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2895, %r2896, %r2897, %r2898 }, [ %r2899 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2900, %r2901, %r2902, %r2903 }, [ %r2904 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2905, %r2906, %r2907, %r2908 }, [ %r2909 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2910, %r2911, %r2912, %r2913 }, [ %r2914 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2915, %r2916, %r2917, %r2918 }, [ %r2919 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2920, %r2921, %r2922, %r2923 }, [ %r2924 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2925, %r2926, %r2927, %r2928 }, [ %r2929 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2930, %r2931, %r2932, %r2933 }, [ %r2934 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2935, %r2936, %r2937, %r2938 }, [ %r2939 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2940, %r2941, %r2942, %r2943 }, [ %r2944 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2945, %r2946, %r2947, %r2948 }, [ %r2949 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2950, %r2951, %r2952, %r2953 }, [ %r2954 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2955, %r2956, %r2957, %r2958 }, [ %r2959 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2960, %r2961, %r2962, %r2963 }, [ %r2964 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2965, %r2966, %r2967, %r2968 }, [ %r2969 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2970, %r2971, %r2972, %r2973 }, [ %r2974 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2975, %r2976, %r2977, %r2978 }, [ %r2979 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2980, %r2981, %r2982, %r2983 }, [ %r2984 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2985, %r2986, %r2987, %r2988 }, [ %r2989 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2990, %r2991, %r2992, %r2993 }, [ %r2994 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2995, %r2996, %r2997, %r2998 }, [ %r2999 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3000, %r3001, %r3002, %r3003 }, [ %r3004 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3005, %r3006, %r3007, %r3008 }, [ %r3009 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3010, %r3011, %r3012, %r3013 }, [ %r3014 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3015, %r3016, %r3017, %r3018 }, [ %r3019 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3020, %r3021, %r3022, %r3023 }, [ %r3024 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3025, %r3026, %r3027, %r3028 }, [ %r3029 + 0 ]; - mov.b32 %r3465, %f587; - mov.b32 %r3467, %f588; - mov.u32 %r3254, %r3465; - mov.u32 %r3255, %r3465; - mov.u32 %r3256, %r3467; - mov.u32 %r3257, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2870, %r2871 }, { %r3254, %r3255, %r3256, %r3257 }; - mov.u32 %r3268, %r3465; - mov.u32 %r3269, %r3465; - mov.u32 %r3270, %r3467; - mov.u32 %r3271, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2872, %r2873 }, { %r3268, %r3269, %r3270, %r3271 }; - mov.u32 %r3282, %r3465; - mov.u32 %r3283, %r3465; - mov.u32 %r3284, %r3467; - mov.u32 %r3285, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2890, %r2891 }, { %r3282, %r3283, %r3284, %r3285 }; - mov.u32 %r3296, %r3465; - mov.u32 %r3297, %r3465; - mov.u32 %r3298, %r3467; - mov.u32 %r3299, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2892, %r2893 }, { %r3296, %r3297, %r3298, %r3299 }; - mov.u32 %r3310, %r3465; - mov.u32 %r3311, %r3465; - mov.u32 %r3312, %r3467; - mov.u32 %r3313, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2910, %r2911 }, { %r3310, %r3311, %r3312, %r3313 }; - mov.u32 %r3324, %r3465; - mov.u32 %r3325, %r3465; - mov.u32 %r3326, %r3467; - mov.u32 %r3327, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2912, %r2913 }, { %r3324, %r3325, %r3326, %r3327 }; - mov.u32 %r3338, %r3465; - mov.u32 %r3339, %r3465; - mov.u32 %r3340, %r3467; - mov.u32 %r3341, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2930, %r2931 }, { %r3338, %r3339, %r3340, %r3341 }; - mov.u32 %r3352, %r3465; - mov.u32 %r3353, %r3465; - mov.u32 %r3354, %r3467; - mov.u32 %r3355, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2932, %r2933 }, { %r3352, %r3353, %r3354, %r3355 }; - mov.u32 %r3366, %r3465; - mov.u32 %r3367, %r3465; - mov.u32 %r3368, %r3467; - mov.u32 %r3369, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2950, %r2951 }, { %r3366, %r3367, %r3368, %r3369 }; - mov.u32 %r3380, %r3465; - mov.u32 %r3381, %r3465; - mov.u32 %r3382, %r3467; - mov.u32 %r3383, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2952, %r2953 }, { %r3380, %r3381, %r3382, %r3383 }; - mov.u32 %r3394, %r3465; - mov.u32 %r3395, %r3465; - mov.u32 %r3396, %r3467; - mov.u32 %r3397, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2970, %r2971 }, { %r3394, %r3395, %r3396, %r3397 }; - mov.u32 %r3408, %r3465; - mov.u32 %r3409, %r3465; - mov.u32 %r3410, %r3467; - mov.u32 %r3411, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2972, %r2973 }, { %r3408, %r3409, %r3410, %r3411 }; - mov.u32 %r3422, %r3465; - mov.u32 %r3423, %r3465; - mov.u32 %r3424, %r3467; - mov.u32 %r3425, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2990, %r2991 }, { %r3422, %r3423, %r3424, %r3425 }; - mov.u32 %r3436, %r3465; - mov.u32 %r3437, %r3465; - mov.u32 %r3438, %r3467; - mov.u32 %r3439, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2992, %r2993 }, { %r3436, %r3437, %r3438, %r3439 }; - mov.u32 %r3450, %r3465; - mov.u32 %r3451, %r3465; - mov.u32 %r3452, %r3467; - mov.u32 %r3453, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3010, %r3011 }, { %r3450, %r3451, %r3452, %r3453 }; - mov.u32 %r3464, %r3465; - mov.u32 %r3466, %r3467; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3012, %r3013 }, { %r3464, %r3465, %r3466, %r3467 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2875, %r2876 }, { %r3254, %r3255, %r3256, %r3257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2877, %r2878 }, { %r3268, %r3269, %r3270, %r3271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2895, %r2896 }, { %r3282, %r3283, %r3284, %r3285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2897, %r2898 }, { %r3296, %r3297, %r3298, %r3299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2915, %r2916 }, { %r3310, %r3311, %r3312, %r3313 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2917, %r2918 }, { %r3324, %r3325, %r3326, %r3327 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2935, %r2936 }, { %r3338, %r3339, %r3340, %r3341 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2937, %r2938 }, { %r3352, %r3353, %r3354, %r3355 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2955, %r2956 }, { %r3366, %r3367, %r3368, %r3369 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2957, %r2958 }, { %r3380, %r3381, %r3382, %r3383 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2975, %r2976 }, { %r3394, %r3395, %r3396, %r3397 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2977, %r2978 }, { %r3408, %r3409, %r3410, %r3411 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2995, %r2996 }, { %r3422, %r3423, %r3424, %r3425 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2997, %r2998 }, { %r3436, %r3437, %r3438, %r3439 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3015, %r3016 }, { %r3450, %r3451, %r3452, %r3453 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3017, %r3018 }, { %r3464, %r3465, %r3466, %r3467 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2880, %r2881 }, { %r3254, %r3255, %r3256, %r3257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2882, %r2883 }, { %r3268, %r3269, %r3270, %r3271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2900, %r2901 }, { %r3282, %r3283, %r3284, %r3285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2902, %r2903 }, { %r3296, %r3297, %r3298, %r3299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2920, %r2921 }, { %r3310, %r3311, %r3312, %r3313 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2922, %r2923 }, { %r3324, %r3325, %r3326, %r3327 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2940, %r2941 }, { %r3338, %r3339, %r3340, %r3341 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2942, %r2943 }, { %r3352, %r3353, %r3354, %r3355 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2960, %r2961 }, { %r3366, %r3367, %r3368, %r3369 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2962, %r2963 }, { %r3380, %r3381, %r3382, %r3383 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2980, %r2981 }, { %r3394, %r3395, %r3396, %r3397 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2982, %r2983 }, { %r3408, %r3409, %r3410, %r3411 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3000, %r3001 }, { %r3422, %r3423, %r3424, %r3425 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3002, %r3003 }, { %r3436, %r3437, %r3438, %r3439 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3020, %r3021 }, { %r3450, %r3451, %r3452, %r3453 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3022, %r3023 }, { %r3464, %r3465, %r3466, %r3467 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2885, %r2886 }, { %r3254, %r3255, %r3256, %r3257 }; - mov.b32 %f589, %r3256; - mov.b32 %f590, %r3257; - mov.b32 %f591, %r3254; - mov.b32 %f592, %r3255; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2887, %r2888 }, { %r3268, %r3269, %r3270, %r3271 }; - mov.b32 %f593, %r3270; - mov.b32 %f594, %r3271; - mov.b32 %f595, %r3268; - mov.b32 %f596, %r3269; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2905, %r2906 }, { %r3282, %r3283, %r3284, %r3285 }; - mov.b32 %f597, %r3284; - mov.b32 %f598, %r3285; - mov.b32 %f599, %r3282; - mov.b32 %f600, %r3283; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2907, %r2908 }, { %r3296, %r3297, %r3298, %r3299 }; - mov.b32 %f601, %r3298; - mov.b32 %f602, %r3299; - mov.b32 %f603, %r3296; - mov.b32 %f604, %r3297; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2925, %r2926 }, { %r3310, %r3311, %r3312, %r3313 }; - mov.b32 %f605, %r3312; - mov.b32 %f606, %r3313; - mov.b32 %f607, %r3310; - mov.b32 %f608, %r3311; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2927, %r2928 }, { %r3324, %r3325, %r3326, %r3327 }; - mov.b32 %f609, %r3326; - mov.b32 %f610, %r3327; - mov.b32 %f611, %r3324; - mov.b32 %f612, %r3325; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2945, %r2946 }, { %r3338, %r3339, %r3340, %r3341 }; - mov.b32 %f613, %r3340; - mov.b32 %f614, %r3341; - mov.b32 %f615, %r3338; - mov.b32 %f616, %r3339; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2947, %r2948 }, { %r3352, %r3353, %r3354, %r3355 }; - mov.b32 %f617, %r3354; - mov.b32 %f618, %r3355; - mov.b32 %f619, %r3352; - mov.b32 %f620, %r3353; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2965, %r2966 }, { %r3366, %r3367, %r3368, %r3369 }; - mov.b32 %f621, %r3368; - mov.b32 %f622, %r3369; - mov.b32 %f623, %r3366; - mov.b32 %f624, %r3367; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2967, %r2968 }, { %r3380, %r3381, %r3382, %r3383 }; - mov.b32 %f625, %r3382; - mov.b32 %f626, %r3383; - mov.b32 %f627, %r3380; - mov.b32 %f628, %r3381; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2985, %r2986 }, { %r3394, %r3395, %r3396, %r3397 }; - mov.b32 %f629, %r3396; - mov.b32 %f630, %r3397; - mov.b32 %f631, %r3394; - mov.b32 %f632, %r3395; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2987, %r2988 }, { %r3408, %r3409, %r3410, %r3411 }; - mov.b32 %f633, %r3410; - mov.b32 %f634, %r3411; - mov.b32 %f635, %r3408; - mov.b32 %f636, %r3409; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3005, %r3006 }, { %r3422, %r3423, %r3424, %r3425 }; - mov.b32 %f637, %r3424; - mov.b32 %f638, %r3425; - mov.b32 %f639, %r3422; - mov.b32 %f640, %r3423; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3007, %r3008 }, { %r3436, %r3437, %r3438, %r3439 }; - mov.b32 %f641, %r3438; - mov.b32 %f642, %r3439; - mov.b32 %f643, %r3436; - mov.b32 %f644, %r3437; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3025, %r3026 }, { %r3450, %r3451, %r3452, %r3453 }; - mov.b32 %f645, %r3452; - mov.b32 %f646, %r3453; - mov.b32 %f647, %r3450; - mov.b32 %f648, %r3451; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3027, %r3028 }, { %r3464, %r3465, %r3466, %r3467 }; - mov.b32 %f649, %r3466; - mov.b32 %f650, %r3467; - mov.b32 %f651, %r3464; - mov.b32 %f652, %r3465; - mul.f32 %f653, %f262, %f592; - mul.f32 %f654, %f260, %f591; - mul.f32 %f655, %f266, %f590; - mul.f32 %f656, %f264, %f589; - mul.f32 %f657, %f270, %f596; - mul.f32 %f658, %f268, %f595; - mul.f32 %f659, %f274, %f594; - mul.f32 %f660, %f272, %f593; - mul.f32 %f661, %f278, %f600; - mul.f32 %f662, %f276, %f599; - mul.f32 %f663, %f282, %f598; - mul.f32 %f664, %f280, %f597; - mul.f32 %f665, %f286, %f604; - mul.f32 %f666, %f284, %f603; - mul.f32 %f667, %f290, %f602; - mul.f32 %f668, %f288, %f601; - mul.f32 %f669, %f294, %f608; - mul.f32 %f670, %f292, %f607; - mul.f32 %f671, %f298, %f606; - mul.f32 %f672, %f296, %f605; - mul.f32 %f673, %f302, %f612; - mul.f32 %f674, %f300, %f611; - mul.f32 %f675, %f306, %f610; - mul.f32 %f676, %f304, %f609; - mul.f32 %f677, %f310, %f616; - mul.f32 %f678, %f308, %f615; - mul.f32 %f679, %f314, %f614; - mul.f32 %f680, %f312, %f613; - mul.f32 %f681, %f318, %f620; - mul.f32 %f682, %f316, %f619; - mul.f32 %f683, %f322, %f618; - mul.f32 %f684, %f320, %f617; - mul.f32 %f685, %f326, %f624; - mul.f32 %f686, %f324, %f623; - mul.f32 %f687, %f330, %f622; - mul.f32 %f688, %f328, %f621; - mul.f32 %f689, %f334, %f628; - mul.f32 %f690, %f332, %f627; - mul.f32 %f691, %f338, %f626; - mul.f32 %f692, %f336, %f625; - mul.f32 %f693, %f342, %f632; - mul.f32 %f694, %f340, %f631; - mul.f32 %f695, %f346, %f630; - mul.f32 %f696, %f344, %f629; - mul.f32 %f697, %f350, %f636; - mul.f32 %f698, %f348, %f635; - mul.f32 %f699, %f354, %f634; - mul.f32 %f700, %f352, %f633; - mul.f32 %f701, %f358, %f640; - mul.f32 %f702, %f356, %f639; - mul.f32 %f703, %f362, %f638; - mul.f32 %f704, %f360, %f637; - mul.f32 %f705, %f366, %f644; - mul.f32 %f706, %f364, %f643; - mul.f32 %f707, %f370, %f642; - mul.f32 %f708, %f368, %f641; - mul.f32 %f709, %f374, %f648; - mul.f32 %f710, %f372, %f647; - mul.f32 %f711, %f378, %f646; - mul.f32 %f712, %f376, %f645; - mul.f32 %f713, %f382, %f652; - mul.f32 %f714, %f380, %f651; - mul.f32 %f715, %f386, %f650; - mul.f32 %f716, %f384, %f649; - mul.f32 %f717, %f654, %f195; - mul.f32 %f718, %f653, %f195; - mul.f32 %f719, %f656, %f195; - mul.f32 %f720, %f655, %f195; - mul.f32 %f721, %f658, %f195; - mul.f32 %f722, %f657, %f195; - mul.f32 %f723, %f660, %f195; - mul.f32 %f724, %f659, %f195; - mul.f32 %f725, %f662, %f195; - mul.f32 %f726, %f661, %f195; - mul.f32 %f727, %f664, %f195; - mul.f32 %f728, %f663, %f195; - mul.f32 %f729, %f666, %f195; - mul.f32 %f730, %f665, %f195; - mul.f32 %f731, %f668, %f195; - mul.f32 %f732, %f667, %f195; - mul.f32 %f733, %f670, %f195; - mul.f32 %f734, %f669, %f195; - mul.f32 %f735, %f672, %f195; - mul.f32 %f736, %f671, %f195; - mul.f32 %f737, %f674, %f195; - mul.f32 %f738, %f673, %f195; - mul.f32 %f739, %f676, %f195; - mul.f32 %f740, %f675, %f195; - mul.f32 %f741, %f678, %f195; - mul.f32 %f742, %f677, %f195; - mul.f32 %f743, %f680, %f195; - mul.f32 %f744, %f679, %f195; - mul.f32 %f745, %f682, %f195; - mul.f32 %f746, %f681, %f195; - mul.f32 %f747, %f684, %f195; - mul.f32 %f748, %f683, %f195; - mul.f32 %f749, %f686, %f195; - mul.f32 %f750, %f685, %f195; - mul.f32 %f751, %f688, %f195; - mul.f32 %f752, %f687, %f195; - mul.f32 %f753, %f690, %f195; - mul.f32 %f754, %f689, %f195; - mul.f32 %f755, %f692, %f195; - mul.f32 %f756, %f691, %f195; - mul.f32 %f757, %f694, %f195; - mul.f32 %f758, %f693, %f195; - mul.f32 %f759, %f696, %f195; - mul.f32 %f760, %f695, %f195; - mul.f32 %f761, %f698, %f195; - mul.f32 %f762, %f697, %f195; - mul.f32 %f763, %f700, %f195; - mul.f32 %f764, %f699, %f195; - mul.f32 %f765, %f702, %f195; - mul.f32 %f766, %f701, %f195; - mul.f32 %f767, %f704, %f195; - mul.f32 %f768, %f703, %f195; - mul.f32 %f769, %f706, %f195; - mul.f32 %f770, %f705, %f195; - mul.f32 %f771, %f708, %f195; - mul.f32 %f772, %f707, %f195; - mul.f32 %f773, %f710, %f195; - mul.f32 %f774, %f709, %f195; - mul.f32 %f775, %f712, %f195; - mul.f32 %f776, %f711, %f195; - mul.f32 %f777, %f714, %f195; - mul.f32 %f778, %f713, %f195; - mul.f32 %f779, %f716, %f195; - mul.f32 %f780, %f715, %f195; - cvt.rn.f16.f32 %h65, %f718; - cvt.rn.f16.f32 %h66, %f717; - cvt.rn.f16.f32 %h67, %f720; - cvt.rn.f16.f32 %h68, %f719; - cvt.rn.f16.f32 %h69, %f722; - cvt.rn.f16.f32 %h70, %f721; - cvt.rn.f16.f32 %h71, %f724; - cvt.rn.f16.f32 %h72, %f723; - cvt.rn.f16.f32 %h73, %f726; - cvt.rn.f16.f32 %h74, %f725; - cvt.rn.f16.f32 %h75, %f728; - cvt.rn.f16.f32 %h76, %f727; - cvt.rn.f16.f32 %h77, %f730; - cvt.rn.f16.f32 %h78, %f729; - cvt.rn.f16.f32 %h79, %f732; - cvt.rn.f16.f32 %h80, %f731; - cvt.rn.f16.f32 %h81, %f734; - cvt.rn.f16.f32 %h82, %f733; - cvt.rn.f16.f32 %h83, %f736; - cvt.rn.f16.f32 %h84, %f735; - cvt.rn.f16.f32 %h85, %f738; - cvt.rn.f16.f32 %h86, %f737; - cvt.rn.f16.f32 %h87, %f740; - cvt.rn.f16.f32 %h88, %f739; - cvt.rn.f16.f32 %h89, %f742; - cvt.rn.f16.f32 %h90, %f741; - cvt.rn.f16.f32 %h91, %f744; - cvt.rn.f16.f32 %h92, %f743; - cvt.rn.f16.f32 %h93, %f746; - cvt.rn.f16.f32 %h94, %f745; - cvt.rn.f16.f32 %h95, %f748; - cvt.rn.f16.f32 %h96, %f747; - cvt.rn.f16.f32 %h97, %f750; - cvt.rn.f16.f32 %h98, %f749; - cvt.rn.f16.f32 %h99, %f752; - cvt.rn.f16.f32 %h100, %f751; - cvt.rn.f16.f32 %h101, %f754; - cvt.rn.f16.f32 %h102, %f753; - cvt.rn.f16.f32 %h103, %f756; - cvt.rn.f16.f32 %h104, %f755; - cvt.rn.f16.f32 %h105, %f758; - cvt.rn.f16.f32 %h106, %f757; - cvt.rn.f16.f32 %h107, %f760; - cvt.rn.f16.f32 %h108, %f759; - cvt.rn.f16.f32 %h109, %f762; - cvt.rn.f16.f32 %h110, %f761; - cvt.rn.f16.f32 %h111, %f764; - cvt.rn.f16.f32 %h112, %f763; - cvt.rn.f16.f32 %h113, %f766; - cvt.rn.f16.f32 %h114, %f765; - cvt.rn.f16.f32 %h115, %f768; - cvt.rn.f16.f32 %h116, %f767; - cvt.rn.f16.f32 %h117, %f770; - cvt.rn.f16.f32 %h118, %f769; - cvt.rn.f16.f32 %h119, %f772; - cvt.rn.f16.f32 %h120, %f771; - cvt.rn.f16.f32 %h121, %f774; - cvt.rn.f16.f32 %h122, %f773; - cvt.rn.f16.f32 %h123, %f776; - cvt.rn.f16.f32 %h124, %f775; - cvt.rn.f16.f32 %h125, %f778; - cvt.rn.f16.f32 %h126, %f777; - cvt.rn.f16.f32 %h127, %f780; - cvt.rn.f16.f32 %h128, %f779; - bar.sync 0; - st.shared.v2.b16 [%r182], {%h66, %h65}; - st.shared.v2.b16 [%r183], {%h68, %h67}; - st.shared.v2.b16 [%r184], {%h70, %h69}; - st.shared.v2.b16 [%r185], {%h72, %h71}; - st.shared.v2.b16 [%r186], {%h74, %h73}; - st.shared.v2.b16 [%r187], {%h76, %h75}; - st.shared.v2.b16 [%r188], {%h78, %h77}; - st.shared.v2.b16 [%r189], {%h80, %h79}; - st.shared.v2.b16 [%r190], {%h82, %h81}; - st.shared.v2.b16 [%r191], {%h84, %h83}; - st.shared.v2.b16 [%r192], {%h86, %h85}; - st.shared.v2.b16 [%r193], {%h88, %h87}; - st.shared.v2.b16 [%r194], {%h90, %h89}; - st.shared.v2.b16 [%r195], {%h92, %h91}; - st.shared.v2.b16 [%r196], {%h94, %h93}; - st.shared.v2.b16 [%r197], {%h96, %h95}; - st.shared.v2.b16 [%r182+128], {%h98, %h97}; - st.shared.v2.b16 [%r183+128], {%h100, %h99}; - st.shared.v2.b16 [%r200], {%h102, %h101}; - st.shared.v2.b16 [%r201], {%h104, %h103}; - st.shared.v2.b16 [%r202], {%h106, %h105}; - st.shared.v2.b16 [%r203], {%h108, %h107}; - st.shared.v2.b16 [%r204], {%h110, %h109}; - st.shared.v2.b16 [%r205], {%h112, %h111}; - st.shared.v2.b16 [%r206], {%h114, %h113}; - st.shared.v2.b16 [%r207], {%h116, %h115}; - st.shared.v2.b16 [%r208], {%h118, %h117}; - st.shared.v2.b16 [%r209], {%h120, %h119}; - st.shared.v2.b16 [%r210], {%h122, %h121}; - st.shared.v2.b16 [%r211], {%h124, %h123}; - st.shared.v2.b16 [%r212], {%h126, %h125}; - st.shared.v2.b16 [%r213], {%h128, %h127}; - bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4090, %r4091, %r4092, %r4093 }, [ %r3930 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4202, %r4203, %r4204, %r4205 }, [ %r3935 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4314, %r4315, %r4316, %r4317 }, [ %r3940 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4426, %r4427, %r4428, %r4429 }, [ %r3945 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4538, %r4539, %r4540, %r4541 }, [ %r3950 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4650, %r4651, %r4652, %r4653 }, [ %r3955 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4762, %r4763, %r4764, %r4765 }, [ %r3960 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4874, %r4875, %r4876, %r4877 }, [ %r3965 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4146, %r4147, %r4148, %r4149 }, [ %r3970 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4258, %r4259, %r4260, %r4261 }, [ %r3975 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4370, %r4371, %r4372, %r4373 }, [ %r3980 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4482, %r4483, %r4484, %r4485 }, [ %r3985 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4594, %r4595, %r4596, %r4597 }, [ %r3990 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4706, %r4707, %r4708, %r4709 }, [ %r3995 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4818, %r4819, %r4820, %r4821 }, [ %r4000 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4930, %r4931, %r4932, %r4933 }, [ %r4005 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4094, %r4095, %r4108, %r4109 }, [ %r4010 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4206, %r4207, %r4220, %r4221 }, [ %r4015 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4318, %r4319, %r4332, %r4333 }, [ %r4020 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4430, %r4431, %r4444, %r4445 }, [ %r4025 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4542, %r4543, %r4556, %r4557 }, [ %r4030 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4654, %r4655, %r4668, %r4669 }, [ %r4035 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4766, %r4767, %r4780, %r4781 }, [ %r4040 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4878, %r4879, %r4892, %r4893 }, [ %r4045 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4122, %r4123, %r4136, %r4137 }, [ %r4050 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4234, %r4235, %r4248, %r4249 }, [ %r4055 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4346, %r4347, %r4360, %r4361 }, [ %r4060 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4458, %r4459, %r4472, %r4473 }, [ %r4065 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4570, %r4571, %r4584, %r4585 }, [ %r4070 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4682, %r4683, %r4696, %r4697 }, [ %r4075 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4794, %r4795, %r4808, %r4809 }, [ %r4080 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4906, %r4907, %r4920, %r4921 }, [ %r4085 + 0 ]; - mov.b32 %r4198, %f941; - mov.b32 %r4199, %f942; - mov.b32 %r4200, %f943; - mov.b32 %r4201, %f944; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4094, %r4095 }, { %r4198, %r4199, %r4200, %r4201 }; - mov.b32 %r4212, %f945; - mov.b32 %r4213, %f946; - mov.b32 %r4214, %f947; - mov.b32 %r4215, %f948; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4108, %r4109 }, { %r4212, %r4213, %r4214, %r4215 }; - mov.b32 %r4226, %f949; - mov.b32 %r4227, %f950; - mov.b32 %r4228, %f951; - mov.b32 %r4229, %f952; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4122, %r4123 }, { %r4226, %r4227, %r4228, %r4229 }; - mov.b32 %r4240, %f953; - mov.b32 %r4241, %f954; - mov.b32 %r4242, %f955; - mov.b32 %r4243, %f956; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4136, %r4137 }, { %r4240, %r4241, %r4242, %r4243 }; - mov.b32 %r4254, %f957; - mov.b32 %r4255, %f958; - mov.b32 %r4256, %f959; - mov.b32 %r4257, %f960; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4094, %r4095 }, { %r4254, %r4255, %r4256, %r4257 }; - mov.b32 %r4268, %f961; - mov.b32 %r4269, %f962; - mov.b32 %r4270, %f963; - mov.b32 %r4271, %f964; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4108, %r4109 }, { %r4268, %r4269, %r4270, %r4271 }; - mov.b32 %r4282, %f965; - mov.b32 %r4283, %f966; - mov.b32 %r4284, %f967; - mov.b32 %r4285, %f968; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4122, %r4123 }, { %r4282, %r4283, %r4284, %r4285 }; - mov.b32 %r4296, %f969; - mov.b32 %r4297, %f970; - mov.b32 %r4298, %f971; - mov.b32 %r4299, %f972; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4136, %r4137 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4206, %r4207 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4220, %r4221 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4234, %r4235 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4248, %r4249 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4206, %r4207 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4220, %r4221 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4234, %r4235 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4248, %r4249 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4318, %r4319 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4332, %r4333 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4346, %r4347 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4360, %r4361 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4318, %r4319 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4332, %r4333 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4346, %r4347 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4360, %r4361 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4430, %r4431 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4444, %r4445 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4458, %r4459 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4472, %r4473 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4430, %r4431 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4444, %r4445 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4458, %r4459 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4472, %r4473 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4542, %r4543 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4556, %r4557 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4570, %r4571 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4584, %r4585 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4542, %r4543 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4556, %r4557 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4570, %r4571 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4584, %r4585 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4654, %r4655 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4668, %r4669 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4682, %r4683 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4696, %r4697 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4654, %r4655 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4668, %r4669 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4682, %r4683 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4696, %r4697 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4766, %r4767 }, { %r4198, %r4199, %r4200, %r4201 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4780, %r4781 }, { %r4212, %r4213, %r4214, %r4215 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4794, %r4795 }, { %r4226, %r4227, %r4228, %r4229 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4808, %r4809 }, { %r4240, %r4241, %r4242, %r4243 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4766, %r4767 }, { %r4254, %r4255, %r4256, %r4257 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4780, %r4781 }, { %r4268, %r4269, %r4270, %r4271 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4794, %r4795 }, { %r4282, %r4283, %r4284, %r4285 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4808, %r4809 }, { %r4296, %r4297, %r4298, %r4299 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4878, %r4879 }, { %r4198, %r4199, %r4200, %r4201 }; - mov.b32 %f944, %r4201; - mov.b32 %f943, %r4200; - mov.b32 %f942, %r4199; - mov.b32 %f941, %r4198; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4892, %r4893 }, { %r4212, %r4213, %r4214, %r4215 }; - mov.b32 %f948, %r4215; - mov.b32 %f947, %r4214; - mov.b32 %f946, %r4213; - mov.b32 %f945, %r4212; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4906, %r4907 }, { %r4226, %r4227, %r4228, %r4229 }; - mov.b32 %f952, %r4229; - mov.b32 %f951, %r4228; - mov.b32 %f950, %r4227; - mov.b32 %f949, %r4226; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4920, %r4921 }, { %r4240, %r4241, %r4242, %r4243 }; - mov.b32 %f956, %r4243; - mov.b32 %f955, %r4242; - mov.b32 %f954, %r4241; - mov.b32 %f953, %r4240; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4878, %r4879 }, { %r4254, %r4255, %r4256, %r4257 }; - mov.b32 %f960, %r4257; - mov.b32 %f959, %r4256; - mov.b32 %f958, %r4255; - mov.b32 %f957, %r4254; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4892, %r4893 }, { %r4268, %r4269, %r4270, %r4271 }; - mov.b32 %f964, %r4271; - mov.b32 %f963, %r4270; - mov.b32 %f962, %r4269; - mov.b32 %f961, %r4268; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4906, %r4907 }, { %r4282, %r4283, %r4284, %r4285 }; - mov.b32 %f968, %r4285; - mov.b32 %f967, %r4284; - mov.b32 %f966, %r4283; - mov.b32 %f965, %r4282; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4920, %r4921 }, { %r4296, %r4297, %r4298, %r4299 }; - mov.b32 %f972, %r4299; - mov.b32 %f971, %r4298; - mov.b32 %f970, %r4297; - mov.b32 %f969, %r4296; - @%p102 ld.global.v4.b32 { %r4982, %r4983, %r4984, %r4985 }, [ %rd96 + 0 ]; - @%p102 ld.global.v4.b32 { %r4986, %r4987, %r4988, %r4989 }, [ %rd97 + 0 ]; - @%p102 ld.global.v4.b32 { %r4990, %r4991, %r4992, %r4993 }, [ %rd98 + 0 ]; - @%p102 ld.global.v4.b32 { %r4994, %r4995, %r4996, %r4997 }, [ %rd99 + 0 ]; - @%p102 ld.global.v4.b32 { %r4998, %r4999, %r5000, %r5001 }, [ %rd100 + 0 ]; - @%p102 ld.global.v4.b32 { %r5002, %r5003, %r5004, %r5005 }, [ %rd101 + 0 ]; - @%p102 ld.global.v4.b32 { %r5006, %r5007, %r5008, %r5009 }, [ %rd102 + 0 ]; - @%p102 ld.global.v4.b32 { %r5010, %r5011, %r5012, %r5013 }, [ %rd103 + 0 ]; - bar.sync 0; - st.shared.v4.u32 [%r246], {%r4982, %r4983, %r4984, %r4985}; - st.shared.v4.u32 [%r246+4352], {%r4986, %r4987, %r4988, %r4989}; - st.shared.v4.u32 [%r246+8704], {%r4990, %r4991, %r4992, %r4993}; - st.shared.v4.u32 [%r246+13056], {%r4994, %r4995, %r4996, %r4997}; - bar.sync 0; - ld.shared.v2.f32 {%f781, %f782}, [%r247]; - ld.shared.v2.f32 {%f783, %f784}, [%r248]; - ld.shared.v2.f32 {%f785, %f786}, [%r247+64]; - ld.shared.v2.f32 {%f787, %f788}, [%r248+64]; - ld.shared.v2.f32 {%f789, %f790}, [%r247+128]; - ld.shared.v2.f32 {%f791, %f792}, [%r248+128]; - ld.shared.v2.f32 {%f793, %f794}, [%r247+192]; - ld.shared.v2.f32 {%f795, %f796}, [%r248+192]; - bar.sync 0; - st.shared.v4.u32 [%r246], {%r4998, %r4999, %r5000, %r5001}; - st.shared.v4.u32 [%r246+4352], {%r5002, %r5003, %r5004, %r5005}; - st.shared.v4.u32 [%r246+8704], {%r5006, %r5007, %r5008, %r5009}; - st.shared.v4.u32 [%r246+13056], {%r5010, %r5011, %r5012, %r5013}; - bar.sync 0; - ld.shared.v2.f32 {%f797, %f798}, [%r247]; - ld.shared.v2.f32 {%f799, %f800}, [%r248]; - ld.shared.v2.f32 {%f801, %f802}, [%r247+64]; - ld.shared.v2.f32 {%f803, %f804}, [%r248+64]; - ld.shared.v2.f32 {%f805, %f806}, [%r247+128]; - ld.shared.v2.f32 {%f807, %f808}, [%r248+128]; - ld.shared.v2.f32 {%f809, %f810}, [%r247+192]; - ld.shared.v2.f32 {%f811, %f812}, [%r248+192]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5178, %r5179, %r5180, %r5181 }, [ %r5018 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5290, %r5291, %r5292, %r5293 }, [ %r5023 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5402, %r5403, %r5404, %r5405 }, [ %r5028 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5514, %r5515, %r5516, %r5517 }, [ %r5033 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5626, %r5627, %r5628, %r5629 }, [ %r5038 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5738, %r5739, %r5740, %r5741 }, [ %r5043 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5850, %r5851, %r5852, %r5853 }, [ %r5048 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5962, %r5963, %r5964, %r5965 }, [ %r5053 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5234, %r5235, %r5236, %r5237 }, [ %r5058 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5346, %r5347, %r5348, %r5349 }, [ %r5063 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5458, %r5459, %r5460, %r5461 }, [ %r5068 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5570, %r5571, %r5572, %r5573 }, [ %r5073 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5682, %r5683, %r5684, %r5685 }, [ %r5078 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5794, %r5795, %r5796, %r5797 }, [ %r5083 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5906, %r5907, %r5908, %r5909 }, [ %r5088 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r6018, %r6019, %r6020, %r6021 }, [ %r5093 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5182, %r5183, %r5196, %r5197 }, [ %r5098 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5294, %r5295, %r5308, %r5309 }, [ %r5103 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5406, %r5407, %r5420, %r5421 }, [ %r5108 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5518, %r5519, %r5532, %r5533 }, [ %r5113 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5630, %r5631, %r5644, %r5645 }, [ %r5118 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5742, %r5743, %r5756, %r5757 }, [ %r5123 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5854, %r5855, %r5868, %r5869 }, [ %r5128 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5966, %r5967, %r5980, %r5981 }, [ %r5133 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5210, %r5211, %r5224, %r5225 }, [ %r5138 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5322, %r5323, %r5336, %r5337 }, [ %r5143 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5434, %r5435, %r5448, %r5449 }, [ %r5148 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5546, %r5547, %r5560, %r5561 }, [ %r5153 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5658, %r5659, %r5672, %r5673 }, [ %r5158 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5770, %r5771, %r5784, %r5785 }, [ %r5163 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5882, %r5883, %r5896, %r5897 }, [ %r5168 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5994, %r5995, %r6008, %r6009 }, [ %r5173 + 0 ]; - mov.b32 %r5286, %f781; - mov.b32 %r5287, %f782; - mov.b32 %r5288, %f783; - mov.b32 %r5289, %f784; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5182, %r5183 }, { %r5286, %r5287, %r5288, %r5289 }; - mov.b32 %r5300, %f785; - mov.b32 %r5301, %f786; - mov.b32 %r5302, %f787; - mov.b32 %r5303, %f788; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5196, %r5197 }, { %r5300, %r5301, %r5302, %r5303 }; - mov.b32 %r5314, %f789; - mov.b32 %r5315, %f790; - mov.b32 %r5316, %f791; - mov.b32 %r5317, %f792; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5210, %r5211 }, { %r5314, %r5315, %r5316, %r5317 }; - mov.b32 %r5328, %f793; - mov.b32 %r5329, %f794; - mov.b32 %r5330, %f795; - mov.b32 %r5331, %f796; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5224, %r5225 }, { %r5328, %r5329, %r5330, %r5331 }; - mov.b32 %r5342, %f797; - mov.b32 %r5343, %f798; - mov.b32 %r5344, %f799; - mov.b32 %r5345, %f800; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5182, %r5183 }, { %r5342, %r5343, %r5344, %r5345 }; - mov.b32 %r5356, %f801; - mov.b32 %r5357, %f802; - mov.b32 %r5358, %f803; - mov.b32 %r5359, %f804; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5196, %r5197 }, { %r5356, %r5357, %r5358, %r5359 }; - mov.b32 %r5370, %f805; - mov.b32 %r5371, %f806; - mov.b32 %r5372, %f807; - mov.b32 %r5373, %f808; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5210, %r5211 }, { %r5370, %r5371, %r5372, %r5373 }; - mov.b32 %r5384, %f809; - mov.b32 %r5385, %f810; - mov.b32 %r5386, %f811; - mov.b32 %r5387, %f812; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5224, %r5225 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5294, %r5295 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5308, %r5309 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5322, %r5323 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5336, %r5337 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5294, %r5295 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5308, %r5309 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5322, %r5323 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5336, %r5337 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5406, %r5407 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5420, %r5421 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5434, %r5435 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5448, %r5449 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5406, %r5407 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5420, %r5421 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5434, %r5435 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5448, %r5449 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5518, %r5519 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5532, %r5533 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5546, %r5547 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5560, %r5561 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5518, %r5519 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5532, %r5533 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5546, %r5547 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5560, %r5561 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5630, %r5631 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5644, %r5645 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5658, %r5659 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5672, %r5673 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5630, %r5631 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5644, %r5645 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5658, %r5659 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5672, %r5673 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5742, %r5743 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5756, %r5757 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5770, %r5771 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5784, %r5785 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5742, %r5743 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5756, %r5757 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5770, %r5771 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5784, %r5785 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5854, %r5855 }, { %r5286, %r5287, %r5288, %r5289 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5868, %r5869 }, { %r5300, %r5301, %r5302, %r5303 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5882, %r5883 }, { %r5314, %r5315, %r5316, %r5317 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5896, %r5897 }, { %r5328, %r5329, %r5330, %r5331 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5854, %r5855 }, { %r5342, %r5343, %r5344, %r5345 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5868, %r5869 }, { %r5356, %r5357, %r5358, %r5359 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5882, %r5883 }, { %r5370, %r5371, %r5372, %r5373 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5896, %r5897 }, { %r5384, %r5385, %r5386, %r5387 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5966, %r5967 }, { %r5286, %r5287, %r5288, %r5289 }; - mov.b32 %f813, %r5289; - mov.b32 %f814, %r5288; - mov.b32 %f815, %r5287; - mov.b32 %f816, %r5286; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5980, %r5981 }, { %r5300, %r5301, %r5302, %r5303 }; - mov.b32 %f817, %r5303; - mov.b32 %f818, %r5302; - mov.b32 %f819, %r5301; - mov.b32 %f820, %r5300; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5994, %r5995 }, { %r5314, %r5315, %r5316, %r5317 }; - mov.b32 %f821, %r5317; - mov.b32 %f822, %r5316; - mov.b32 %f823, %r5315; - mov.b32 %f824, %r5314; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5962, %r5963, %r5964, %r5965 }, { %r6008, %r6009 }, { %r5328, %r5329, %r5330, %r5331 }; - mov.b32 %f825, %r5331; - mov.b32 %f826, %r5330; - mov.b32 %f827, %r5329; - mov.b32 %f828, %r5328; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5966, %r5967 }, { %r5342, %r5343, %r5344, %r5345 }; - mov.b32 %f829, %r5345; - mov.b32 %f830, %r5344; - mov.b32 %f831, %r5343; - mov.b32 %f832, %r5342; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5980, %r5981 }, { %r5356, %r5357, %r5358, %r5359 }; - mov.b32 %f833, %r5359; - mov.b32 %f834, %r5358; - mov.b32 %f835, %r5357; - mov.b32 %f836, %r5356; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5994, %r5995 }, { %r5370, %r5371, %r5372, %r5373 }; - mov.b32 %f837, %r5373; - mov.b32 %f838, %r5372; - mov.b32 %f839, %r5371; - mov.b32 %f840, %r5370; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r6018, %r6019, %r6020, %r6021 }, { %r6008, %r6009 }, { %r5384, %r5385, %r5386, %r5387 }; - mov.b32 %f841, %r5387; - mov.b32 %f842, %r5386; - mov.b32 %f843, %r5385; - mov.b32 %f844, %r5384; - bar.sync 0; - st.shared.v2.f32 [%r247], {%f816, %f815}; - st.shared.v2.f32 [%r248], {%f814, %f813}; - st.shared.v2.f32 [%r247+64], {%f820, %f819}; - st.shared.v2.f32 [%r248+64], {%f818, %f817}; - st.shared.v2.f32 [%r247+128], {%f824, %f823}; - st.shared.v2.f32 [%r248+128], {%f822, %f821}; - st.shared.v2.f32 [%r247+192], {%f828, %f827}; - st.shared.v2.f32 [%r248+192], {%f826, %f825}; - bar.sync 0; - ld.shared.v4.u32 {%r6070, %r6071, %r6072, %r6073}, [%r246]; - ld.shared.v4.u32 {%r6074, %r6075, %r6076, %r6077}, [%r246+4352]; - ld.shared.v4.u32 {%r6078, %r6079, %r6080, %r6081}, [%r246+8704]; - ld.shared.v4.u32 {%r6082, %r6083, %r6084, %r6085}, [%r246+13056]; - bar.sync 0; - st.shared.v2.f32 [%r247], {%f832, %f831}; - st.shared.v2.f32 [%r248], {%f830, %f829}; - st.shared.v2.f32 [%r247+64], {%f836, %f835}; - st.shared.v2.f32 [%r248+64], {%f834, %f833}; - st.shared.v2.f32 [%r247+128], {%f840, %f839}; - st.shared.v2.f32 [%r248+128], {%f838, %f837}; - st.shared.v2.f32 [%r247+192], {%f844, %f843}; - st.shared.v2.f32 [%r248+192], {%f842, %f841}; - bar.sync 0; - ld.shared.v4.u32 {%r6086, %r6087, %r6088, %r6089}, [%r246]; - ld.shared.v4.u32 {%r6090, %r6091, %r6092, %r6093}, [%r246+4352]; - ld.shared.v4.u32 {%r6094, %r6095, %r6096, %r6097}, [%r246+8704]; - ld.shared.v4.u32 {%r6098, %r6099, %r6100, %r6101}, [%r246+13056]; - @%p102 st.global.v4.b32 [ %rd96 + 0 ], { %r6070, %r6071, %r6072, %r6073 }; - @%p102 st.global.v4.b32 [ %rd97 + 0 ], { %r6074, %r6075, %r6076, %r6077 }; - @%p102 st.global.v4.b32 [ %rd98 + 0 ], { %r6078, %r6079, %r6080, %r6081 }; - @%p102 st.global.v4.b32 [ %rd99 + 0 ], { %r6082, %r6083, %r6084, %r6085 }; - @%p102 st.global.v4.b32 [ %rd100 + 0 ], { %r6086, %r6087, %r6088, %r6089 }; - @%p102 st.global.v4.b32 [ %rd101 + 0 ], { %r6090, %r6091, %r6092, %r6093 }; - @%p102 st.global.v4.b32 [ %rd102 + 0 ], { %r6094, %r6095, %r6096, %r6097 }; - @%p102 st.global.v4.b32 [ %rd103 + 0 ], { %r6098, %r6099, %r6100, %r6101 }; - add.s32 %r6176, %r6176, 128; - add.s64 %rd138, %rd138, %rd10; - add.s64 %rd137, %rd137, %rd10; - add.s64 %rd136, %rd136, %rd10; - add.s64 %rd135, %rd135, %rd10; - add.s64 %rd134, %rd134, %rd10; - add.s64 %rd133, %rd133, %rd10; - add.s64 %rd132, %rd132, %rd10; - add.s64 %rd131, %rd131, %rd10; - add.s64 %rd130, %rd130, %rd11; - setp.lt.s32 %p101, %r6176, %r22; - @%p101 bra LBB0_4; - bra.uni LBB0_5; -LBB0_6: - ret; - -} diff --git a/python/bwd.ttgir b/python/bwd.ttgir deleted file mode 100644 index 9d66d62f4..000000000 --- a/python/bwd.ttgir +++ /dev/null @@ -1,169 +0,0 @@ -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma0 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1]}> -#mma1 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2]}> -#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1]}> -module attributes {"triton_gpu.num-warps" = 8 : i32} { - func public @_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32 {tt.divisibility = 16 : i32}, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32, %arg22: i32 {tt.divisibility = 16 : i32}, %arg23: i32 {tt.divisibility = 16 : i32}, %arg24: i32) { - %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma0> - %cst_1 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #mma0> - %cst_2 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128_i32 = arith.constant 128 : i32 - %c128 = arith.constant 128 : index - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = arith.divsi %0, %arg22 : i32 - %2 = arith.remsi %0, %arg22 : i32 - %3 = arith.muli %1, %arg12 : i32 - %4 = arith.muli %2, %arg13 : i32 - %5 = arith.addi %3, %4 : i32 - %6 = tt.addptr %arg0, %5 : !tt.ptr, i32 - %7 = tt.addptr %arg1, %5 : !tt.ptr, i32 - %8 = tt.addptr %arg2, %5 : !tt.ptr, i32 - %9 = tt.addptr %arg5, %5 : !tt.ptr, i32 - %10 = tt.addptr %arg6, %5 : !tt.ptr, i32 - %11 = tt.addptr %arg7, %5 : !tt.ptr, i32 - %12 = tt.addptr %arg8, %5 : !tt.ptr, i32 - %13 = arith.index_cast %arg24 : i32 to index - %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %19 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> - %20 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked2> - %21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %22 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %23 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %25 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2> - %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %27 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %28 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1> - %29 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %31 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked2> - %33 = arith.muli %0, %arg23 : i32 - %34 = tt.addptr %arg11, %33 : !tt.ptr, i32 - %35 = tt.addptr %arg10, %33 : !tt.ptr, i32 - %36 = arith.muli %arg24, %c128_i32 : i32 - %37 = arith.index_cast %36 : i32 to index - %38 = tt.splat %35 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %39 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma0> - %40 = tt.splat %34 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %41 = arith.muli %arg14, %c128_i32 : i32 - %42 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked1> - %43 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked2> - %44 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %45 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - scf.for %arg25 = %c0 to %13 step %c1 { - %46 = arith.index_cast %arg25 : index to i32 - %47 = arith.muli %46, %c128_i32 : i32 - %48 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %49 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %50 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %51 = arith.addi %48, %15 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %52 = arith.addi %50, %17 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %53 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %54 = tt.expand_dims %52 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<128x1xi32, #blocked2> - %55 = arith.muli %53, %28 : tensor<128x1xi32, #blocked1> - %56 = tt.broadcast %55 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %57 = arith.addi %56, %24 : tensor<128x64xi32, #blocked1> - %58 = tt.addptr %29, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %59 = tt.load %58 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %60 = arith.muli %53, %19 : tensor<128x1xi32, #blocked1> - %61 = tt.broadcast %60 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %62 = arith.addi %61, %24 : tensor<128x64xi32, #blocked1> - %63 = tt.addptr %30, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %64 = tt.load %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %65 = arith.index_cast %47 : i32 to index - %66 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %67 = tt.trans %66 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %68 = arith.addi %49, %16 : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %69 = tt.expand_dims %68 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>>) -> tensor<1x128xi32, #mma0> - %70 = tt.broadcast %69 : (tensor<1x128xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %71 = triton_gpu.convert_layout %64 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %72 = tt.trans %71 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %73 = arith.muli %54, %20 : tensor<128x1xi32, #blocked2> - %74 = tt.broadcast %73 : (tensor<128x1xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %75 = arith.addi %74, %26 : tensor<128x64xi32, #blocked2> - %76 = tt.addptr %32, %75 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %77 = tt.addptr %27, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %78 = tt.addptr %31, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %79:5 = scf.for %arg26 = %65 to %37 step %c128 iter_args(%arg27 = %cst, %arg28 = %cst, %arg29 = %76, %arg30 = %77, %arg31 = %78) -> (tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) { - %86 = arith.index_cast %arg26 : index to i32 - %87 = tt.splat %86 : (i32) -> tensor<128xi32, #blocked0> - %88 = tt.splat %86 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %89 = arith.addi %87, %14 : tensor<128xi32, #blocked0> - %90 = tt.load %arg30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %91 = triton_gpu.convert_layout %90 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %92 = triton_gpu.convert_layout %91 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %93 = triton_gpu.convert_layout %67 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %94 = tt.dot %92, %93, %cst_0 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %95 = arith.addi %88, %18 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %96 = tt.expand_dims %95 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xi32, #mma0> - %97 = tt.broadcast %96 : (tensor<128x1xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %98 = "triton_gpu.cmpi"(%97, %70) {predicate = 5 : i64} : (tensor<128x128xi32, #mma0>, tensor<128x128xi32, #mma0>) -> tensor<128x128xi1, #mma0> - %99 = "triton_gpu.select"(%98, %94, %cst_1) : (tensor<128x128xi1, #mma0>, tensor<128x128xf32, #mma0>, tensor<128x128xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %100 = tt.addptr %38, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %101 = tt.load %100 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %102 = arith.mulf %99, %39 : tensor<128x128xf32, #mma0> - %103 = triton_gpu.convert_layout %101 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %104 = tt.expand_dims %103 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %105 = tt.broadcast %104 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %106 = arith.subf %102, %105 : tensor<128x128xf32, #mma0> - %107 = math.exp %106 : tensor<128x128xf32, #mma0> - %108 = tt.load %arg31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %109 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %110 = arith.truncf %107 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %111 = triton_gpu.convert_layout %110 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %112 = tt.trans %111 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %113 = triton_gpu.convert_layout %112 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %114 = triton_gpu.convert_layout %109 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %115 = tt.dot %113, %114, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %116 = tt.addptr %40, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %117 = tt.load %116 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %118 = triton_gpu.convert_layout %117 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %119 = tt.expand_dims %118 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %120 = tt.broadcast %119 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %121 = arith.subf %cst_0, %120 : tensor<128x128xf32, #mma0> - %122 = triton_gpu.convert_layout %109 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %123 = triton_gpu.convert_layout %72 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %124 = tt.dot %122, %123, %121 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %125 = arith.mulf %107, %124 : tensor<128x128xf32, #mma0> - %126 = arith.mulf %125, %39 : tensor<128x128xf32, #mma0> - %127 = arith.truncf %126 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %128 = triton_gpu.convert_layout %127 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %129 = tt.trans %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %130 = triton_gpu.convert_layout %129 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %131 = triton_gpu.convert_layout %91 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %132 = tt.dot %130, %131, %arg28 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %133 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf32, #blocked2> - %134 = triton_gpu.convert_layout %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %135 = triton_gpu.convert_layout %66 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %136 = tt.dot %134, %135, %cst_2 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %137 = triton_gpu.convert_layout %136 : (tensor<128x64xf32, #mma1>) -> tensor<128x64xf32, #blocked2> - %138 = arith.addf %137, %133 : tensor<128x64xf32, #blocked2> - tt.store %arg29, %138 : tensor<128x64xf32, #blocked2> - %139 = tt.addptr %arg29, %43 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %140 = tt.addptr %arg30, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %141 = tt.addptr %arg31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %115, %132, %139, %140, %141 : tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1> - } - %80 = tt.addptr %44, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %81 = arith.truncf %79#0 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %82 = triton_gpu.convert_layout %81 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %80, %82 : tensor<128x64xf16, #blocked1> - %83 = tt.addptr %45, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %84 = arith.truncf %79#1 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %85 = triton_gpu.convert_layout %84 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %83, %85 : tensor<128x64xf16, #blocked1> - } - return - } -} \ No newline at end of file diff --git a/python/flash-attention.ttgir b/python/flash-attention.ttgir deleted file mode 100644 index 6ff6b8da0..000000000 --- a/python/flash-attention.ttgir +++ /dev/null @@ -1,159 +0,0 @@ -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 4], warpsPerCTA = [1, 4], order = [0, 1]}> -#mma = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1]}> -#mma_s1 = #triton_gpu.slice<{dim = 1, parent = #mma}> -#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1]}> -module attributes {"triton_gpu.num-warps" = 4 : i32} { - func public @_fwd_kernel_0d1d2d34d5d6d7d8d9d10c11d12d13d14c15d16d17d18c19d20d21d22c2324d25d(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}, %arg9: i32 {tt.divisibility = 16 : i32}, %arg10: i32 {tt.divisibility = 16 : i32}, %arg11: i32 {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32 {tt.divisibility = 16 : i32}) { - %c0_i32 = arith.constant 0 : i32 - %cst = arith.constant dense<1.000000e+00> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma> - %cst_1 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #mma> - %cst_2 = arith.constant dense<0xFF800000> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma> - %cst_4 = arith.constant dense<0.000000e+00> : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %c1_i32 = arith.constant 1 : i32 - %c0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c128_i32 = arith.constant 128 : i32 - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = tt.get_program_id {axis = 1 : i32} : i32 - %2 = arith.muli %0, %c128_i32 : i32 - %3 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %4 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %5 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %6 = tt.splat %2 : (i32) -> tensor<128xi32, #blocked0> - %7 = tt.splat %2 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %8 = tt.splat %2 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %9 = arith.addi %6, %3 : tensor<128xi32, #blocked0> - %10 = arith.muli %1, %arg8 : i32 - %11 = arith.addi %7, %4 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %12 = arith.addi %8, %5 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %13 = tt.splat %arg9 : (i32) -> tensor<128x1xi32, #blocked1> - %14 = tt.splat %10 : (i32) -> tensor<128x1xi32, #blocked1> - %15 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %16 = tt.expand_dims %15 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %17 = tt.broadcast %16 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %19 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma}>> - %20 = tt.expand_dims %19 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma}>>) -> tensor<1x128xi32, #mma> - %21 = tt.splat %arg12 : (i32) -> tensor<1x128xi32, #blocked2> - %22 = tt.splat %10 : (i32) -> tensor<1x128xi32, #blocked2> - %23 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %24 = tt.expand_dims %23 {axis = 1 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<64x1xi32, #blocked2> - %25 = tt.expand_dims %18 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x128xi32, #blocked2> - %26 = arith.muli %25, %21 : tensor<1x128xi32, #blocked2> - %27 = arith.addi %22, %26 : tensor<1x128xi32, #blocked2> - %28 = tt.broadcast %27 : (tensor<1x128xi32, #blocked2>) -> tensor<64x128xi32, #blocked2> - %29 = tt.splat %arg0 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %arg1 : (!tt.ptr) -> tensor<64x128x!tt.ptr, #blocked2> - %31 = tt.splat %arg2 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.expand_dims %11 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %33 = arith.muli %32, %13 : tensor<128x1xi32, #blocked1> - %34 = arith.addi %14, %33 : tensor<128x1xi32, #blocked1> - %35 = tt.broadcast %34 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %36 = arith.addi %35, %17 : tensor<128x64xi32, #blocked1> - %37 = tt.addptr %29, %36 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %38 = arith.addi %0, %c1_i32 : i32 - %39 = arith.muli %38, %c128_i32 : i32 - %40 = arith.index_cast %39 : i32 to index - %41 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma> - %42 = tt.expand_dims %12 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128x1xi32, #mma> - %43 = tt.broadcast %42 : (tensor<128x1xi32, #mma>) -> tensor<128x128xi32, #mma> - %44 = arith.muli %arg12, %c128_i32 : i32 - %45 = tt.splat %44 : (i32) -> tensor<64x128xi32, #blocked2> - %46 = arith.muli %arg15, %c128_i32 : i32 - %47 = tt.splat %46 : (i32) -> tensor<128x64xi32, #blocked1> - %48 = tt.broadcast %24 : (tensor<64x1xi32, #blocked2>) -> tensor<64x128xi32, #blocked2> - %49 = arith.addi %28, %48 : tensor<64x128xi32, #blocked2> - %50 = tt.addptr %30, %49 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> - %51 = tt.expand_dims %4 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %52 = arith.muli %51, %13 : tensor<128x1xi32, #blocked1> - %53 = arith.addi %14, %52 : tensor<128x1xi32, #blocked1> - %54 = tt.broadcast %53 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %55 = arith.addi %54, %17 : tensor<128x64xi32, #blocked1> - %56 = tt.addptr %31, %55 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %79 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared0> - - // TODO: Load should be transformed into `insert_slice_async + extract_slice` at the very end of the optimization pass so it benefits from LICM - %80 = triton_gpu.insert_slice_async %37, %79, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared0> - triton_gpu.async_wait {num = 0 : i32} - %81 = tensor.extract_slice %80[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared0> to tensor<128x64xf16, #shared0> - %82 = triton_gpu.convert_layout %81 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> - - %57:5 = scf.for %arg22 = %c0 to %40 step %c128 iter_args(%arg23 = %cst_4, %arg24 = %cst_3, %arg25 = %cst_2, %arg26 = %50, %arg27 = %56) -> (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<128x64xf32, #mma>, tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>) { - %78 = arith.index_cast %arg22 : index to i32 - %83 = triton_gpu.alloc_tensor : tensor<1x64x128xf16, #shared1> - %84 = triton_gpu.insert_slice_async %arg26, %83, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<64x128x!tt.ptr, #blocked2> -> tensor<1x64x128xf16, #shared1> - triton_gpu.async_wait {num = 0 : i32} - %85 = tensor.extract_slice %84[0, 0, 0] [1, 64, 128] [1, 1, 1] : tensor<1x64x128xf16, #shared1> to tensor<64x128xf16, #shared1> - %86 = triton_gpu.convert_layout %85 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma}>> - %87 = tt.dot %82, %86, %cst_0 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma}>> -> tensor<128x128xf32, #mma> - %88 = tt.splat %78 : (i32) -> tensor<1x128xi32, #mma> - %89 = arith.addi %88, %20 : tensor<1x128xi32, #mma> - %90 = tt.broadcast %89 : (tensor<1x128xi32, #mma>) -> tensor<128x128xi32, #mma> - %91 = arith.mulf %87, %41 : tensor<128x128xf32, #mma> - %92 = "triton_gpu.cmpi"(%43, %90) {predicate = 5 : i64} : (tensor<128x128xi32, #mma>, tensor<128x128xi32, #mma>) -> tensor<128x128xi1, #mma> - %93 = "triton_gpu.select"(%92, %91, %cst_1) : (tensor<128x128xi1, #mma>, tensor<128x128xf32, #mma>, tensor<128x128xf32, #mma>) -> tensor<128x128xf32, #mma> - %94 = tt.reduce %93 {axis = 1 : i32, redOp = 12 : i32} : tensor<128x128xf32, #mma> -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %95 = "triton_gpu.cmpf"(%94, %arg25) {predicate = 2 : i64} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xi1, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %96 = "triton_gpu.select"(%95, %94, %arg25) : (tensor<128xi1, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %97 = tt.expand_dims %96 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128x1xf32, #mma> - %98 = tt.broadcast %97 : (tensor<128x1xf32, #mma>) -> tensor<128x128xf32, #mma> - %99 = arith.subf %93, %98 : tensor<128x128xf32, #mma> - %100 = math.exp %99 : tensor<128x128xf32, #mma> - %101 = tt.reduce %100 {axis = 1 : i32, redOp = 2 : i32} : tensor<128x128xf32, #mma> -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %102 = arith.subf %arg25, %96 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %103 = math.exp %102 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %104 = arith.mulf %arg23, %103 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %105 = arith.addf %101, %104 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %106 = arith.divf %cst, %105 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %107 = arith.mulf %104, %106 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>> - %108 = tt.expand_dims %107 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128x1xf32, #mma> - %109 = tt.broadcast %108 : (tensor<128x1xf32, #mma>) -> tensor<128x64xf32, #mma> - %110 = arith.mulf %arg24, %109 : tensor<128x64xf32, #mma> - %111 = tt.expand_dims %106 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>) -> tensor<128x1xf32, #mma> - %112 = tt.broadcast %111 : (tensor<128x1xf32, #mma>) -> tensor<128x128xf32, #mma> - %113 = arith.mulf %100, %112 : tensor<128x128xf32, #mma> - %114 = arith.truncf %113 : tensor<128x128xf32, #mma> to tensor<128x128xf16, #mma> - %115 = triton_gpu.convert_layout %114 : (tensor<128x128xf16, #mma>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> - %116 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared0> - %117 = triton_gpu.insert_slice_async %arg27, %116, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared0> - triton_gpu.async_wait {num = 0 : i32} - %118 = tensor.extract_slice %117[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared0> to tensor<128x64xf16, #shared0> - %119 = triton_gpu.convert_layout %118 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma}>> - %120 = tt.dot %115, %119, %110 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma}>> -> tensor<128x64xf32, #mma> - %121 = tt.addptr %arg26, %45 : tensor<64x128x!tt.ptr, #blocked2>, tensor<64x128xi32, #blocked2> - %122 = tt.addptr %arg27, %47 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %105, %120, %96, %121, %122 : tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<128x64xf32, #mma>, tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma}>>, tensor<64x128x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1> - } - %203 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #mma_s1> - %206 = tt.splat %2 : (i32) -> tensor<128xi32, #mma_s1> - %209 = arith.addi %206, %203 : tensor<128xi32, #mma_s1> - %61 = arith.muli %1, %arg21 : i32 - %62 = tt.addptr %arg4, %61 : !tt.ptr, i32 - %63 = tt.splat %62 : (!tt.ptr) -> tensor<128x!tt.ptr, #mma_s1> - %64 = tt.addptr %63, %209 : tensor<128x!tt.ptr, #mma_s1>, tensor<128xi32, #mma_s1> - %65 = tt.addptr %arg5, %61 : !tt.ptr, i32 - %66 = tt.splat %65 : (!tt.ptr) -> tensor<128x!tt.ptr, #mma_s1> - %67 = tt.addptr %66, %209 : tensor<128x!tt.ptr, #mma_s1>, tensor<128xi32, #mma_s1> - tt.store %64, %57#0 : tensor<128xf32, #mma_s1> - tt.store %67, %57#2 : tensor<128xf32, #mma_s1> - %68 = arith.muli %1, %arg17 : i32 - %69 = tt.splat %arg18 : (i32) -> tensor<128x1xi32, #blocked1> - %70 = tt.splat %68 : (i32) -> tensor<128x1xi32, #blocked1> - %71 = tt.splat %arg6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %72 = arith.muli %32, %69 : tensor<128x1xi32, #blocked1> - %73 = arith.addi %70, %72 : tensor<128x1xi32, #blocked1> - %74 = tt.broadcast %73 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %75 = arith.addi %74, %17 : tensor<128x64xi32, #blocked1> - %76 = tt.addptr %71, %75 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %77 = arith.truncf %57#1 : tensor<128x64xf32, #mma> to tensor<128x64xf16, #mma> - // TODO: conversion should be here, not right after the loop - %78 = triton_gpu.convert_layout %77 : (tensor<128x64xf16, #mma>) -> tensor<128x64xf16, #blocked1> - tt.store %76, %78 : tensor<128x64xf16, #blocked1> - return - } -} diff --git a/python/slow.ttgir b/python/slow.ttgir deleted file mode 100644 index 3a26af9a6..000000000 --- a/python/slow.ttgir +++ /dev/null @@ -1,168 +0,0 @@ -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma0 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1]}> -#mma1 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2]}> -#shared0 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> -#shared1 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [0, 1]}> -module attributes {"triton_gpu.num-warps" = 8 : i32} { - func public @_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32 {tt.divisibility = 16 : i32}, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32, %arg22: i32 {tt.divisibility = 16 : i32}, %arg23: i32 {tt.divisibility = 16 : i32}, %arg24: i32) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128_i32 = arith.constant 128 : i32 - %c128 = arith.constant 128 : index - %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %cst_0 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #mma0> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma0> - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = arith.divsi %0, %arg22 : i32 - %2 = arith.remsi %0, %arg22 : i32 - %3 = arith.muli %1, %arg12 : i32 - %4 = arith.muli %2, %arg13 : i32 - %5 = arith.addi %3, %4 : i32 - %6 = tt.addptr %arg0, %5 : !tt.ptr, i32 - %7 = tt.addptr %arg1, %5 : !tt.ptr, i32 - %8 = tt.addptr %arg2, %5 : !tt.ptr, i32 - %9 = tt.addptr %arg5, %5 : !tt.ptr, i32 - %10 = tt.addptr %arg6, %5 : !tt.ptr, i32 - %11 = tt.addptr %arg7, %5 : !tt.ptr, i32 - %12 = tt.addptr %arg8, %5 : !tt.ptr, i32 - %13 = arith.index_cast %arg24 : i32 to index - %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %19 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> - %20 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked2> - %21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %22 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %23 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %25 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2> - %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %27 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %28 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1> - %29 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %31 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked2> - %33 = arith.muli %0, %arg23 : i32 - %34 = tt.addptr %arg11, %33 : !tt.ptr, i32 - %35 = tt.addptr %arg10, %33 : !tt.ptr, i32 - %36 = arith.muli %arg24, %c128_i32 : i32 - %37 = arith.index_cast %36 : i32 to index - %38 = tt.splat %35 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %39 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma0> - %40 = tt.splat %34 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %41 = arith.muli %arg14, %c128_i32 : i32 - %42 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked1> - %43 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked2> - %44 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %45 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - scf.for %arg25 = %c0 to %13 step %c1 { - %46 = arith.index_cast %arg25 : index to i32 - %47 = arith.muli %46, %c128_i32 : i32 - %48 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %49 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %50 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %51 = arith.addi %48, %15 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %52 = arith.addi %49, %16 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %53 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %54 = tt.expand_dims %52 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<128x1xi32, #blocked2> - %55 = arith.muli %53, %28 : tensor<128x1xi32, #blocked1> - %56 = tt.broadcast %55 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %57 = arith.addi %56, %24 : tensor<128x64xi32, #blocked1> - %58 = tt.addptr %29, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %59 = tt.load %58 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %60 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %61 = tt.trans %60 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %62 = arith.muli %53, %19 : tensor<128x1xi32, #blocked1> - %63 = tt.broadcast %62 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %64 = arith.addi %63, %24 : tensor<128x64xi32, #blocked1> - %65 = tt.addptr %30, %64 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %66 = tt.load %65 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %67 = triton_gpu.convert_layout %66 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %68 = tt.trans %67 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %69 = arith.index_cast %47 : i32 to index - %70 = arith.addi %50, %17 : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %71 = tt.expand_dims %70 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>>) -> tensor<1x128xi32, #mma0> - %72 = tt.broadcast %71 : (tensor<1x128xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %73 = arith.muli %54, %20 : tensor<128x1xi32, #blocked2> - %74 = tt.broadcast %73 : (tensor<128x1xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %75 = arith.addi %74, %26 : tensor<128x64xi32, #blocked2> - %76 = tt.addptr %32, %75 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %77 = tt.addptr %27, %64 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %78 = tt.addptr %31, %64 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %79:5 = scf.for %arg26 = %69 to %37 step %c128 iter_args(%arg27 = %cst, %arg28 = %cst, %arg29 = %76, %arg30 = %77, %arg31 = %78) -> (tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) { - %86 = arith.index_cast %arg26 : index to i32 - %87 = tt.splat %86 : (i32) -> tensor<128xi32, #blocked0> - %88 = tt.splat %86 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %89 = arith.addi %87, %14 : tensor<128xi32, #blocked0> - %90 = tt.load %arg30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %91 = triton_gpu.convert_layout %90 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %92 = triton_gpu.convert_layout %91 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %93 = triton_gpu.convert_layout %61 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %94 = tt.dot %92, %93, %cst_1 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %95 = arith.addi %88, %18 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %96 = tt.expand_dims %95 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xi32, #mma0> - %97 = tt.broadcast %96 : (tensor<128x1xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %98 = "triton_gpu.cmpi"(%97, %72) {predicate = 5 : i64} : (tensor<128x128xi32, #mma0>, tensor<128x128xi32, #mma0>) -> tensor<128x128xi1, #mma0> - %99 = "triton_gpu.select"(%98, %94, %cst_0) : (tensor<128x128xi1, #mma0>, tensor<128x128xf32, #mma0>, tensor<128x128xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %100 = tt.addptr %38, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %101 = tt.load %100 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %102 = arith.mulf %99, %39 : tensor<128x128xf32, #mma0> - %103 = triton_gpu.convert_layout %101 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %104 = tt.expand_dims %103 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %105 = tt.broadcast %104 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %106 = arith.subf %102, %105 : tensor<128x128xf32, #mma0> - %107 = math.exp %106 : tensor<128x128xf32, #mma0> - %108 = tt.load %arg31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %109 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %110 = arith.truncf %107 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %111 = triton_gpu.convert_layout %110 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %112 = tt.trans %111 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %113 = triton_gpu.convert_layout %112 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %114 = triton_gpu.convert_layout %109 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %115 = tt.dot %113, %114, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %116 = tt.addptr %40, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %117 = tt.load %116 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %118 = triton_gpu.convert_layout %117 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %119 = tt.expand_dims %118 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %120 = tt.broadcast %119 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %121 = arith.subf %cst_1, %120 : tensor<128x128xf32, #mma0> - %122 = triton_gpu.convert_layout %109 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %123 = triton_gpu.convert_layout %68 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %124 = tt.dot %122, %123, %121 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %125 = arith.mulf %107, %124 : tensor<128x128xf32, #mma0> - %126 = arith.mulf %125, %39 : tensor<128x128xf32, #mma0> - %127 = arith.truncf %126 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %128 = triton_gpu.convert_layout %127 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %129 = tt.trans %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %130 = triton_gpu.convert_layout %129 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %131 = triton_gpu.convert_layout %91 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %132 = tt.dot %130, %131, %arg28 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %133 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf32, #blocked2> - %134 = triton_gpu.convert_layout %133 : (tensor<128x64xf32, #blocked2>) -> tensor<128x64xf32, #mma1> - %135 = triton_gpu.convert_layout %128 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %136 = triton_gpu.convert_layout %60 : (tensor<128x64xf16, #shared0>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %137 = tt.dot %135, %136, %134 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %138 = triton_gpu.convert_layout %137 : (tensor<128x64xf32, #mma1>) -> tensor<128x64xf32, #blocked2> - tt.store %arg29, %133 : tensor<128x64xf32, #blocked2> - %139 = tt.addptr %arg29, %43 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %140 = tt.addptr %arg30, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %141 = tt.addptr %arg31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %115, %132, %139, %140, %141 : tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1> - } - %80 = arith.truncf %79#0 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %81 = tt.addptr %44, %64 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %82 = triton_gpu.convert_layout %80 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %81, %82 : tensor<128x64xf16, #blocked1> - %83 = arith.truncf %79#1 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %84 = tt.addptr %45, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %85 = triton_gpu.convert_layout %83 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %84, %85 : tensor<128x64xf16, #blocked1> - } - return - } -} \ No newline at end of file diff --git a/python/unoptimized.ttgir b/python/unoptimized.ttgir deleted file mode 100644 index 4b1361378..000000000 --- a/python/unoptimized.ttgir +++ /dev/null @@ -1,178 +0,0 @@ -// TODO: swizzle -// TODO: move opIdx = 0 before opIdx = 1 -// TODO: reuse %128 in %137 = triton_gpu.convert_layout %127 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> -// TODO: don't convert loaded value to mma for accumulation -// triton-opt unoptimized.ttgir -tritongpu-sink-conversions-from-shared -tritongpu-decompose-conversions-to-dot-operand -cse - -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [8], order = [0]}> -#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> -#mma0 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1]}> -#mma1 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2]}> -#shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> -#shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> -#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> -module attributes {"triton_gpu.num-warps" = 8 : i32} { - func public @_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32 {tt.divisibility = 16 : i32}, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32, %arg22: i32 {tt.divisibility = 16 : i32}, %arg23: i32 {tt.divisibility = 16 : i32}, %arg24: i32) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c128_i32 = arith.constant 128 : i32 - %c128 = arith.constant 128 : index - %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma1> - %cst_0 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #mma0> - %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma0> - %0 = tt.get_program_id {axis = 0 : i32} : i32 - %1 = arith.divsi %0, %arg22 : i32 - %2 = arith.remsi %0, %arg22 : i32 - %3 = arith.muli %1, %arg12 : i32 - %4 = arith.muli %2, %arg13 : i32 - %5 = arith.addi %3, %4 : i32 - %6 = tt.addptr %arg0, %5 : !tt.ptr, i32 - %7 = tt.addptr %arg1, %5 : !tt.ptr, i32 - %8 = tt.addptr %arg2, %5 : !tt.ptr, i32 - %9 = tt.addptr %arg5, %5 : !tt.ptr, i32 - %10 = tt.addptr %arg6, %5 : !tt.ptr, i32 - %11 = tt.addptr %arg7, %5 : !tt.ptr, i32 - %12 = tt.addptr %arg8, %5 : !tt.ptr, i32 - %13 = arith.index_cast %arg24 : i32 to index - %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %19 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> - %20 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked2> - %21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %22 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %23 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %25 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2> - %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %27 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %28 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1> - %29 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %31 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked2> - %33 = arith.muli %0, %arg23 : i32 - %34 = tt.addptr %arg11, %33 : !tt.ptr, i32 - %35 = tt.addptr %arg10, %33 : !tt.ptr, i32 - %36 = arith.muli %arg24, %c128_i32 : i32 - %37 = arith.index_cast %36 : i32 to index - %38 = tt.splat %35 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %39 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma0> - %40 = tt.splat %34 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %41 = arith.muli %arg14, %c128_i32 : i32 - %42 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked1> - %43 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked2> - %44 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %45 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - scf.for %arg25 = %c0 to %13 step %c1 { - %46 = arith.index_cast %arg25 : index to i32 - %47 = arith.muli %46, %c128_i32 : i32 - %48 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %49 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %50 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %51 = arith.addi %48, %15 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %52 = arith.addi %49, %16 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %53 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %54 = tt.expand_dims %52 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<128x1xi32, #blocked2> - %55 = arith.muli %53, %28 : tensor<128x1xi32, #blocked1> - %56 = tt.broadcast %55 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %57 = arith.addi %56, %24 : tensor<128x64xi32, #blocked1> - %58 = tt.addptr %29, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %59 = tt.load %58 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %60 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %61 = arith.muli %53, %19 : tensor<128x1xi32, #blocked1> - %62 = tt.broadcast %61 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %63 = arith.addi %62, %24 : tensor<128x64xi32, #blocked1> - %64 = tt.addptr %30, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %65 = tt.load %64 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %66 = triton_gpu.convert_layout %65 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %67 = arith.index_cast %47 : i32 to index - %68 = tt.trans %60 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %69 = arith.addi %50, %17 : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>> - %70 = tt.expand_dims %69 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #mma0}>>) -> tensor<1x128xi32, #mma0> - %71 = tt.broadcast %70 : (tensor<1x128xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %72 = tt.trans %66 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %73 = arith.muli %54, %20 : tensor<128x1xi32, #blocked2> - %74 = tt.broadcast %73 : (tensor<128x1xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %75 = arith.addi %74, %26 : tensor<128x64xi32, #blocked2> - %76 = tt.addptr %32, %75 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %77 = tt.addptr %27, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %78 = tt.addptr %31, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %79:5 = scf.for %arg26 = %67 to %37 step %c128 iter_args(%arg27 = %cst, %arg28 = %cst, %arg29 = %76, %arg30 = %77, %arg31 = %78) -> (tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) { - %86 = arith.index_cast %arg26 : index to i32 - %87 = tt.splat %86 : (i32) -> tensor<128xi32, #blocked0> - %88 = tt.splat %86 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %89 = arith.addi %87, %14 : tensor<128xi32, #blocked0> - %90 = tt.load %arg30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %91 = triton_gpu.convert_layout %68 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %92 = triton_gpu.convert_layout %90 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared2> - %93 = triton_gpu.convert_layout %92 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %94 = tt.dot %93, %91, %cst_1 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %95 = arith.addi %88, %18 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %96 = tt.expand_dims %95 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xi32, #mma0> - %97 = tt.broadcast %96 : (tensor<128x1xi32, #mma0>) -> tensor<128x128xi32, #mma0> - %98 = "triton_gpu.cmpi"(%97, %71) {predicate = 5 : i64} : (tensor<128x128xi32, #mma0>, tensor<128x128xi32, #mma0>) -> tensor<128x128xi1, #mma0> - %99 = "triton_gpu.select"(%98, %94, %cst_0) : (tensor<128x128xi1, #mma0>, tensor<128x128xf32, #mma0>, tensor<128x128xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %100 = tt.addptr %38, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %101 = tt.load %100 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %102 = triton_gpu.convert_layout %101 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %103 = arith.mulf %99, %39 : tensor<128x128xf32, #mma0> - %104 = tt.expand_dims %102 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %105 = tt.broadcast %104 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %106 = arith.subf %103, %105 : tensor<128x128xf32, #mma0> - %107 = math.exp %106 : tensor<128x128xf32, #mma0> - %108 = tt.load %arg31 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %109 = arith.truncf %107 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %110 = triton_gpu.convert_layout %109 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %111 = tt.trans %110 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %112 = triton_gpu.convert_layout %111 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %113 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared2> - %114 = triton_gpu.convert_layout %113 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %115 = tt.dot %112, %114, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %116 = tt.addptr %40, %89 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %117 = tt.load %116 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %118 = triton_gpu.convert_layout %117 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>> - %119 = tt.expand_dims %118 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma0}>>) -> tensor<128x1xf32, #mma0> - %120 = tt.broadcast %119 : (tensor<128x1xf32, #mma0>) -> tensor<128x128xf32, #mma0> - %121 = arith.subf %cst_1, %120 : tensor<128x128xf32, #mma0> - %122 = triton_gpu.convert_layout %72 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %123 = triton_gpu.convert_layout %108 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared2> - %124 = triton_gpu.convert_layout %123 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %125 = tt.dot %124, %122, %121 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x128xf32, #mma0> - %126 = arith.mulf %107, %125 : tensor<128x128xf32, #mma0> - %127 = arith.mulf %126, %39 : tensor<128x128xf32, #mma0> - %128 = arith.truncf %127 : tensor<128x128xf32, #mma0> to tensor<128x128xf16, #mma0> - %129 = triton_gpu.convert_layout %128 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #shared0> - %130 = tt.trans %129 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #shared1> - %131 = triton_gpu.convert_layout %90 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared2> - %132 = triton_gpu.convert_layout %131 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %133 = triton_gpu.convert_layout %130 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %134 = tt.dot %133, %132, %arg28 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %135 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf32, #blocked2> - %136 = triton_gpu.convert_layout %135 : (tensor<128x64xf32, #blocked2>) -> tensor<128x64xf32, #mma1> - %137 = triton_gpu.convert_layout %128 : (tensor<128x128xf16, #mma0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %138 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared2> - %139 = triton_gpu.convert_layout %138 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %140 = tt.dot %137, %139, %136 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x64xf32, #mma1> - %141 = triton_gpu.convert_layout %140 : (tensor<128x64xf32, #mma1>) -> tensor<128x64xf32, #blocked2> - tt.store %arg29, %141 : tensor<128x64xf32, #blocked2> - %142 = tt.addptr %arg29, %43 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %143 = tt.addptr %arg30, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %144 = tt.addptr %arg31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - scf.yield %115, %134, %142, %143, %144 : tensor<128x64xf32, #mma1>, tensor<128x64xf32, #mma1>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1> - } - %80 = arith.truncf %79#0 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %81 = tt.addptr %44, %63 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %82 = triton_gpu.convert_layout %80 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %81, %82 : tensor<128x64xf16, #blocked1> - %83 = arith.truncf %79#1 : tensor<128x64xf32, #mma1> to tensor<128x64xf16, #mma1> - %84 = tt.addptr %45, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %85 = triton_gpu.convert_layout %83 : (tensor<128x64xf16, #mma1>) -> tensor<128x64xf16, #blocked1> - tt.store %84, %85 : tensor<128x64xf16, #blocked1> - } - return - } -} \ No newline at end of file