From b246d85fad615bcfcefb42ad3a0c242d362c3fb3 Mon Sep 17 00:00:00 2001
From: Phil Tillet
Date: Fri, 30 Dec 2022 15:21:00 -0800
Subject: [PATCH] trying to figure out spilling root cause

---
 python/bwd.ptx                         | 2032 ++++++++++++++++++++++++
 python/bwd.ttgir                       |  242 +--
 python/triton/compiler.py              |    2 +-
 python/tutorials/06-fused-attention.py |  126 +-
 4 files changed, 2180 insertions(+), 222 deletions(-)
 create mode 100644 python/bwd.ptx

diff --git a/python/bwd.ptx b/python/bwd.ptx
new file mode 100644
index 000000000..9b0bd53f7
--- /dev/null
+++ b/python/bwd.ptx
@@ -0,0 +1,2032 @@
+//
+// Generated by LLVM NVPTX Back-End
+//
+
+.version 7.4
+.target sm_86
+.address_size 64
+
+    // .globl  _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27
+.extern .shared .align 1 .b8 global_smem[];
+
+.visible .entry _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2,
+    .param .f32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_4,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_9,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10,
+    .param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_15,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_16,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_18,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_19,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_20,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_21,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22,
+    .param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23,
+    .param .u32 
_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24 +) +.maxntid 256, 1, 1 +{ + .reg .pred %p<27>; + .reg .b16 %h<257>; + .reg .b32 %r<3848>; + .reg .b32 %hh<321>; + .reg .f32 %f<547>; + .reg .b64 %rd<70>; + + mov.u32 %r1, %tid.x; + ld.param.u64 %rd28, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1]; + and.b32 %r447, %r1, 31; + ld.param.u64 %rd29, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2]; + shr.u32 %r2, %r1, 5; + bfe.u32 %r448, %r1, 3, 2; + shr.u32 %r449, %r1, 3; + and.b32 %r450, %r449, 124; + or.b32 %r3, %r450, %r448; + ld.param.u64 %rd31, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7]; + ld.param.u64 %rd32, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8]; + and.b32 %r4, %r1, 7; + shl.b32 %r5, %r4, 3; + ld.param.u32 %r451, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12]; + shl.b32 %r6, %r2, 4; + ld.param.u32 %r452, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13]; + ld.param.u32 %r453, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14]; + bfe.u32 %r8, %r1, 2, 3; + shl.b32 %r454, %r1, 1; + ld.param.u32 %r455, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17]; + and.b32 %r10, %r454, 6; + mov.u32 %r456, %ctaid.x; + ld.param.u32 %r458, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22]; + div.s32 %r460, %r456, %r458; + mul.lo.s32 %r461, %r460, %r458; + sub.s32 %r462, %r456, %r461; + ld.param.u32 %r463, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24]; + mul.lo.s32 %r464, %r460, %r451; + mad.lo.s32 %r465, %r462, %r452, %r464; + mul.wide.s32 %rd33, %r465, 2; + add.s64 %rd35, %rd28, %rd33; + add.s64 %rd36, %rd29, %rd33; + add.s64 %rd1, %rd31, %rd33; + add.s64 %rd2, %rd32, %rd33; + shl.b32 %r466, %r455, 5; + mad.lo.s32 %r467, %r3, %r455, %r5; + add.s32 %r468, %r467, %r466; + add.s32 %r469, %r468, %r466; + add.s32 %r470, %r469, %r466; + cvt.s64.s32 %rd3, %r467; + mul.wide.s32 %rd38, %r467, 2; + add.s64 %rd19, %rd35, %rd38; + cvt.s64.s32 %rd4, %r468; + mul.wide.s32 %rd39, %r468, 2; + add.s64 %rd20, %rd35, %rd39; + cvt.s64.s32 %rd5, %r469; + mul.wide.s32 %rd40, %r469, 2; + add.s64 %rd21, %rd35, %rd40; + cvt.s64.s32 %rd6, %r470; + mul.wide.s32 %rd41, %r470, 2; + add.s64 %rd22, %rd35, %rd41; + mov.pred %p19, -1; + @%p19 ld.global.v4.b32 { %r95, %r96, %r97, %r98 }, [ %rd19 + 0 ]; + @%p19 ld.global.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd20 + 0 ]; + @%p19 ld.global.v4.b32 { %r103, %r104, %r105, %r106 }, [ %rd21 + 0 ]; + @%p19 ld.global.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd22 + 0 ]; + shl.b32 %r471, %r453, 5; + mad.lo.s32 %r472, %r3, %r453, %r5; + add.s32 %r473, %r472, %r471; + add.s32 %r474, %r473, %r471; + add.s32 %r475, %r474, %r471; + cvt.s64.s32 %rd7, %r472; + mul.wide.s32 %rd42, %r472, 2; + add.s64 %rd23, %rd36, %rd42; + cvt.s64.s32 %rd8, %r473; + mul.wide.s32 %rd43, %r473, 2; + add.s64 %rd24, %rd36, %rd43; + cvt.s64.s32 %rd9, %r474; + mul.wide.s32 %rd44, %r474, 2; + add.s64 %rd25, %rd36, %rd44; + cvt.s64.s32 %rd10, %r475; + mul.wide.s32 %rd45, %r475, 2; + add.s64 %rd26, %rd36, %rd45; + @%p19 ld.global.v4.b32 { %r111, %r112, %r113, %r114 }, [ %rd23 + 0 ]; + @%p19 ld.global.v4.b32 { %r115, 
%r116, %r117, %r118 }, [ %rd24 + 0 ]; + @%p19 ld.global.v4.b32 { %r119, %r120, %r121, %r122 }, [ %rd25 + 0 ]; + @%p19 ld.global.v4.b32 { %r123, %r124, %r125, %r126 }, [ %rd26 + 0 ]; + shl.b32 %r11, %r463, 7; + mov.b32 {%h1, %h2}, %r95; + shl.b32 %r12, %r3, 6; + or.b32 %r476, %r12, %r5; + shl.b32 %r477, %r476, 1; + mov.u32 %r478, global_smem; + add.s32 %r479, %r478, %r477; + st.shared.b16 [%r479], %h1; + st.shared.b16 [%r479+2], %h2; + mov.b32 {%h3, %h4}, %r96; + st.shared.b16 [%r479+4], %h3; + st.shared.b16 [%r479+6], %h4; + mov.b32 {%h5, %h6}, %r97; + st.shared.b16 [%r479+8], %h5; + st.shared.b16 [%r479+10], %h6; + mov.b32 {%h7, %h8}, %r98; + st.shared.b16 [%r479+12], %h7; + st.shared.b16 [%r479+14], %h8; + mov.b32 {%h9, %h10}, %r99; + add.s32 %r13, %r12, 2048; + or.b32 %r480, %r13, %r5; + shl.b32 %r481, %r480, 1; + add.s32 %r482, %r478, %r481; + st.shared.b16 [%r482], %h9; + st.shared.b16 [%r482+2], %h10; + mov.b32 {%h11, %h12}, %r100; + st.shared.b16 [%r482+4], %h11; + st.shared.b16 [%r482+6], %h12; + mov.b32 {%h13, %h14}, %r101; + st.shared.b16 [%r482+8], %h13; + st.shared.b16 [%r482+10], %h14; + mov.b32 {%h15, %h16}, %r102; + st.shared.b16 [%r482+12], %h15; + st.shared.b16 [%r482+14], %h16; + mov.b32 {%h17, %h18}, %r103; + add.s32 %r14, %r12, 4096; + or.b32 %r483, %r14, %r5; + shl.b32 %r484, %r483, 1; + add.s32 %r485, %r478, %r484; + st.shared.b16 [%r485], %h17; + st.shared.b16 [%r485+2], %h18; + mov.b32 {%h19, %h20}, %r104; + st.shared.b16 [%r485+4], %h19; + st.shared.b16 [%r485+6], %h20; + mov.b32 {%h21, %h22}, %r105; + st.shared.b16 [%r485+8], %h21; + st.shared.b16 [%r485+10], %h22; + mov.b32 {%h23, %h24}, %r106; + st.shared.b16 [%r485+12], %h23; + st.shared.b16 [%r485+14], %h24; + mov.b32 {%h25, %h26}, %r107; + add.s32 %r15, %r12, 6144; + or.b32 %r486, %r15, %r5; + shl.b32 %r487, %r486, 1; + add.s32 %r488, %r478, %r487; + st.shared.b16 [%r488], %h25; + st.shared.b16 [%r488+2], %h26; + mov.b32 {%h27, %h28}, %r108; + st.shared.b16 [%r488+4], %h27; + st.shared.b16 [%r488+6], %h28; + mov.b32 {%h29, %h30}, %r109; + st.shared.b16 [%r488+8], %h29; + st.shared.b16 [%r488+10], %h30; + mov.b32 {%h31, %h32}, %r110; + st.shared.b16 [%r488+12], %h31; + st.shared.b16 [%r488+14], %h32; + bar.sync 0; + mov.b32 {%h33, %h34}, %r111; + add.s32 %r489, %r478, 16384; + add.s32 %r490, %r489, %r477; + st.shared.b16 [%r490], %h33; + st.shared.b16 [%r490+2], %h34; + mov.b32 {%h35, %h36}, %r112; + st.shared.b16 [%r490+4], %h35; + st.shared.b16 [%r490+6], %h36; + mov.b32 {%h37, %h38}, %r113; + st.shared.b16 [%r490+8], %h37; + st.shared.b16 [%r490+10], %h38; + mov.b32 {%h39, %h40}, %r114; + st.shared.b16 [%r490+12], %h39; + st.shared.b16 [%r490+14], %h40; + mov.b32 {%h41, %h42}, %r115; + add.s32 %r491, %r489, %r481; + st.shared.b16 [%r491], %h41; + st.shared.b16 [%r491+2], %h42; + mov.b32 {%h43, %h44}, %r116; + st.shared.b16 [%r491+4], %h43; + st.shared.b16 [%r491+6], %h44; + mov.b32 {%h45, %h46}, %r117; + st.shared.b16 [%r491+8], %h45; + st.shared.b16 [%r491+10], %h46; + mov.b32 {%h47, %h48}, %r118; + st.shared.b16 [%r491+12], %h47; + st.shared.b16 [%r491+14], %h48; + mov.b32 {%h49, %h50}, %r119; + add.s32 %r492, %r489, %r484; + st.shared.b16 [%r492], %h49; + st.shared.b16 [%r492+2], %h50; + mov.b32 {%h51, %h52}, %r120; + st.shared.b16 [%r492+4], %h51; + st.shared.b16 [%r492+6], %h52; + mov.b32 {%h53, %h54}, %r121; + st.shared.b16 [%r492+8], %h53; + st.shared.b16 [%r492+10], %h54; + mov.b32 {%h55, %h56}, %r122; + st.shared.b16 [%r492+12], %h55; + st.shared.b16 [%r492+14], %h56; + mov.b32 {%h57, %h58}, %r123; 
+ add.s32 %r493, %r489, %r487; + st.shared.b16 [%r493], %h57; + st.shared.b16 [%r493+2], %h58; + mov.b32 {%h59, %h60}, %r124; + st.shared.b16 [%r493+4], %h59; + st.shared.b16 [%r493+6], %h60; + mov.b32 {%h61, %h62}, %r125; + st.shared.b16 [%r493+8], %h61; + st.shared.b16 [%r493+10], %h62; + mov.b32 {%h63, %h64}, %r126; + st.shared.b16 [%r493+12], %h63; + st.shared.b16 [%r493+14], %h64; + bar.sync 0; + bfe.u32 %r16, %r447, 3, 1; + bfe.u32 %r17, %r1, 4, 1; + shl.b32 %r18, %r17, 3; + or.b32 %r494, %r18, %r4; + shl.b32 %r495, %r494, 6; + shl.b32 %r19, %r16, 3; + or.b32 %r496, %r495, %r19; + shl.b32 %r497, %r496, 1; + add.s32 %r131, %r478, %r497; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3591, %r3592, %r3593, %r3594 }, [ %r131 + 0 ]; + and.b32 %r498, %r1, 8; + or.b32 %r499, %r495, %r498; + shl.b32 %r500, %r499, 1; + add.s32 %r501, %r478, %r500; + add.s32 %r136, %r501, 32; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3623, %r3624, %r3625, %r3626 }, [ %r136 + 0 ]; + add.s32 %r141, %r131, 64; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3655, %r3656, %r3657, %r3658 }, [ %r141 + 0 ]; + add.s32 %r146, %r501, 96; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3687, %r3688, %r3689, %r3690 }, [ %r146 + 0 ]; + add.s32 %r151, %r131, 2048; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3595, %r3596, %r3597, %r3598 }, [ %r151 + 0 ]; + add.s32 %r156, %r501, 2080; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3627, %r3628, %r3629, %r3630 }, [ %r156 + 0 ]; + add.s32 %r161, %r131, 2112; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3659, %r3660, %r3661, %r3662 }, [ %r161 + 0 ]; + add.s32 %r166, %r501, 2144; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3691, %r3692, %r3693, %r3694 }, [ %r166 + 0 ]; + add.s32 %r171, %r131, 4096; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3599, %r3600, %r3601, %r3602 }, [ %r171 + 0 ]; + add.s32 %r176, %r501, 4128; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3631, %r3632, %r3633, %r3634 }, [ %r176 + 0 ]; + add.s32 %r181, %r131, 4160; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3663, %r3664, %r3665, %r3666 }, [ %r181 + 0 ]; + add.s32 %r186, %r501, 4192; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3695, %r3696, %r3697, %r3698 }, [ %r186 + 0 ]; + add.s32 %r191, %r131, 6144; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3603, %r3604, %r3605, %r3606 }, [ %r191 + 0 ]; + add.s32 %r196, %r501, 6176; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3635, %r3636, %r3637, %r3638 }, [ %r196 + 0 ]; + add.s32 %r201, %r131, 6208; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3667, %r3668, %r3669, %r3670 }, [ %r201 + 0 ]; + add.s32 %r206, %r501, 6240; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3699, %r3700, %r3701, %r3702 }, [ %r206 + 0 ]; + add.s32 %r211, %r131, 8192; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3607, %r3608, %r3609, %r3610 }, [ %r211 + 0 ]; + add.s32 %r216, %r501, 8224; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3639, %r3640, %r3641, %r3642 }, [ %r216 + 0 ]; + add.s32 %r221, %r131, 8256; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3671, %r3672, %r3673, %r3674 }, [ %r221 + 0 ]; + add.s32 %r226, %r501, 8288; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3703, %r3704, %r3705, %r3706 }, [ %r226 + 0 ]; + add.s32 %r231, %r131, 10240; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3611, %r3612, %r3613, %r3614 }, [ %r231 + 0 ]; + add.s32 %r236, %r501, 10272; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3643, %r3644, %r3645, %r3646 }, [ %r236 + 0 ]; + add.s32 %r241, %r131, 10304; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3675, %r3676, 
%r3677, %r3678 }, [ %r241 + 0 ]; + add.s32 %r246, %r501, 10336; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3707, %r3708, %r3709, %r3710 }, [ %r246 + 0 ]; + add.s32 %r251, %r131, 12288; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3615, %r3616, %r3617, %r3618 }, [ %r251 + 0 ]; + add.s32 %r256, %r501, 12320; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3647, %r3648, %r3649, %r3650 }, [ %r256 + 0 ]; + add.s32 %r261, %r131, 12352; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3679, %r3680, %r3681, %r3682 }, [ %r261 + 0 ]; + add.s32 %r266, %r501, 12384; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3711, %r3712, %r3713, %r3714 }, [ %r266 + 0 ]; + add.s32 %r271, %r131, 14336; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3619, %r3620, %r3621, %r3622 }, [ %r271 + 0 ]; + add.s32 %r276, %r501, 14368; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3651, %r3652, %r3653, %r3654 }, [ %r276 + 0 ]; + add.s32 %r281, %r131, 14400; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3683, %r3684, %r3685, %r3686 }, [ %r281 + 0 ]; + add.s32 %r286, %r501, 14432; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3715, %r3716, %r3717, %r3718 }, [ %r286 + 0 ]; + add.s32 %r291, %r489, %r497; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3719, %r3720, %r3721, %r3722 }, [ %r291 + 0 ]; + add.s32 %r502, %r489, %r500; + add.s32 %r296, %r502, 32; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3751, %r3752, %r3753, %r3754 }, [ %r296 + 0 ]; + add.s32 %r301, %r291, 64; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3783, %r3784, %r3785, %r3786 }, [ %r301 + 0 ]; + add.s32 %r306, %r502, 96; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3815, %r3816, %r3817, %r3818 }, [ %r306 + 0 ]; + add.s32 %r311, %r291, 2048; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3723, %r3724, %r3725, %r3726 }, [ %r311 + 0 ]; + add.s32 %r316, %r502, 2080; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3755, %r3756, %r3757, %r3758 }, [ %r316 + 0 ]; + add.s32 %r321, %r291, 2112; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3787, %r3788, %r3789, %r3790 }, [ %r321 + 0 ]; + add.s32 %r326, %r502, 2144; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3819, %r3820, %r3821, %r3822 }, [ %r326 + 0 ]; + add.s32 %r331, %r291, 4096; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3727, %r3728, %r3729, %r3730 }, [ %r331 + 0 ]; + add.s32 %r336, %r502, 4128; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3759, %r3760, %r3761, %r3762 }, [ %r336 + 0 ]; + add.s32 %r341, %r291, 4160; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3791, %r3792, %r3793, %r3794 }, [ %r341 + 0 ]; + add.s32 %r346, %r502, 4192; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3823, %r3824, %r3825, %r3826 }, [ %r346 + 0 ]; + add.s32 %r351, %r291, 6144; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3731, %r3732, %r3733, %r3734 }, [ %r351 + 0 ]; + add.s32 %r356, %r502, 6176; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3763, %r3764, %r3765, %r3766 }, [ %r356 + 0 ]; + add.s32 %r361, %r291, 6208; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3795, %r3796, %r3797, %r3798 }, [ %r361 + 0 ]; + add.s32 %r366, %r502, 6240; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3827, %r3828, %r3829, %r3830 }, [ %r366 + 0 ]; + add.s32 %r371, %r291, 8192; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3735, %r3736, %r3737, %r3738 }, [ %r371 + 0 ]; + add.s32 %r376, %r502, 8224; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3767, %r3768, %r3769, %r3770 }, [ %r376 + 0 ]; + add.s32 %r381, %r291, 8256; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3799, %r3800, %r3801, %r3802 }, [ %r381 + 0 ]; + add.s32 
%r386, %r502, 8288; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3831, %r3832, %r3833, %r3834 }, [ %r386 + 0 ]; + add.s32 %r391, %r291, 10240; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3739, %r3740, %r3741, %r3742 }, [ %r391 + 0 ]; + add.s32 %r396, %r502, 10272; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3771, %r3772, %r3773, %r3774 }, [ %r396 + 0 ]; + add.s32 %r401, %r291, 10304; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3803, %r3804, %r3805, %r3806 }, [ %r401 + 0 ]; + add.s32 %r406, %r502, 10336; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3835, %r3836, %r3837, %r3838 }, [ %r406 + 0 ]; + add.s32 %r411, %r291, 12288; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3743, %r3744, %r3745, %r3746 }, [ %r411 + 0 ]; + add.s32 %r416, %r502, 12320; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3775, %r3776, %r3777, %r3778 }, [ %r416 + 0 ]; + add.s32 %r421, %r291, 12352; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3807, %r3808, %r3809, %r3810 }, [ %r421 + 0 ]; + add.s32 %r426, %r502, 12384; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3839, %r3840, %r3841, %r3842 }, [ %r426 + 0 ]; + add.s32 %r431, %r291, 14336; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3747, %r3748, %r3749, %r3750 }, [ %r431 + 0 ]; + add.s32 %r436, %r502, 14368; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3779, %r3780, %r3781, %r3782 }, [ %r436 + 0 ]; + add.s32 %r441, %r291, 14400; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3811, %r3812, %r3813, %r3814 }, [ %r441 + 0 ]; + add.s32 %r446, %r502, 14432; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3843, %r3844, %r3845, %r3846 }, [ %r446 + 0 ]; + setp.lt.s32 %p9, %r11, 1; + mov.f32 %f129, 0f00000000; + mov.f32 %f515, %f129; + mov.f32 %f516, %f129; + mov.f32 %f517, %f129; + mov.f32 %f518, %f129; + mov.f32 %f519, %f129; + mov.f32 %f520, %f129; + mov.f32 %f521, %f129; + mov.f32 %f522, %f129; + mov.f32 %f523, %f129; + mov.f32 %f524, %f129; + mov.f32 %f525, %f129; + mov.f32 %f526, %f129; + mov.f32 %f527, %f129; + mov.f32 %f528, %f129; + mov.f32 %f529, %f129; + mov.f32 %f530, %f129; + mov.f32 %f531, %f129; + mov.f32 %f532, %f129; + mov.f32 %f533, %f129; + mov.f32 %f534, %f129; + mov.f32 %f535, %f129; + mov.f32 %f536, %f129; + mov.f32 %f537, %f129; + mov.f32 %f538, %f129; + mov.f32 %f539, %f129; + mov.f32 %f540, %f129; + mov.f32 %f541, %f129; + mov.f32 %f542, %f129; + mov.f32 %f543, %f129; + mov.f32 %f544, %f129; + mov.f32 %f545, %f129; + mov.f32 %f546, %f129; + @%p9 bra LBB0_3; + ld.param.f32 %f97, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3]; + ld.param.u64 %rd27, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0]; + ld.param.u64 %rd30, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5]; + and.b32 %r7, %r6, 112; + or.b32 %r9, %r7, %r8; + add.s64 %rd34, %rd27, %rd33; + add.s64 %rd37, %rd30, %rd33; + add.s64 %rd46, %rd34, %rd42; + add.s64 %rd47, %rd34, %rd43; + add.s64 %rd48, %rd34, %rd44; + add.s64 %rd49, %rd34, %rd45; + add.s64 %rd50, %rd37, %rd42; + add.s64 %rd51, %rd37, %rd43; + add.s64 %rd52, %rd37, %rd44; + add.s64 %rd53, %rd37, %rd45; + mov.b32 %hh1, %r3591; + mov.b32 %hh2, %r3592; + mov.b32 %hh3, %r3593; + mov.b32 %hh4, %r3594; + mov.b32 %hh5, %r3623; + mov.b32 %hh6, %r3624; + mov.b32 %hh7, %r3625; + mov.b32 %hh8, %r3626; + mov.b32 %hh9, %r3655; + mov.b32 %hh10, %r3656; + mov.b32 %hh11, %r3657; + mov.b32 %hh12, %r3658; + mov.b32 %hh13, %r3687; + mov.b32 %hh14, %r3688; + mov.b32 %hh15, %r3689; + mov.b32 %hh16, 
%r3690; + mov.b32 %hh17, %r3595; + mov.b32 %hh18, %r3596; + mov.b32 %hh19, %r3597; + mov.b32 %hh20, %r3598; + mov.b32 %hh21, %r3627; + mov.b32 %hh22, %r3628; + mov.b32 %hh23, %r3629; + mov.b32 %hh24, %r3630; + mov.b32 %hh25, %r3659; + mov.b32 %hh26, %r3660; + mov.b32 %hh27, %r3661; + mov.b32 %hh28, %r3662; + mov.b32 %hh29, %r3691; + mov.b32 %hh30, %r3692; + mov.b32 %hh31, %r3693; + mov.b32 %hh32, %r3694; + mov.b32 %hh33, %r3599; + mov.b32 %hh34, %r3600; + mov.b32 %hh35, %r3601; + mov.b32 %hh36, %r3602; + mov.b32 %hh37, %r3631; + mov.b32 %hh38, %r3632; + mov.b32 %hh39, %r3633; + mov.b32 %hh40, %r3634; + mov.b32 %hh41, %r3663; + mov.b32 %hh42, %r3664; + mov.b32 %hh43, %r3665; + mov.b32 %hh44, %r3666; + mov.b32 %hh45, %r3695; + mov.b32 %hh46, %r3696; + mov.b32 %hh47, %r3697; + mov.b32 %hh48, %r3698; + mov.b32 %hh49, %r3603; + mov.b32 %hh50, %r3604; + mov.b32 %hh51, %r3605; + mov.b32 %hh52, %r3606; + mov.b32 %hh53, %r3635; + mov.b32 %hh54, %r3636; + mov.b32 %hh55, %r3637; + mov.b32 %hh56, %r3638; + mov.b32 %hh57, %r3667; + mov.b32 %hh58, %r3668; + mov.b32 %hh59, %r3669; + mov.b32 %hh60, %r3670; + mov.b32 %hh61, %r3699; + mov.b32 %hh62, %r3700; + mov.b32 %hh63, %r3701; + mov.b32 %hh64, %r3702; + mov.b32 %hh65, %r3607; + mov.b32 %hh66, %r3608; + mov.b32 %hh67, %r3609; + mov.b32 %hh68, %r3610; + mov.b32 %hh69, %r3639; + mov.b32 %hh70, %r3640; + mov.b32 %hh71, %r3641; + mov.b32 %hh72, %r3642; + mov.b32 %hh73, %r3671; + mov.b32 %hh74, %r3672; + mov.b32 %hh75, %r3673; + mov.b32 %hh76, %r3674; + mov.b32 %hh77, %r3703; + mov.b32 %hh78, %r3704; + mov.b32 %hh79, %r3705; + mov.b32 %hh80, %r3706; + mov.b32 %hh81, %r3611; + mov.b32 %hh82, %r3612; + mov.b32 %hh83, %r3613; + mov.b32 %hh84, %r3614; + mov.b32 %hh85, %r3643; + mov.b32 %hh86, %r3644; + mov.b32 %hh87, %r3645; + mov.b32 %hh88, %r3646; + mov.b32 %hh89, %r3675; + mov.b32 %hh90, %r3676; + mov.b32 %hh91, %r3677; + mov.b32 %hh92, %r3678; + mov.b32 %hh93, %r3707; + mov.b32 %hh94, %r3708; + mov.b32 %hh95, %r3709; + mov.b32 %hh96, %r3710; + mov.b32 %hh97, %r3615; + mov.b32 %hh98, %r3616; + mov.b32 %hh99, %r3617; + mov.b32 %hh100, %r3618; + mov.b32 %hh101, %r3647; + mov.b32 %hh102, %r3648; + mov.b32 %hh103, %r3649; + mov.b32 %hh104, %r3650; + mov.b32 %hh105, %r3679; + mov.b32 %hh106, %r3680; + mov.b32 %hh107, %r3681; + mov.b32 %hh108, %r3682; + mov.b32 %hh109, %r3711; + mov.b32 %hh110, %r3712; + mov.b32 %hh111, %r3713; + mov.b32 %hh112, %r3714; + mov.b32 %hh113, %r3619; + mov.b32 %hh114, %r3620; + mov.b32 %hh115, %r3621; + mov.b32 %hh116, %r3622; + mov.b32 %hh117, %r3651; + mov.b32 %hh118, %r3652; + mov.b32 %hh119, %r3653; + mov.b32 %hh120, %r3654; + mov.b32 %hh121, %r3683; + mov.b32 %hh122, %r3684; + mov.b32 %hh123, %r3685; + mov.b32 %hh124, %r3686; + mov.b32 %hh125, %r3715; + mov.b32 %hh126, %r3716; + mov.b32 %hh127, %r3717; + mov.b32 %hh128, %r3718; + mov.b32 %hh129, %r3719; + mov.b32 %hh130, %r3720; + mov.b32 %hh131, %r3721; + mov.b32 %hh132, %r3722; + mov.b32 %hh133, %r3751; + mov.b32 %hh134, %r3752; + mov.b32 %hh135, %r3753; + mov.b32 %hh136, %r3754; + mov.b32 %hh137, %r3783; + mov.b32 %hh138, %r3784; + mov.b32 %hh139, %r3785; + mov.b32 %hh140, %r3786; + mov.b32 %hh141, %r3815; + mov.b32 %hh142, %r3816; + mov.b32 %hh143, %r3817; + mov.b32 %hh144, %r3818; + mov.b32 %hh145, %r3723; + mov.b32 %hh146, %r3724; + mov.b32 %hh147, %r3725; + mov.b32 %hh148, %r3726; + mov.b32 %hh149, %r3755; + mov.b32 %hh150, %r3756; + mov.b32 %hh151, %r3757; + mov.b32 %hh152, %r3758; + mov.b32 %hh153, %r3787; + mov.b32 %hh154, %r3788; + mov.b32 %hh155, %r3789; + mov.b32 
%hh156, %r3790; + mov.b32 %hh157, %r3819; + mov.b32 %hh158, %r3820; + mov.b32 %hh159, %r3821; + mov.b32 %hh160, %r3822; + mov.b32 %hh161, %r3727; + mov.b32 %hh162, %r3728; + mov.b32 %hh163, %r3729; + mov.b32 %hh164, %r3730; + mov.b32 %hh165, %r3759; + mov.b32 %hh166, %r3760; + mov.b32 %hh167, %r3761; + mov.b32 %hh168, %r3762; + mov.b32 %hh169, %r3791; + mov.b32 %hh170, %r3792; + mov.b32 %hh171, %r3793; + mov.b32 %hh172, %r3794; + mov.b32 %hh173, %r3823; + mov.b32 %hh174, %r3824; + mov.b32 %hh175, %r3825; + mov.b32 %hh176, %r3826; + mov.b32 %hh177, %r3731; + mov.b32 %hh178, %r3732; + mov.b32 %hh179, %r3733; + mov.b32 %hh180, %r3734; + mov.b32 %hh181, %r3763; + mov.b32 %hh182, %r3764; + mov.b32 %hh183, %r3765; + mov.b32 %hh184, %r3766; + mov.b32 %hh185, %r3795; + mov.b32 %hh186, %r3796; + mov.b32 %hh187, %r3797; + mov.b32 %hh188, %r3798; + mov.b32 %hh189, %r3827; + mov.b32 %hh190, %r3828; + mov.b32 %hh191, %r3829; + mov.b32 %hh192, %r3830; + mov.b32 %hh193, %r3735; + mov.b32 %hh194, %r3736; + mov.b32 %hh195, %r3737; + mov.b32 %hh196, %r3738; + mov.b32 %hh197, %r3767; + mov.b32 %hh198, %r3768; + mov.b32 %hh199, %r3769; + mov.b32 %hh200, %r3770; + mov.b32 %hh201, %r3799; + mov.b32 %hh202, %r3800; + mov.b32 %hh203, %r3801; + mov.b32 %hh204, %r3802; + mov.b32 %hh205, %r3831; + mov.b32 %hh206, %r3832; + mov.b32 %hh207, %r3833; + mov.b32 %hh208, %r3834; + mov.b32 %hh209, %r3739; + mov.b32 %hh210, %r3740; + mov.b32 %hh211, %r3741; + mov.b32 %hh212, %r3742; + mov.b32 %hh213, %r3771; + mov.b32 %hh214, %r3772; + mov.b32 %hh215, %r3773; + mov.b32 %hh216, %r3774; + mov.b32 %hh217, %r3803; + mov.b32 %hh218, %r3804; + mov.b32 %hh219, %r3805; + mov.b32 %hh220, %r3806; + mov.b32 %hh221, %r3835; + mov.b32 %hh222, %r3836; + mov.b32 %hh223, %r3837; + mov.b32 %hh224, %r3838; + mov.b32 %hh225, %r3743; + mov.b32 %hh226, %r3744; + mov.b32 %hh227, %r3745; + mov.b32 %hh228, %r3746; + mov.b32 %hh229, %r3775; + mov.b32 %hh230, %r3776; + mov.b32 %hh231, %r3777; + mov.b32 %hh232, %r3778; + mov.b32 %hh233, %r3807; + mov.b32 %hh234, %r3808; + mov.b32 %hh235, %r3809; + mov.b32 %hh236, %r3810; + mov.b32 %hh237, %r3839; + mov.b32 %hh238, %r3840; + mov.b32 %hh239, %r3841; + mov.b32 %hh240, %r3842; + mov.b32 %hh241, %r3747; + mov.b32 %hh242, %r3748; + mov.b32 %hh243, %r3749; + mov.b32 %hh244, %r3750; + mov.b32 %hh245, %r3779; + mov.b32 %hh246, %r3780; + mov.b32 %hh247, %r3781; + mov.b32 %hh248, %r3782; + mov.b32 %hh249, %r3811; + mov.b32 %hh250, %r3812; + mov.b32 %hh251, %r3813; + mov.b32 %hh252, %r3814; + mov.b32 %hh253, %r3843; + mov.b32 %hh254, %r3844; + mov.b32 %hh255, %r3845; + mov.b32 %hh256, %r3846; + and.b32 %r504, %r3, 7; + xor.b32 %r505, %r504, %r4; + shl.b32 %r506, %r505, 3; + or.b32 %r507, %r506, %r12; + shl.b32 %r508, %r507, 1; + add.s32 %r20, %r478, %r508; + or.b32 %r510, %r506, %r13; + shl.b32 %r511, %r510, 1; + add.s32 %r21, %r478, %r511; + or.b32 %r512, %r506, %r14; + shl.b32 %r513, %r512, 1; + add.s32 %r22, %r478, %r513; + or.b32 %r514, %r506, %r15; + shl.b32 %r515, %r514, 1; + add.s32 %r23, %r478, %r515; + or.b32 %r516, %r19, %r4; + or.b32 %r517, %r516, %r7; + xor.b32 %r518, %r17, %r4; + shl.b32 %r519, %r518, 4; + shl.b32 %r520, %r517, 7; + or.b32 %r521, %r520, %r519; + add.s32 %r599, %r478, %r521; + or.b32 %r522, %r17, 2; + xor.b32 %r523, %r522, %r4; + shl.b32 %r524, %r523, 4; + or.b32 %r525, %r524, %r520; + add.s32 %r604, %r478, %r525; + or.b32 %r526, %r17, 4; + xor.b32 %r527, %r526, %r4; + shl.b32 %r528, %r527, 4; + or.b32 %r529, %r528, %r520; + add.s32 %r609, %r478, %r529; + or.b32 %r530, %r17, 6; + 
xor.b32 %r531, %r530, %r4; + shl.b32 %r532, %r531, 4; + or.b32 %r533, %r532, %r520; + add.s32 %r614, %r478, %r533; + shl.b32 %r534, %r9, 1; + shl.b32 %r535, %r10, 8; + or.b32 %r536, %r535, %r534; + add.s32 %r28, %r478, %r536; + or.b32 %r537, %r536, 256; + add.s32 %r30, %r478, %r537; + add.s32 %r31, %r28, 2048; + or.b32 %r538, %r536, 2304; + add.s32 %r32, %r478, %r538; + add.s32 %r33, %r28, 4096; + or.b32 %r539, %r536, 4352; + add.s32 %r34, %r478, %r539; + add.s32 %r35, %r28, 6144; + or.b32 %r540, %r536, 6400; + add.s32 %r36, %r478, %r540; + add.s32 %r37, %r28, 8192; + or.b32 %r541, %r536, 8448; + add.s32 %r38, %r478, %r541; + add.s32 %r39, %r28, 10240; + or.b32 %r542, %r536, 10496; + add.s32 %r40, %r478, %r542; + add.s32 %r41, %r28, 12288; + or.b32 %r543, %r536, 12544; + add.s32 %r42, %r478, %r543; + add.s32 %r43, %r28, 14336; + or.b32 %r544, %r536, 14592; + add.s32 %r44, %r478, %r544; + add.s32 %r45, %r28, 16384; + or.b32 %r545, %r536, 16640; + add.s32 %r46, %r478, %r545; + add.s32 %r47, %r28, 18432; + or.b32 %r546, %r536, 18688; + add.s32 %r48, %r478, %r546; + add.s32 %r49, %r28, 20480; + or.b32 %r547, %r536, 20736; + add.s32 %r50, %r478, %r547; + add.s32 %r51, %r28, 22528; + or.b32 %r548, %r536, 22784; + add.s32 %r52, %r478, %r548; + add.s32 %r53, %r28, 24576; + or.b32 %r549, %r536, 24832; + add.s32 %r54, %r478, %r549; + add.s32 %r55, %r28, 26624; + or.b32 %r550, %r536, 26880; + add.s32 %r56, %r478, %r550; + add.s32 %r57, %r28, 28672; + or.b32 %r551, %r536, 28928; + add.s32 %r58, %r478, %r551; + add.s32 %r59, %r28, 30720; + or.b32 %r552, %r536, 30976; + add.s32 %r60, %r478, %r552; + shl.b32 %r553, %r2, 1; + and.b32 %r554, %r553, 6; + or.b32 %r555, %r554, %r16; + shl.b32 %r556, %r555, 10; + shl.b32 %r557, %r4, 7; + or.b32 %r558, %r556, %r557; + or.b32 %r559, %r558, %r18; + shl.b32 %r560, %r559, 1; + add.s32 %r1531, %r478, %r560; + shl.b32 %r561, %r522, 4; + shl.b32 %r562, %r558, 1; + or.b32 %r563, %r561, %r562; + add.s32 %r1536, %r478, %r563; + shl.b32 %r564, %r526, 4; + or.b32 %r565, %r564, %r562; + add.s32 %r1541, %r478, %r565; + shl.b32 %r566, %r530, 4; + or.b32 %r567, %r566, %r562; + add.s32 %r1546, %r478, %r567; + add.s32 %r1551, %r1531, 128; + add.s32 %r1556, %r1531, 160; + add.s32 %r1561, %r1531, 192; + add.s32 %r1566, %r1531, 224; + add.s32 %r1571, %r1531, 16384; + add.s32 %r1576, %r1536, 16384; + add.s32 %r1581, %r1541, 16384; + add.s32 %r1586, %r1546, 16384; + add.s32 %r1591, %r1531, 16512; + add.s32 %r1596, %r1531, 16544; + add.s32 %r1601, %r1531, 16576; + add.s32 %r1606, %r1531, 16608; + bfe.u32 %r568, %r1, 7, 1; + shl.b32 %r569, %r17, 1; + or.b32 %r570, %r569, %r568; + xor.b32 %r571, %r570, %r4; + shl.b32 %r572, %r571, 4; + shl.b32 %r573, %r516, 7; + or.b32 %r574, %r572, %r573; + add.s32 %r1611, %r478, %r574; + add.s32 %r1616, %r1611, 2048; + add.s32 %r1621, %r1611, 4096; + add.s32 %r1626, %r1611, 6144; + add.s32 %r1631, %r1611, 8192; + add.s32 %r1636, %r1611, 10240; + add.s32 %r1641, %r1611, 12288; + add.s32 %r1646, %r1611, 14336; + or.b32 %r575, %r570, 4; + xor.b32 %r576, %r575, %r4; + shl.b32 %r577, %r576, 4; + or.b32 %r578, %r577, %r573; + add.s32 %r1651, %r478, %r578; + add.s32 %r1656, %r1651, 2048; + add.s32 %r1661, %r1651, 4096; + add.s32 %r1666, %r1651, 6144; + add.s32 %r1671, %r1651, 8192; + add.s32 %r1676, %r1651, 10240; + add.s32 %r1681, %r1651, 12288; + add.s32 %r1686, %r1651, 14336; + mov.f32 %f515, 0f00000000; + mov.u32 %r625, 0; + mov.f32 %f516, %f515; + mov.f32 %f517, %f515; + mov.f32 %f518, %f515; + mov.f32 %f519, %f515; + mov.f32 %f520, %f515; + mov.f32 
%f521, %f515; + mov.f32 %f522, %f515; + mov.f32 %f523, %f515; + mov.f32 %f524, %f515; + mov.f32 %f525, %f515; + mov.f32 %f526, %f515; + mov.f32 %f527, %f515; + mov.f32 %f528, %f515; + mov.f32 %f529, %f515; + mov.f32 %f530, %f515; + mov.f32 %f531, %f515; + mov.f32 %f532, %f515; + mov.f32 %f533, %f515; + mov.f32 %f534, %f515; + mov.f32 %f535, %f515; + mov.f32 %f536, %f515; + mov.f32 %f537, %f515; + mov.f32 %f538, %f515; + mov.f32 %f539, %f515; + mov.f32 %f540, %f515; + mov.f32 %f541, %f515; + mov.f32 %f542, %f515; + mov.f32 %f543, %f515; + mov.f32 %f544, %f515; + mov.f32 %f545, %f515; + mov.f32 %f546, %f515; + mov.u32 %r3847, %r625; +LBB0_2: + @%p19 ld.global.v4.b32 { %r3499, %r3500, %r3501, %r3502 }, [ %rd46 + 0 ]; + mov.b32 %hh257, %r3499; + mov.b32 %hh258, %r3500; + mov.b32 %hh259, %r3501; + mov.b32 %hh260, %r3502; + @%p19 ld.global.v4.b32 { %r3503, %r3504, %r3505, %r3506 }, [ %rd47 + 0 ]; + mov.b32 %hh261, %r3503; + mov.b32 %hh262, %r3504; + mov.b32 %hh263, %r3505; + mov.b32 %hh264, %r3506; + @%p19 ld.global.v4.b32 { %r3507, %r3508, %r3509, %r3510 }, [ %rd48 + 0 ]; + mov.b32 %hh265, %r3507; + mov.b32 %hh266, %r3508; + mov.b32 %hh267, %r3509; + mov.b32 %hh268, %r3510; + @%p19 ld.global.v4.b32 { %r3511, %r3512, %r3513, %r3514 }, [ %rd49 + 0 ]; + mov.b32 %hh269, %r3511; + mov.b32 %hh270, %r3512; + mov.b32 %hh271, %r3513; + mov.b32 %hh272, %r3514; + bar.sync 0; + st.shared.v4.b32 [%r20], {%r3499, %r3500, %r3501, %r3502}; + st.shared.v4.b32 [%r21], {%r3503, %r3504, %r3505, %r3506}; + st.shared.v4.b32 [%r22], {%r3507, %r3508, %r3509, %r3510}; + st.shared.v4.b32 [%r23], {%r3511, %r3512, %r3513, %r3514}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r619, %r620, %r621, %r622 }, [ %r599 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r843, %r844, %r845, %r846 }, [ %r604 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1067, %r1068, %r1069, %r1070 }, [ %r609 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1291, %r1292, %r1293, %r1294 }, [ %r614 + 0 ]; + mov.u32 %r839, %r625; + mov.u32 %r840, %r625; + mov.u32 %r841, %r625; + mov.u32 %r842, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r619, %r620, %r621, %r622 }, { %r3591, %r3592 }, { %r839, %r840, %r841, %r842 }; + mov.u32 %r853, %r625; + mov.u32 %r854, %r625; + mov.u32 %r855, %r625; + mov.u32 %r856, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r619, %r620, %r621, %r622 }, { %r3593, %r3594 }, { %r853, %r854, %r855, %r856 }; + mov.u32 %r867, %r625; + mov.u32 %r868, %r625; + mov.u32 %r869, %r625; + mov.u32 %r870, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r619, %r620, %r621, %r622 }, { %r3595, %r3596 }, { %r867, %r868, %r869, %r870 }; + mov.u32 %r881, %r625; + mov.u32 %r882, %r625; + mov.u32 %r883, %r625; + mov.u32 %r884, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r619, %r620, %r621, %r622 }, { %r3597, %r3598 }, { %r881, %r882, %r883, %r884 }; + mov.u32 %r895, %r625; + mov.u32 %r896, %r625; + mov.u32 %r897, %r625; + mov.u32 %r898, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r619, %r620, %r621, %r622 }, { %r3599, %r3600 }, { %r895, %r896, %r897, %r898 }; + mov.u32 %r909, %r625; + mov.u32 %r910, %r625; + mov.u32 %r911, %r625; + mov.u32 %r912, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r619, %r620, %r621, %r622 }, { %r3601, 
%r3602 }, { %r909, %r910, %r911, %r912 }; + mov.u32 %r923, %r625; + mov.u32 %r924, %r625; + mov.u32 %r925, %r625; + mov.u32 %r926, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r619, %r620, %r621, %r622 }, { %r3603, %r3604 }, { %r923, %r924, %r925, %r926 }; + mov.u32 %r937, %r625; + mov.u32 %r938, %r625; + mov.u32 %r939, %r625; + mov.u32 %r940, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r619, %r620, %r621, %r622 }, { %r3605, %r3606 }, { %r937, %r938, %r939, %r940 }; + mov.u32 %r951, %r625; + mov.u32 %r952, %r625; + mov.u32 %r953, %r625; + mov.u32 %r954, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r619, %r620, %r621, %r622 }, { %r3607, %r3608 }, { %r951, %r952, %r953, %r954 }; + mov.u32 %r965, %r625; + mov.u32 %r966, %r625; + mov.u32 %r967, %r625; + mov.u32 %r968, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r619, %r620, %r621, %r622 }, { %r3609, %r3610 }, { %r965, %r966, %r967, %r968 }; + mov.u32 %r979, %r625; + mov.u32 %r980, %r625; + mov.u32 %r981, %r625; + mov.u32 %r982, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r619, %r620, %r621, %r622 }, { %r3611, %r3612 }, { %r979, %r980, %r981, %r982 }; + mov.u32 %r993, %r625; + mov.u32 %r994, %r625; + mov.u32 %r995, %r625; + mov.u32 %r996, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r619, %r620, %r621, %r622 }, { %r3613, %r3614 }, { %r993, %r994, %r995, %r996 }; + mov.u32 %r1007, %r625; + mov.u32 %r1008, %r625; + mov.u32 %r1009, %r625; + mov.u32 %r1010, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r619, %r620, %r621, %r622 }, { %r3615, %r3616 }, { %r1007, %r1008, %r1009, %r1010 }; + mov.u32 %r1021, %r625; + mov.u32 %r1022, %r625; + mov.u32 %r1023, %r625; + mov.u32 %r1024, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r619, %r620, %r621, %r622 }, { %r3617, %r3618 }, { %r1021, %r1022, %r1023, %r1024 }; + mov.u32 %r1035, %r625; + mov.u32 %r1036, %r625; + mov.u32 %r1037, %r625; + mov.u32 %r1038, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r619, %r620, %r621, %r622 }, { %r3619, %r3620 }, { %r1035, %r1036, %r1037, %r1038 }; + mov.u32 %r1049, %r625; + mov.u32 %r1050, %r625; + mov.u32 %r1051, %r625; + mov.u32 %r1052, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r619, %r620, %r621, %r622 }, { %r3621, %r3622 }, { %r1049, %r1050, %r1051, %r1052 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r843, %r844, %r845, %r846 }, { %r3623, %r3624 }, { %r839, %r840, %r841, %r842 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r843, %r844, %r845, %r846 }, { %r3625, %r3626 }, { %r853, %r854, %r855, %r856 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r843, %r844, %r845, %r846 }, { %r3627, %r3628 }, { %r867, %r868, %r869, %r870 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r843, %r844, %r845, %r846 }, { %r3629, %r3630 }, { %r881, %r882, %r883, %r884 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r843, %r844, %r845, %r846 }, { %r3631, %r3632 }, { %r895, %r896, 
%r897, %r898 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r843, %r844, %r845, %r846 }, { %r3633, %r3634 }, { %r909, %r910, %r911, %r912 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r843, %r844, %r845, %r846 }, { %r3635, %r3636 }, { %r923, %r924, %r925, %r926 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r843, %r844, %r845, %r846 }, { %r3637, %r3638 }, { %r937, %r938, %r939, %r940 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r843, %r844, %r845, %r846 }, { %r3639, %r3640 }, { %r951, %r952, %r953, %r954 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r843, %r844, %r845, %r846 }, { %r3641, %r3642 }, { %r965, %r966, %r967, %r968 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r843, %r844, %r845, %r846 }, { %r3643, %r3644 }, { %r979, %r980, %r981, %r982 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r843, %r844, %r845, %r846 }, { %r3645, %r3646 }, { %r993, %r994, %r995, %r996 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r843, %r844, %r845, %r846 }, { %r3647, %r3648 }, { %r1007, %r1008, %r1009, %r1010 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r843, %r844, %r845, %r846 }, { %r3649, %r3650 }, { %r1021, %r1022, %r1023, %r1024 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r843, %r844, %r845, %r846 }, { %r3651, %r3652 }, { %r1035, %r1036, %r1037, %r1038 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r843, %r844, %r845, %r846 }, { %r3653, %r3654 }, { %r1049, %r1050, %r1051, %r1052 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3655, %r3656 }, { %r839, %r840, %r841, %r842 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3657, %r3658 }, { %r853, %r854, %r855, %r856 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3659, %r3660 }, { %r867, %r868, %r869, %r870 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3661, %r3662 }, { %r881, %r882, %r883, %r884 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3663, %r3664 }, { %r895, %r896, %r897, %r898 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3665, %r3666 }, { %r909, %r910, %r911, %r912 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3667, %r3668 }, { %r923, %r924, %r925, %r926 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3669, %r3670 }, { %r937, %r938, %r939, %r940 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3671, %r3672 }, { %r951, %r952, %r953, %r954 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1067, %r1068, %r1069, 
%r1070 }, { %r3673, %r3674 }, { %r965, %r966, %r967, %r968 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3675, %r3676 }, { %r979, %r980, %r981, %r982 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3677, %r3678 }, { %r993, %r994, %r995, %r996 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3679, %r3680 }, { %r1007, %r1008, %r1009, %r1010 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3681, %r3682 }, { %r1021, %r1022, %r1023, %r1024 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3683, %r3684 }, { %r1035, %r1036, %r1037, %r1038 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3685, %r3686 }, { %r1049, %r1050, %r1051, %r1052 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3687, %r3688 }, { %r839, %r840, %r841, %r842 }; + mov.b32 %f162, %r842; + mov.b32 %f163, %r841; + mov.b32 %f164, %r840; + mov.b32 %f165, %r839; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3689, %r3690 }, { %r853, %r854, %r855, %r856 }; + mov.b32 %f166, %r856; + mov.b32 %f167, %r855; + mov.b32 %f168, %r854; + mov.b32 %f169, %r853; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3691, %r3692 }, { %r867, %r868, %r869, %r870 }; + mov.b32 %f170, %r870; + mov.b32 %f171, %r869; + mov.b32 %f172, %r868; + mov.b32 %f173, %r867; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3693, %r3694 }, { %r881, %r882, %r883, %r884 }; + mov.b32 %f174, %r884; + mov.b32 %f175, %r883; + mov.b32 %f176, %r882; + mov.b32 %f177, %r881; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3695, %r3696 }, { %r895, %r896, %r897, %r898 }; + mov.b32 %f178, %r898; + mov.b32 %f179, %r897; + mov.b32 %f180, %r896; + mov.b32 %f181, %r895; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3697, %r3698 }, { %r909, %r910, %r911, %r912 }; + mov.b32 %f182, %r912; + mov.b32 %f183, %r911; + mov.b32 %f184, %r910; + mov.b32 %f185, %r909; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3699, %r3700 }, { %r923, %r924, %r925, %r926 }; + mov.b32 %f186, %r926; + mov.b32 %f187, %r925; + mov.b32 %f188, %r924; + mov.b32 %f189, %r923; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3701, %r3702 }, { %r937, %r938, %r939, %r940 }; + mov.b32 %f190, %r940; + mov.b32 %f191, %r939; + mov.b32 %f192, %r938; + mov.b32 %f193, %r937; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3703, %r3704 }, { %r951, %r952, %r953, %r954 }; + mov.b32 %f194, %r954; + mov.b32 %f195, %r953; + mov.b32 %f196, %r952; + mov.b32 %f197, %r951; + 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3705, %r3706 }, { %r965, %r966, %r967, %r968 }; + mov.b32 %f198, %r968; + mov.b32 %f199, %r967; + mov.b32 %f200, %r966; + mov.b32 %f201, %r965; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3707, %r3708 }, { %r979, %r980, %r981, %r982 }; + mov.b32 %f202, %r982; + mov.b32 %f203, %r981; + mov.b32 %f204, %r980; + mov.b32 %f205, %r979; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3709, %r3710 }, { %r993, %r994, %r995, %r996 }; + mov.b32 %f206, %r996; + mov.b32 %f207, %r995; + mov.b32 %f208, %r994; + mov.b32 %f209, %r993; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3711, %r3712 }, { %r1007, %r1008, %r1009, %r1010 }; + mov.b32 %f210, %r1010; + mov.b32 %f211, %r1009; + mov.b32 %f212, %r1008; + mov.b32 %f213, %r1007; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3713, %r3714 }, { %r1021, %r1022, %r1023, %r1024 }; + mov.b32 %f214, %r1024; + mov.b32 %f215, %r1023; + mov.b32 %f216, %r1022; + mov.b32 %f217, %r1021; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3715, %r3716 }, { %r1035, %r1036, %r1037, %r1038 }; + mov.b32 %f218, %r1038; + mov.b32 %f219, %r1037; + mov.b32 %f220, %r1036; + mov.b32 %f221, %r1035; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3717, %r3718 }, { %r1049, %r1050, %r1051, %r1052 }; + mov.b32 %f222, %r1052; + mov.b32 %f223, %r1051; + mov.b32 %f224, %r1050; + mov.b32 %f225, %r1049; + @%p19 ld.global.v4.b32 { %r3531, %r3532, %r3533, %r3534 }, [ %rd50 + 0 ]; + mov.b32 %hh273, %r3531; + mov.b32 %hh274, %r3532; + mov.b32 %hh275, %r3533; + mov.b32 %hh276, %r3534; + @%p19 ld.global.v4.b32 { %r3535, %r3536, %r3537, %r3538 }, [ %rd51 + 0 ]; + mov.b32 %hh277, %r3535; + mov.b32 %hh278, %r3536; + mov.b32 %hh279, %r3537; + mov.b32 %hh280, %r3538; + @%p19 ld.global.v4.b32 { %r3539, %r3540, %r3541, %r3542 }, [ %rd52 + 0 ]; + mov.b32 %hh281, %r3539; + mov.b32 %hh282, %r3540; + mov.b32 %hh283, %r3541; + mov.b32 %hh284, %r3542; + @%p19 ld.global.v4.b32 { %r3543, %r3544, %r3545, %r3546 }, [ %rd53 + 0 ]; + mov.b32 %hh285, %r3543; + mov.b32 %hh286, %r3544; + mov.b32 %hh287, %r3545; + mov.b32 %hh288, %r3546; + cvt.rn.f16.f32 %h65, %f165; + cvt.rn.f16.f32 %h66, %f164; + cvt.rn.f16.f32 %h67, %f163; + cvt.rn.f16.f32 %h68, %f162; + cvt.rn.f16.f32 %h69, %f169; + cvt.rn.f16.f32 %h70, %f168; + cvt.rn.f16.f32 %h71, %f167; + cvt.rn.f16.f32 %h72, %f166; + cvt.rn.f16.f32 %h73, %f173; + cvt.rn.f16.f32 %h74, %f172; + cvt.rn.f16.f32 %h75, %f171; + cvt.rn.f16.f32 %h76, %f170; + cvt.rn.f16.f32 %h77, %f177; + cvt.rn.f16.f32 %h78, %f176; + cvt.rn.f16.f32 %h79, %f175; + cvt.rn.f16.f32 %h80, %f174; + cvt.rn.f16.f32 %h81, %f181; + cvt.rn.f16.f32 %h82, %f180; + cvt.rn.f16.f32 %h83, %f179; + cvt.rn.f16.f32 %h84, %f178; + cvt.rn.f16.f32 %h85, %f185; + cvt.rn.f16.f32 %h86, %f184; + cvt.rn.f16.f32 %h87, %f183; + cvt.rn.f16.f32 %h88, %f182; + cvt.rn.f16.f32 %h89, %f189; + cvt.rn.f16.f32 %h90, %f188; + cvt.rn.f16.f32 %h91, %f187; + cvt.rn.f16.f32 %h92, %f186; + cvt.rn.f16.f32 %h93, %f193; + cvt.rn.f16.f32 %h94, %f192; + 
cvt.rn.f16.f32 %h95, %f191; + cvt.rn.f16.f32 %h96, %f190; + cvt.rn.f16.f32 %h97, %f197; + cvt.rn.f16.f32 %h98, %f196; + cvt.rn.f16.f32 %h99, %f195; + cvt.rn.f16.f32 %h100, %f194; + cvt.rn.f16.f32 %h101, %f201; + cvt.rn.f16.f32 %h102, %f200; + cvt.rn.f16.f32 %h103, %f199; + cvt.rn.f16.f32 %h104, %f198; + cvt.rn.f16.f32 %h105, %f205; + cvt.rn.f16.f32 %h106, %f204; + cvt.rn.f16.f32 %h107, %f203; + cvt.rn.f16.f32 %h108, %f202; + cvt.rn.f16.f32 %h109, %f209; + cvt.rn.f16.f32 %h110, %f208; + cvt.rn.f16.f32 %h111, %f207; + cvt.rn.f16.f32 %h112, %f206; + cvt.rn.f16.f32 %h113, %f213; + cvt.rn.f16.f32 %h114, %f212; + cvt.rn.f16.f32 %h115, %f211; + cvt.rn.f16.f32 %h116, %f210; + cvt.rn.f16.f32 %h117, %f217; + cvt.rn.f16.f32 %h118, %f216; + cvt.rn.f16.f32 %h119, %f215; + cvt.rn.f16.f32 %h120, %f214; + cvt.rn.f16.f32 %h121, %f221; + cvt.rn.f16.f32 %h122, %f220; + cvt.rn.f16.f32 %h123, %f219; + cvt.rn.f16.f32 %h124, %f218; + cvt.rn.f16.f32 %h125, %f225; + cvt.rn.f16.f32 %h126, %f224; + cvt.rn.f16.f32 %h127, %f223; + cvt.rn.f16.f32 %h128, %f222; + bar.sync 0; + st.shared.b16 [%r28], %h65; + st.shared.b16 [%r28+256], %h66; + st.shared.b16 [%r28+16], %h67; + st.shared.b16 [%r30+16], %h68; + st.shared.b16 [%r28+2048], %h69; + st.shared.b16 [%r28+2304], %h70; + st.shared.b16 [%r31+16], %h71; + st.shared.b16 [%r32+16], %h72; + st.shared.b16 [%r28+4096], %h73; + st.shared.b16 [%r28+4352], %h74; + st.shared.b16 [%r33+16], %h75; + st.shared.b16 [%r34+16], %h76; + st.shared.b16 [%r28+6144], %h77; + st.shared.b16 [%r28+6400], %h78; + st.shared.b16 [%r35+16], %h79; + st.shared.b16 [%r36+16], %h80; + st.shared.b16 [%r28+8192], %h81; + st.shared.b16 [%r28+8448], %h82; + st.shared.b16 [%r37+16], %h83; + st.shared.b16 [%r38+16], %h84; + st.shared.b16 [%r28+10240], %h85; + st.shared.b16 [%r28+10496], %h86; + st.shared.b16 [%r39+16], %h87; + st.shared.b16 [%r40+16], %h88; + st.shared.b16 [%r28+12288], %h89; + st.shared.b16 [%r28+12544], %h90; + st.shared.b16 [%r41+16], %h91; + st.shared.b16 [%r42+16], %h92; + st.shared.b16 [%r28+14336], %h93; + st.shared.b16 [%r28+14592], %h94; + st.shared.b16 [%r43+16], %h95; + st.shared.b16 [%r44+16], %h96; + st.shared.b16 [%r28+16384], %h97; + st.shared.b16 [%r28+16640], %h98; + st.shared.b16 [%r45+16], %h99; + st.shared.b16 [%r46+16], %h100; + st.shared.b16 [%r28+18432], %h101; + st.shared.b16 [%r28+18688], %h102; + st.shared.b16 [%r47+16], %h103; + st.shared.b16 [%r48+16], %h104; + st.shared.b16 [%r28+20480], %h105; + st.shared.b16 [%r28+20736], %h106; + st.shared.b16 [%r49+16], %h107; + st.shared.b16 [%r50+16], %h108; + st.shared.b16 [%r28+22528], %h109; + st.shared.b16 [%r28+22784], %h110; + st.shared.b16 [%r51+16], %h111; + st.shared.b16 [%r52+16], %h112; + st.shared.b16 [%r28+24576], %h113; + st.shared.b16 [%r28+24832], %h114; + st.shared.b16 [%r53+16], %h115; + st.shared.b16 [%r54+16], %h116; + st.shared.b16 [%r28+26624], %h117; + st.shared.b16 [%r28+26880], %h118; + st.shared.b16 [%r55+16], %h119; + st.shared.b16 [%r56+16], %h120; + st.shared.b16 [%r28+28672], %h121; + st.shared.b16 [%r28+28928], %h122; + st.shared.b16 [%r57+16], %h123; + st.shared.b16 [%r58+16], %h124; + st.shared.b16 [%r28+30720], %h125; + st.shared.b16 [%r28+30976], %h126; + st.shared.b16 [%r59+16], %h127; + st.shared.b16 [%r60+16], %h128; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1691, %r1692, %r1693, %r1694 }, [ %r1531 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1803, %r1804, %r1805, %r1806 }, [ %r1536 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1915, %r1916, %r1917, 
%r1918 }, [ %r1541 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2027, %r2028, %r2029, %r2030 }, [ %r1546 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2139, %r2140, %r2141, %r2142 }, [ %r1551 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2251, %r2252, %r2253, %r2254 }, [ %r1556 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2363, %r2364, %r2365, %r2366 }, [ %r1561 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2475, %r2476, %r2477, %r2478 }, [ %r1566 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1747, %r1748, %r1749, %r1750 }, [ %r1571 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1859, %r1860, %r1861, %r1862 }, [ %r1576 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1971, %r1972, %r1973, %r1974 }, [ %r1581 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2083, %r2084, %r2085, %r2086 }, [ %r1586 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2195, %r2196, %r2197, %r2198 }, [ %r1591 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2307, %r2308, %r2309, %r2310 }, [ %r1596 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2419, %r2420, %r2421, %r2422 }, [ %r1601 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2531, %r2532, %r2533, %r2534 }, [ %r1606 + 0 ]; + bar.sync 0; + st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534}; + st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538}; + st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542}; + st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1695, %r1696, %r1709, %r1710 }, [ %r1611 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1807, %r1808, %r1821, %r1822 }, [ %r1616 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1919, %r1920, %r1933, %r1934 }, [ %r1621 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2031, %r2032, %r2045, %r2046 }, [ %r1626 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2143, %r2144, %r2157, %r2158 }, [ %r1631 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2255, %r2256, %r2269, %r2270 }, [ %r1636 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2367, %r2368, %r2381, %r2382 }, [ %r1641 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2479, %r2480, %r2493, %r2494 }, [ %r1646 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1723, %r1724, %r1737, %r1738 }, [ %r1651 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1835, %r1836, %r1849, %r1850 }, [ %r1656 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1947, %r1948, %r1961, %r1962 }, [ %r1661 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2059, %r2060, %r2073, %r2074 }, [ %r1666 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2171, %r2172, %r2185, %r2186 }, [ %r1671 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2283, %r2284, %r2297, %r2298 }, [ %r1676 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2395, %r2396, %r2409, %r2410 }, [ %r1681 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2507, %r2508, %r2521, %r2522 }, [ %r1686 + 0 ]; + mov.b32 %r1799, %f515; + mov.b32 %r1800, %f516; + mov.b32 %r1801, %f517; + mov.b32 %r1802, %f518; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1695, %r1696 }, { %r1799, %r1800, %r1801, %r1802 }; + mov.b32 %r1813, %f519; + mov.b32 %r1814, %f520; + mov.b32 %r1815, %f521; + mov.b32 %r1816, %f522; + 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1709, %r1710 }, { %r1813, %r1814, %r1815, %r1816 }; + mov.b32 %r1827, %f523; + mov.b32 %r1828, %f524; + mov.b32 %r1829, %f525; + mov.b32 %r1830, %f526; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1723, %r1724 }, { %r1827, %r1828, %r1829, %r1830 }; + mov.b32 %r1841, %f527; + mov.b32 %r1842, %f528; + mov.b32 %r1843, %f529; + mov.b32 %r1844, %f530; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1737, %r1738 }, { %r1841, %r1842, %r1843, %r1844 }; + mov.b32 %r1855, %f531; + mov.b32 %r1856, %f532; + mov.b32 %r1857, %f533; + mov.b32 %r1858, %f534; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1695, %r1696 }, { %r1855, %r1856, %r1857, %r1858 }; + mov.b32 %r1869, %f535; + mov.b32 %r1870, %f536; + mov.b32 %r1871, %f537; + mov.b32 %r1872, %f538; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1709, %r1710 }, { %r1869, %r1870, %r1871, %r1872 }; + mov.b32 %r1883, %f539; + mov.b32 %r1884, %f540; + mov.b32 %r1885, %f541; + mov.b32 %r1886, %f542; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1723, %r1724 }, { %r1883, %r1884, %r1885, %r1886 }; + mov.b32 %r1897, %f543; + mov.b32 %r1898, %f544; + mov.b32 %r1899, %f545; + mov.b32 %r1900, %f546; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1737, %r1738 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1807, %r1808 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1821, %r1822 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1835, %r1836 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1849, %r1850 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1807, %r1808 }, { %r1855, %r1856, %r1857, %r1858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1821, %r1822 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1835, %r1836 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1849, %r1850 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1919, %r1920 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, 
%r1815, %r1816 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1933, %r1934 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1947, %r1948 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1961, %r1962 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1919, %r1920 }, { %r1855, %r1856, %r1857, %r1858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1933, %r1934 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1947, %r1948 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1961, %r1962 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2031, %r2032 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2045, %r2046 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2059, %r2060 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2073, %r2074 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2031, %r2032 }, { %r1855, %r1856, %r1857, %r1858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2045, %r2046 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2059, %r2060 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2073, %r2074 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2143, %r2144 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2157, %r2158 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2171, %r2172 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2185, %r2186 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2143, %r2144 }, { %r1855, %r1856, %r1857, %r1858 }; + 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2157, %r2158 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2171, %r2172 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2185, %r2186 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2255, %r2256 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2269, %r2270 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2283, %r2284 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2297, %r2298 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2255, %r2256 }, { %r1855, %r1856, %r1857, %r1858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2269, %r2270 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2283, %r2284 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2297, %r2298 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2367, %r2368 }, { %r1799, %r1800, %r1801, %r1802 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2381, %r2382 }, { %r1813, %r1814, %r1815, %r1816 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2395, %r2396 }, { %r1827, %r1828, %r1829, %r1830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2409, %r2410 }, { %r1841, %r1842, %r1843, %r1844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2367, %r2368 }, { %r1855, %r1856, %r1857, %r1858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2381, %r2382 }, { %r1869, %r1870, %r1871, %r1872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2395, %r2396 }, { %r1883, %r1884, %r1885, %r1886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2409, %r2410 }, { %r1897, %r1898, %r1899, %r1900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2479, 
%r2480 }, { %r1799, %r1800, %r1801, %r1802 }; + mov.b32 %f518, %r1802; + mov.b32 %f517, %r1801; + mov.b32 %f516, %r1800; + mov.b32 %f515, %r1799; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2493, %r2494 }, { %r1813, %r1814, %r1815, %r1816 }; + mov.b32 %f522, %r1816; + mov.b32 %f521, %r1815; + mov.b32 %f520, %r1814; + mov.b32 %f519, %r1813; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2507, %r2508 }, { %r1827, %r1828, %r1829, %r1830 }; + mov.b32 %f526, %r1830; + mov.b32 %f525, %r1829; + mov.b32 %f524, %r1828; + mov.b32 %f523, %r1827; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2521, %r2522 }, { %r1841, %r1842, %r1843, %r1844 }; + mov.b32 %f530, %r1844; + mov.b32 %f529, %r1843; + mov.b32 %f528, %r1842; + mov.b32 %f527, %r1841; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2479, %r2480 }, { %r1855, %r1856, %r1857, %r1858 }; + mov.b32 %f534, %r1858; + mov.b32 %f533, %r1857; + mov.b32 %f532, %r1856; + mov.b32 %f531, %r1855; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2493, %r2494 }, { %r1869, %r1870, %r1871, %r1872 }; + mov.b32 %f538, %r1872; + mov.b32 %f537, %r1871; + mov.b32 %f536, %r1870; + mov.b32 %f535, %r1869; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2507, %r2508 }, { %r1883, %r1884, %r1885, %r1886 }; + mov.b32 %f542, %r1886; + mov.b32 %f541, %r1885; + mov.b32 %f540, %r1884; + mov.b32 %f539, %r1883; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2521, %r2522 }, { %r1897, %r1898, %r1899, %r1900 }; + mov.b32 %f546, %r1900; + mov.b32 %f545, %r1899; + mov.b32 %f544, %r1898; + mov.b32 %f543, %r1897; + bar.sync 0; + st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534}; + st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538}; + st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542}; + st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546}; + bar.sync 0; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2607, %r2608, %r2609, %r2610 }, [ %r599 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2831, %r2832, %r2833, %r2834 }, [ %r604 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3055, %r3056, %r3057, %r3058 }, [ %r609 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3279, %r3280, %r3281, %r3282 }, [ %r614 + 0 ]; + mov.u32 %r2827, %r625; + mov.u32 %r2828, %r625; + mov.u32 %r2829, %r625; + mov.u32 %r2830, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3719, %r3720 }, { %r2827, %r2828, %r2829, %r2830 }; + mov.u32 %r2841, %r625; + mov.u32 %r2842, %r625; + mov.u32 %r2843, %r625; + mov.u32 %r2844, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3721, %r3722 }, { %r2841, %r2842, %r2843, %r2844 }; + mov.u32 %r2855, %r625; + mov.u32 %r2856, %r625; + mov.u32 %r2857, %r625; + mov.u32 %r2858, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3723, %r3724 
}, { %r2855, %r2856, %r2857, %r2858 }; + mov.u32 %r2869, %r625; + mov.u32 %r2870, %r625; + mov.u32 %r2871, %r625; + mov.u32 %r2872, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3725, %r3726 }, { %r2869, %r2870, %r2871, %r2872 }; + mov.u32 %r2883, %r625; + mov.u32 %r2884, %r625; + mov.u32 %r2885, %r625; + mov.u32 %r2886, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3727, %r3728 }, { %r2883, %r2884, %r2885, %r2886 }; + mov.u32 %r2897, %r625; + mov.u32 %r2898, %r625; + mov.u32 %r2899, %r625; + mov.u32 %r2900, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3729, %r3730 }, { %r2897, %r2898, %r2899, %r2900 }; + mov.u32 %r2911, %r625; + mov.u32 %r2912, %r625; + mov.u32 %r2913, %r625; + mov.u32 %r2914, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3731, %r3732 }, { %r2911, %r2912, %r2913, %r2914 }; + mov.u32 %r2925, %r625; + mov.u32 %r2926, %r625; + mov.u32 %r2927, %r625; + mov.u32 %r2928, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3733, %r3734 }, { %r2925, %r2926, %r2927, %r2928 }; + mov.u32 %r2939, %r625; + mov.u32 %r2940, %r625; + mov.u32 %r2941, %r625; + mov.u32 %r2942, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3735, %r3736 }, { %r2939, %r2940, %r2941, %r2942 }; + mov.u32 %r2953, %r625; + mov.u32 %r2954, %r625; + mov.u32 %r2955, %r625; + mov.u32 %r2956, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3737, %r3738 }, { %r2953, %r2954, %r2955, %r2956 }; + mov.u32 %r2967, %r625; + mov.u32 %r2968, %r625; + mov.u32 %r2969, %r625; + mov.u32 %r2970, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3739, %r3740 }, { %r2967, %r2968, %r2969, %r2970 }; + mov.u32 %r2981, %r625; + mov.u32 %r2982, %r625; + mov.u32 %r2983, %r625; + mov.u32 %r2984, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3741, %r3742 }, { %r2981, %r2982, %r2983, %r2984 }; + mov.u32 %r2995, %r625; + mov.u32 %r2996, %r625; + mov.u32 %r2997, %r625; + mov.u32 %r2998, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3743, %r3744 }, { %r2995, %r2996, %r2997, %r2998 }; + mov.u32 %r3009, %r625; + mov.u32 %r3010, %r625; + mov.u32 %r3011, %r625; + mov.u32 %r3012, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3745, %r3746 }, { %r3009, %r3010, %r3011, %r3012 }; + mov.u32 %r3023, %r625; + mov.u32 %r3024, %r625; + mov.u32 %r3025, %r625; + mov.u32 %r3026, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3747, %r3748 }, { %r3023, %r3024, %r3025, %r3026 }; + mov.u32 %r3040, %r625; + mov.u32 %r3037, %r625; + mov.u32 %r3038, %r625; + mov.u32 %r3039, %r625; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { 
%r3037, %r3038, %r3039, %r3040 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3749, %r3750 }, { %r3037, %r3038, %r3039, %r3040 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3751, %r3752 }, { %r2827, %r2828, %r2829, %r2830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3753, %r3754 }, { %r2841, %r2842, %r2843, %r2844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3755, %r3756 }, { %r2855, %r2856, %r2857, %r2858 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3757, %r3758 }, { %r2869, %r2870, %r2871, %r2872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3759, %r3760 }, { %r2883, %r2884, %r2885, %r2886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3761, %r3762 }, { %r2897, %r2898, %r2899, %r2900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3763, %r3764 }, { %r2911, %r2912, %r2913, %r2914 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3765, %r3766 }, { %r2925, %r2926, %r2927, %r2928 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3767, %r3768 }, { %r2939, %r2940, %r2941, %r2942 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3769, %r3770 }, { %r2953, %r2954, %r2955, %r2956 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3771, %r3772 }, { %r2967, %r2968, %r2969, %r2970 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3773, %r3774 }, { %r2981, %r2982, %r2983, %r2984 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3775, %r3776 }, { %r2995, %r2996, %r2997, %r2998 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3777, %r3778 }, { %r3009, %r3010, %r3011, %r3012 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3779, %r3780 }, { %r3023, %r3024, %r3025, %r3026 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3781, %r3782 }, { %r3037, %r3038, %r3039, %r3040 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3783, %r3784 }, { %r2827, %r2828, %r2829, %r2830 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3785, %r3786 }, { %r2841, %r2842, %r2843, %r2844 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3787, %r3788 }, { %r2855, %r2856, %r2857, %r2858 }; + 
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3789, %r3790 }, { %r2869, %r2870, %r2871, %r2872 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3791, %r3792 }, { %r2883, %r2884, %r2885, %r2886 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3793, %r3794 }, { %r2897, %r2898, %r2899, %r2900 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3795, %r3796 }, { %r2911, %r2912, %r2913, %r2914 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3797, %r3798 }, { %r2925, %r2926, %r2927, %r2928 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3799, %r3800 }, { %r2939, %r2940, %r2941, %r2942 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3801, %r3802 }, { %r2953, %r2954, %r2955, %r2956 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3803, %r3804 }, { %r2967, %r2968, %r2969, %r2970 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3805, %r3806 }, { %r2981, %r2982, %r2983, %r2984 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3807, %r3808 }, { %r2995, %r2996, %r2997, %r2998 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3809, %r3810 }, { %r3009, %r3010, %r3011, %r3012 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3811, %r3812 }, { %r3023, %r3024, %r3025, %r3026 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3813, %r3814 }, { %r3037, %r3038, %r3039, %r3040 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3815, %r3816 }, { %r2827, %r2828, %r2829, %r2830 }; + mov.b32 %f226, %r2830; + mov.b32 %f227, %r2829; + mov.b32 %f228, %r2828; + mov.b32 %f229, %r2827; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3817, %r3818 }, { %r2841, %r2842, %r2843, %r2844 }; + mov.b32 %f230, %r2844; + mov.b32 %f231, %r2843; + mov.b32 %f232, %r2842; + mov.b32 %f233, %r2841; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3819, %r3820 }, { %r2855, %r2856, %r2857, %r2858 }; + mov.b32 %f234, %r2858; + mov.b32 %f235, %r2857; + mov.b32 %f236, %r2856; + mov.b32 %f237, %r2855; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3821, %r3822 }, { %r2869, %r2870, %r2871, %r2872 }; + mov.b32 %f238, %r2872; + mov.b32 %f239, %r2871; + mov.b32 %f240, %r2870; + mov.b32 %f241, %r2869; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { 
%r3279, %r3280, %r3281, %r3282 }, { %r3823, %r3824 }, { %r2883, %r2884, %r2885, %r2886 }; + mov.b32 %f242, %r2886; + mov.b32 %f243, %r2885; + mov.b32 %f244, %r2884; + mov.b32 %f245, %r2883; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3825, %r3826 }, { %r2897, %r2898, %r2899, %r2900 }; + mov.b32 %f246, %r2900; + mov.b32 %f247, %r2899; + mov.b32 %f248, %r2898; + mov.b32 %f249, %r2897; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3827, %r3828 }, { %r2911, %r2912, %r2913, %r2914 }; + mov.b32 %f250, %r2914; + mov.b32 %f251, %r2913; + mov.b32 %f252, %r2912; + mov.b32 %f253, %r2911; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3829, %r3830 }, { %r2925, %r2926, %r2927, %r2928 }; + mov.b32 %f254, %r2928; + mov.b32 %f255, %r2927; + mov.b32 %f256, %r2926; + mov.b32 %f257, %r2925; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3831, %r3832 }, { %r2939, %r2940, %r2941, %r2942 }; + mov.b32 %f258, %r2942; + mov.b32 %f259, %r2941; + mov.b32 %f260, %r2940; + mov.b32 %f261, %r2939; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3833, %r3834 }, { %r2953, %r2954, %r2955, %r2956 }; + mov.b32 %f262, %r2956; + mov.b32 %f263, %r2955; + mov.b32 %f264, %r2954; + mov.b32 %f265, %r2953; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3835, %r3836 }, { %r2967, %r2968, %r2969, %r2970 }; + mov.b32 %f266, %r2970; + mov.b32 %f267, %r2969; + mov.b32 %f268, %r2968; + mov.b32 %f269, %r2967; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3837, %r3838 }, { %r2981, %r2982, %r2983, %r2984 }; + mov.b32 %f270, %r2984; + mov.b32 %f271, %r2983; + mov.b32 %f272, %r2982; + mov.b32 %f273, %r2981; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3839, %r3840 }, { %r2995, %r2996, %r2997, %r2998 }; + mov.b32 %f274, %r2998; + mov.b32 %f275, %r2997; + mov.b32 %f276, %r2996; + mov.b32 %f277, %r2995; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3841, %r3842 }, { %r3009, %r3010, %r3011, %r3012 }; + mov.b32 %f278, %r3012; + mov.b32 %f279, %r3011; + mov.b32 %f280, %r3010; + mov.b32 %f281, %r3009; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3843, %r3844 }, { %r3023, %r3024, %r3025, %r3026 }; + mov.b32 %f282, %r3026; + mov.b32 %f283, %r3025; + mov.b32 %f284, %r3024; + mov.b32 %f285, %r3023; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3845, %r3846 }, { %r3037, %r3038, %r3039, %r3040 }; + mov.b32 %f286, %r3040; + mov.b32 %f287, %r3039; + mov.b32 %f288, %r3038; + mov.b32 %f289, %r3037; + mul.f32 %f290, %f165, %f229; + mul.f32 %f291, %f164, %f228; + mul.f32 %f292, %f163, %f227; + mul.f32 %f293, %f162, %f226; + mul.f32 %f294, %f169, %f233; + mul.f32 %f295, %f168, %f232; + mul.f32 %f296, %f167, %f231; + mul.f32 %f297, %f166, %f230; + mul.f32 %f298, %f173, %f237; + 
mul.f32 %f299, %f172, %f236; + mul.f32 %f300, %f171, %f235; + mul.f32 %f301, %f170, %f234; + mul.f32 %f302, %f177, %f241; + mul.f32 %f303, %f176, %f240; + mul.f32 %f304, %f175, %f239; + mul.f32 %f305, %f174, %f238; + mul.f32 %f306, %f181, %f245; + mul.f32 %f307, %f180, %f244; + mul.f32 %f308, %f179, %f243; + mul.f32 %f309, %f178, %f242; + mul.f32 %f310, %f185, %f249; + mul.f32 %f311, %f184, %f248; + mul.f32 %f312, %f183, %f247; + mul.f32 %f313, %f182, %f246; + mul.f32 %f314, %f189, %f253; + mul.f32 %f315, %f188, %f252; + mul.f32 %f316, %f187, %f251; + mul.f32 %f317, %f186, %f250; + mul.f32 %f318, %f193, %f257; + mul.f32 %f319, %f192, %f256; + mul.f32 %f320, %f191, %f255; + mul.f32 %f321, %f190, %f254; + mul.f32 %f322, %f197, %f261; + mul.f32 %f323, %f196, %f260; + mul.f32 %f324, %f195, %f259; + mul.f32 %f325, %f194, %f258; + mul.f32 %f326, %f201, %f265; + mul.f32 %f327, %f200, %f264; + mul.f32 %f328, %f199, %f263; + mul.f32 %f329, %f198, %f262; + mul.f32 %f330, %f205, %f269; + mul.f32 %f331, %f204, %f268; + mul.f32 %f332, %f203, %f267; + mul.f32 %f333, %f202, %f266; + mul.f32 %f334, %f209, %f273; + mul.f32 %f335, %f208, %f272; + mul.f32 %f336, %f207, %f271; + mul.f32 %f337, %f206, %f270; + mul.f32 %f338, %f213, %f277; + mul.f32 %f339, %f212, %f276; + mul.f32 %f340, %f211, %f275; + mul.f32 %f341, %f210, %f274; + mul.f32 %f342, %f217, %f281; + mul.f32 %f343, %f216, %f280; + mul.f32 %f344, %f215, %f279; + mul.f32 %f345, %f214, %f278; + mul.f32 %f346, %f221, %f285; + mul.f32 %f347, %f220, %f284; + mul.f32 %f348, %f219, %f283; + mul.f32 %f349, %f218, %f282; + mul.f32 %f350, %f225, %f289; + mul.f32 %f351, %f224, %f288; + mul.f32 %f352, %f223, %f287; + mul.f32 %f353, %f222, %f286; + mul.f32 %f354, %f290, %f97; + mul.f32 %f355, %f291, %f97; + mul.f32 %f356, %f292, %f97; + mul.f32 %f357, %f293, %f97; + mul.f32 %f358, %f294, %f97; + mul.f32 %f359, %f295, %f97; + mul.f32 %f360, %f296, %f97; + mul.f32 %f361, %f297, %f97; + mul.f32 %f362, %f298, %f97; + mul.f32 %f363, %f299, %f97; + mul.f32 %f364, %f300, %f97; + mul.f32 %f365, %f301, %f97; + mul.f32 %f366, %f302, %f97; + mul.f32 %f367, %f303, %f97; + mul.f32 %f368, %f304, %f97; + mul.f32 %f369, %f305, %f97; + mul.f32 %f370, %f306, %f97; + mul.f32 %f371, %f307, %f97; + mul.f32 %f372, %f308, %f97; + mul.f32 %f373, %f309, %f97; + mul.f32 %f374, %f310, %f97; + mul.f32 %f375, %f311, %f97; + mul.f32 %f376, %f312, %f97; + mul.f32 %f377, %f313, %f97; + mul.f32 %f378, %f314, %f97; + mul.f32 %f379, %f315, %f97; + mul.f32 %f380, %f316, %f97; + mul.f32 %f381, %f317, %f97; + mul.f32 %f382, %f318, %f97; + mul.f32 %f383, %f319, %f97; + mul.f32 %f384, %f320, %f97; + mul.f32 %f385, %f321, %f97; + mul.f32 %f386, %f322, %f97; + mul.f32 %f387, %f323, %f97; + mul.f32 %f388, %f324, %f97; + mul.f32 %f389, %f325, %f97; + mul.f32 %f390, %f326, %f97; + mul.f32 %f391, %f327, %f97; + mul.f32 %f392, %f328, %f97; + mul.f32 %f393, %f329, %f97; + mul.f32 %f394, %f330, %f97; + mul.f32 %f395, %f331, %f97; + mul.f32 %f396, %f332, %f97; + mul.f32 %f397, %f333, %f97; + mul.f32 %f398, %f334, %f97; + mul.f32 %f399, %f335, %f97; + mul.f32 %f400, %f336, %f97; + mul.f32 %f401, %f337, %f97; + mul.f32 %f402, %f338, %f97; + mul.f32 %f403, %f339, %f97; + mul.f32 %f404, %f340, %f97; + mul.f32 %f405, %f341, %f97; + mul.f32 %f406, %f342, %f97; + mul.f32 %f407, %f343, %f97; + mul.f32 %f408, %f344, %f97; + mul.f32 %f409, %f345, %f97; + mul.f32 %f410, %f346, %f97; + mul.f32 %f411, %f347, %f97; + mul.f32 %f412, %f348, %f97; + mul.f32 %f413, %f349, %f97; + mul.f32 %f414, %f350, %f97; + mul.f32 %f415, 
%f351, %f97; + mul.f32 %f416, %f352, %f97; + mul.f32 %f417, %f353, %f97; + bar.sync 0; + st.shared.b32 [%r28], %f254; + st.shared.b32 [%r28+256], %f255; + st.shared.b32 [%r28+16], %f256; + st.shared.b32 [%r28+16], %f257; + st.shared.b32 [%r28+2048], %f258; + st.shared.b32 [%r28+2304], %f259; + st.shared.b32 [%r28+16], %f260; + st.shared.b32 [%r28+16], %f261; + st.shared.b32 [%r28+4096], %f262; + st.shared.b32 [%r28+4352], %f263; + st.shared.b32 [%r28+16], %f264; + st.shared.b32 [%r28+16], %f265; + st.shared.b32 [%r28+6144], %f266; + st.shared.b32 [%r28+6400], %f267; + st.shared.b32 [%r28+16], %f268; + st.shared.b32 [%r28+16], %f269; + st.shared.b32 [%r28+8192], %f270; + st.shared.b32 [%r28+8448], %f271; + st.shared.b32 [%r28+16], %f272; + st.shared.b32 [%r28+16], %f273; + st.shared.b32 [%r28+10240], %f274; + st.shared.b32 [%r28+10496], %f275; + st.shared.b32 [%r28+16], %f276; + st.shared.b32 [%r28+16], %f277; + st.shared.b32 [%r28+12288], %f278; + st.shared.b32 [%r28+12544], %f279; + st.shared.b32 [%r28+16], %f280; + st.shared.b32 [%r28+16], %f281; + st.shared.b32 [%r28+14336], %f282; + st.shared.b32 [%r28+14592], %f283; + st.shared.b32 [%r28+16], %f284; + st.shared.b32 [%r28+16], %f285; + st.shared.b32 [%r28+16384], %f286; + st.shared.b32 [%r28+16640], %f287; + st.shared.b32 [%r28+16], %f288; + st.shared.b32 [%r28+16], %f289; + st.shared.b32 [%r28+18432], %f290; + st.shared.b32 [%r28+18688], %f291; + st.shared.b32 [%r28+16], %f292; + st.shared.b32 [%r28+16], %f293; + st.shared.b32 [%r28+20480], %f294; + st.shared.b32 [%r28+20736], %f295; + st.shared.b32 [%r28+16], %f296; + st.shared.b32 [%r28+16], %f297; + st.shared.b32 [%r28+22528], %f298; + st.shared.b32 [%r28+22784], %f299; + st.shared.b32 [%r28+16], %f300; + st.shared.b32 [%r28+16], %f301; + st.shared.b32 [%r28+24576], %f302; + st.shared.b32 [%r28+24832], %f303; + st.shared.b32 [%r28+16], %f304; + st.shared.b32 [%r28+16], %f305; + st.shared.b32 [%r28+26624], %f306; + st.shared.b32 [%r28+26880], %f307; + st.shared.b32 [%r28+16], %f308; + st.shared.b32 [%r28+16], %f309; + st.shared.b32 [%r28+28672], %f310; + st.shared.b32 [%r28+28928], %f311; + st.shared.b32 [%r28+16], %f312; + st.shared.b32 [%r28+16], %f313; + st.shared.b32 [%r28+30720], %f314; + st.shared.b32 [%r28+30976], %f315; + st.shared.b32 [%r28+16], %f316; + st.shared.b32 [%r28+16], %f317; + add.s32 %r3847, %r3847, 128; + setp.lt.s32 %p18, %r3847, %r11; + @%p18 bra LBB0_2; +LBB0_3: + bar.sync 0; + and.b32 %r3579, %r6, 48; + or.b32 %r3580, %r3579, %r8; + shr.u32 %r3581, %r1, 4; + and.b32 %r3582, %r3581, 56; + or.b32 %r3583, %r10, %r3582; + mad.lo.s32 %r3584, %r3580, 72, %r3583; + shl.b32 %r3585, %r3584, 2; + add.s32 %r3587, %r478, %r3585; + st.shared.v2.f32 [%r3587], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2304], {%f129, %f129}; + st.shared.v2.f32 [%r3587+64], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2368], {%f129, %f129}; + st.shared.v2.f32 [%r3587+128], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2432], {%f129, %f129}; + st.shared.v2.f32 [%r3587+192], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2496], {%f129, %f129}; + bar.sync 0; + mad.lo.s32 %r3588, %r3, 72, %r5; + shl.b32 %r3589, %r3588, 2; + add.s32 %r3590, %r478, %r3589; + ld.shared.v4.f32 {%f419, %f420, %f421, %f422}, [%r3590]; + ld.shared.v4.f32 {%f423, %f424, %f425, %f426}, [%r3590+16]; + ld.shared.v4.f32 {%f427, %f428, %f429, %f430}, [%r3590+9216]; + ld.shared.v4.f32 {%f431, %f432, %f433, %f434}, [%r3590+9232]; + bar.sync 0; + st.shared.v2.f32 [%r3587], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2304], {%f129, 
%f129}; + st.shared.v2.f32 [%r3587+64], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2368], {%f129, %f129}; + st.shared.v2.f32 [%r3587+128], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2432], {%f129, %f129}; + st.shared.v2.f32 [%r3587+192], {%f129, %f129}; + st.shared.v2.f32 [%r3587+2496], {%f129, %f129}; + bar.sync 0; + ld.shared.v4.f32 {%f435, %f436, %f437, %f438}, [%r3590]; + ld.shared.v4.f32 {%f439, %f440, %f441, %f442}, [%r3590+16]; + ld.shared.v4.f32 {%f443, %f444, %f445, %f446}, [%r3590+9216]; + ld.shared.v4.f32 {%f447, %f448, %f449, %f450}, [%r3590+9232]; + bar.sync 0; + st.shared.v2.f32 [%r3587], {%f515, %f516}; + st.shared.v2.f32 [%r3587+2304], {%f517, %f518}; + st.shared.v2.f32 [%r3587+64], {%f519, %f520}; + st.shared.v2.f32 [%r3587+2368], {%f521, %f522}; + st.shared.v2.f32 [%r3587+128], {%f523, %f524}; + st.shared.v2.f32 [%r3587+2432], {%f525, %f526}; + st.shared.v2.f32 [%r3587+192], {%f527, %f528}; + st.shared.v2.f32 [%r3587+2496], {%f529, %f530}; + bar.sync 0; + ld.shared.v4.f32 {%f451, %f452, %f453, %f454}, [%r3590]; + ld.shared.v4.f32 {%f455, %f456, %f457, %f458}, [%r3590+16]; + ld.shared.v4.f32 {%f459, %f460, %f461, %f462}, [%r3590+9216]; + ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r3590+9232]; + bar.sync 0; + st.shared.v2.f32 [%r3587], {%f531, %f532}; + st.shared.v2.f32 [%r3587+2304], {%f533, %f534}; + st.shared.v2.f32 [%r3587+64], {%f535, %f536}; + st.shared.v2.f32 [%r3587+2368], {%f537, %f538}; + st.shared.v2.f32 [%r3587+128], {%f539, %f540}; + st.shared.v2.f32 [%r3587+2432], {%f541, %f542}; + st.shared.v2.f32 [%r3587+192], {%f543, %f544}; + st.shared.v2.f32 [%r3587+2496], {%f545, %f546}; + bar.sync 0; + ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r3590]; + ld.shared.v4.f32 {%f471, %f472, %f473, %f474}, [%r3590+16]; + ld.shared.v4.f32 {%f475, %f476, %f477, %f478}, [%r3590+9216]; + ld.shared.v4.f32 {%f479, %f480, %f481, %f482}, [%r3590+9232]; + shl.b64 %rd62, %rd7, 1; + add.s64 %rd54, %rd2, %rd62; + shl.b64 %rd63, %rd8, 1; + add.s64 %rd55, %rd2, %rd63; + shl.b64 %rd64, %rd9, 1; + add.s64 %rd56, %rd2, %rd64; + shl.b64 %rd65, %rd10, 1; + add.s64 %rd57, %rd2, %rd65; + cvt.rn.f16.f32 %h193, %f452; + cvt.rn.f16.f32 %h194, %f451; + mov.b32 %hh289, {%h194, %h193}; + cvt.rn.f16.f32 %h195, %f454; + cvt.rn.f16.f32 %h196, %f453; + mov.b32 %hh290, {%h196, %h195}; + cvt.rn.f16.f32 %h197, %f456; + cvt.rn.f16.f32 %h198, %f455; + mov.b32 %hh291, {%h198, %h197}; + cvt.rn.f16.f32 %h199, %f458; + cvt.rn.f16.f32 %h200, %f457; + mov.b32 %hh292, {%h200, %h199}; + cvt.rn.f16.f32 %h201, %f460; + cvt.rn.f16.f32 %h202, %f459; + mov.b32 %hh293, {%h202, %h201}; + cvt.rn.f16.f32 %h203, %f462; + cvt.rn.f16.f32 %h204, %f461; + mov.b32 %hh294, {%h204, %h203}; + cvt.rn.f16.f32 %h205, %f464; + cvt.rn.f16.f32 %h206, %f463; + mov.b32 %hh295, {%h206, %h205}; + cvt.rn.f16.f32 %h207, %f466; + cvt.rn.f16.f32 %h208, %f465; + mov.b32 %hh296, {%h208, %h207}; + cvt.rn.f16.f32 %h209, %f468; + cvt.rn.f16.f32 %h210, %f467; + mov.b32 %hh297, {%h210, %h209}; + cvt.rn.f16.f32 %h211, %f470; + cvt.rn.f16.f32 %h212, %f469; + mov.b32 %hh298, {%h212, %h211}; + cvt.rn.f16.f32 %h213, %f472; + cvt.rn.f16.f32 %h214, %f471; + mov.b32 %hh299, {%h214, %h213}; + cvt.rn.f16.f32 %h215, %f474; + cvt.rn.f16.f32 %h216, %f473; + mov.b32 %hh300, {%h216, %h215}; + cvt.rn.f16.f32 %h217, %f476; + cvt.rn.f16.f32 %h218, %f475; + mov.b32 %hh301, {%h218, %h217}; + cvt.rn.f16.f32 %h219, %f478; + cvt.rn.f16.f32 %h220, %f477; + mov.b32 %hh302, {%h220, %h219}; + cvt.rn.f16.f32 %h221, %f480; + cvt.rn.f16.f32 %h222, %f479; + mov.b32 %hh303, 
{%h222, %h221}; + cvt.rn.f16.f32 %h223, %f482; + cvt.rn.f16.f32 %h224, %f481; + mov.b32 %hh304, {%h224, %h223}; + mov.b32 %r3547, %hh289; + mov.b32 %r3548, %hh290; + mov.b32 %r3549, %hh291; + mov.b32 %r3550, %hh292; + @%p19 st.global.v4.b32 [ %rd54 + 0 ], { %r3547, %r3548, %r3549, %r3550 }; + mov.b32 %r3551, %hh293; + mov.b32 %r3552, %hh294; + mov.b32 %r3553, %hh295; + mov.b32 %r3554, %hh296; + @%p19 st.global.v4.b32 [ %rd55 + 0 ], { %r3551, %r3552, %r3553, %r3554 }; + mov.b32 %r3555, %hh297; + mov.b32 %r3556, %hh298; + mov.b32 %r3557, %hh299; + mov.b32 %r3558, %hh300; + @%p19 st.global.v4.b32 [ %rd56 + 0 ], { %r3555, %r3556, %r3557, %r3558 }; + mov.b32 %r3559, %hh301; + mov.b32 %r3560, %hh302; + mov.b32 %r3561, %hh303; + mov.b32 %r3562, %hh304; + @%p19 st.global.v4.b32 [ %rd57 + 0 ], { %r3559, %r3560, %r3561, %r3562 }; + shl.b64 %rd66, %rd3, 1; + add.s64 %rd58, %rd1, %rd66; + shl.b64 %rd67, %rd4, 1; + add.s64 %rd59, %rd1, %rd67; + shl.b64 %rd68, %rd5, 1; + add.s64 %rd60, %rd1, %rd68; + shl.b64 %rd69, %rd6, 1; + add.s64 %rd61, %rd1, %rd69; + cvt.rn.f16.f32 %h225, %f420; + cvt.rn.f16.f32 %h226, %f419; + mov.b32 %hh305, {%h226, %h225}; + cvt.rn.f16.f32 %h227, %f422; + cvt.rn.f16.f32 %h228, %f421; + mov.b32 %hh306, {%h228, %h227}; + cvt.rn.f16.f32 %h229, %f424; + cvt.rn.f16.f32 %h230, %f423; + mov.b32 %hh307, {%h230, %h229}; + cvt.rn.f16.f32 %h231, %f426; + cvt.rn.f16.f32 %h232, %f425; + mov.b32 %hh308, {%h232, %h231}; + cvt.rn.f16.f32 %h233, %f428; + cvt.rn.f16.f32 %h234, %f427; + mov.b32 %hh309, {%h234, %h233}; + cvt.rn.f16.f32 %h235, %f430; + cvt.rn.f16.f32 %h236, %f429; + mov.b32 %hh310, {%h236, %h235}; + cvt.rn.f16.f32 %h237, %f432; + cvt.rn.f16.f32 %h238, %f431; + mov.b32 %hh311, {%h238, %h237}; + cvt.rn.f16.f32 %h239, %f434; + cvt.rn.f16.f32 %h240, %f433; + mov.b32 %hh312, {%h240, %h239}; + cvt.rn.f16.f32 %h241, %f436; + cvt.rn.f16.f32 %h242, %f435; + mov.b32 %hh313, {%h242, %h241}; + cvt.rn.f16.f32 %h243, %f438; + cvt.rn.f16.f32 %h244, %f437; + mov.b32 %hh314, {%h244, %h243}; + cvt.rn.f16.f32 %h245, %f440; + cvt.rn.f16.f32 %h246, %f439; + mov.b32 %hh315, {%h246, %h245}; + cvt.rn.f16.f32 %h247, %f442; + cvt.rn.f16.f32 %h248, %f441; + mov.b32 %hh316, {%h248, %h247}; + cvt.rn.f16.f32 %h249, %f444; + cvt.rn.f16.f32 %h250, %f443; + mov.b32 %hh317, {%h250, %h249}; + cvt.rn.f16.f32 %h251, %f446; + cvt.rn.f16.f32 %h252, %f445; + mov.b32 %hh318, {%h252, %h251}; + cvt.rn.f16.f32 %h253, %f448; + cvt.rn.f16.f32 %h254, %f447; + mov.b32 %hh319, {%h254, %h253}; + cvt.rn.f16.f32 %h255, %f450; + cvt.rn.f16.f32 %h256, %f449; + mov.b32 %hh320, {%h256, %h255}; + mov.b32 %r3563, %hh305; + mov.b32 %r3564, %hh306; + mov.b32 %r3565, %hh307; + mov.b32 %r3566, %hh308; + @%p19 st.global.v4.b32 [ %rd58 + 0 ], { %r3563, %r3564, %r3565, %r3566 }; + mov.b32 %r3567, %hh309; + mov.b32 %r3568, %hh310; + mov.b32 %r3569, %hh311; + mov.b32 %r3570, %hh312; + @%p19 st.global.v4.b32 [ %rd59 + 0 ], { %r3567, %r3568, %r3569, %r3570 }; + mov.b32 %r3571, %hh313; + mov.b32 %r3572, %hh314; + mov.b32 %r3573, %hh315; + mov.b32 %r3574, %hh316; + @%p19 st.global.v4.b32 [ %rd60 + 0 ], { %r3571, %r3572, %r3573, %r3574 }; + mov.b32 %r3575, %hh317; + mov.b32 %r3576, %hh318; + mov.b32 %r3577, %hh319; + mov.b32 %r3578, %hh320; + @%p19 st.global.v4.b32 [ %rd61 + 0 ], { %r3575, %r3576, %r3577, %r3578 }; + ret; + +} diff --git a/python/bwd.ttgir b/python/bwd.ttgir index 983e2ca5b..ffad904f2 100644 --- a/python/bwd.ttgir +++ b/python/bwd.ttgir @@ -1,22 +1,16 @@ -#blocked0 = #triton_gpu.blocked<{sizePerThread = [1], threadsPerWarp = [32], 
warpsPerCTA = [8], order = [0]}> +#blocked0 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> #blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked2 = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}> -#blocked3 = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 2], order = [0, 1]}> #mma0 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 2]}> #mma1 = #triton_gpu.mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [8, 1]}> #shared0 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}> #shared1 = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}> -#shared2 = #triton_gpu.shared<{vec = 8, perPhase = 1, maxPhase = 8, order = [1, 0]}> module attributes {"triton_gpu.num-warps" = 8 : i32} { func public @_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(%arg0: !tt.ptr {tt.divisibility = 16 : i32}, %arg1: !tt.ptr {tt.divisibility = 16 : i32}, %arg2: !tt.ptr {tt.divisibility = 16 : i32}, %arg3: f32, %arg4: !tt.ptr {tt.divisibility = 16 : i32}, %arg5: !tt.ptr {tt.divisibility = 16 : i32}, %arg6: !tt.ptr {tt.divisibility = 16 : i32}, %arg7: !tt.ptr {tt.divisibility = 16 : i32}, %arg8: !tt.ptr {tt.divisibility = 16 : i32}, %arg9: !tt.ptr {tt.divisibility = 16 : i32}, %arg10: !tt.ptr {tt.divisibility = 16 : i32}, %arg11: !tt.ptr {tt.divisibility = 16 : i32}, %arg12: i32 {tt.divisibility = 16 : i32}, %arg13: i32 {tt.divisibility = 16 : i32}, %arg14: i32 {tt.divisibility = 16 : i32}, %arg15: i32 {tt.divisibility = 16 : i32}, %arg16: i32 {tt.divisibility = 16 : i32}, %arg17: i32 {tt.divisibility = 16 : i32}, %arg18: i32 {tt.divisibility = 16 : i32}, %arg19: i32 {tt.divisibility = 16 : i32}, %arg20: i32 {tt.divisibility = 16 : i32}, %arg21: i32, %arg22: i32 {tt.divisibility = 16 : i32}, %arg23: i32 {tt.divisibility = 16 : i32}, %arg24: i32) { %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma1> %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma0> - %cst_1 = arith.constant dense<0xFF800000> : tensor<128x128xf32, #blocked3> %c128 = arith.constant 128 : index - %c128_i32 = arith.constant 128 : i32 - %c1 = arith.constant 1 : index %c0 = arith.constant 0 : index - %c0_i32 = arith.constant 0 : i32 + %c128_i32 = arith.constant 128 : i32 %0 = tt.get_program_id {axis = 0 : i32} : i32 %1 = arith.divsi %0, %arg22 : i32 %2 = arith.remsi %0, %arg22 : i32 @@ -30,160 +24,88 @@ module attributes {"triton_gpu.num-warps" = 8 : i32} { %10 = tt.addptr %arg6, %5 : !tt.ptr, i32 %11 = tt.addptr %arg7, %5 : !tt.ptr, i32 %12 = tt.addptr %arg8, %5 : !tt.ptr, i32 - %13 = arith.index_cast %arg24 : i32 to index - %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #blocked0> - %15 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %16 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>> - %17 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %18 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> - %19 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> - %20 = 
tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked2> + %13 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked0}>> + %14 = tt.make_range {end = 128 : i32, start = 0 : i32} : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> + %15 = tt.expand_dims %13 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked0}>>) -> tensor<128x1xi32, #blocked0> + %16 = tt.expand_dims %14 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> + %17 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked0> + %18 = tt.splat %arg14 : (i32) -> tensor<128x1xi32, #blocked1> + %19 = arith.muli %15, %17 : tensor<128x1xi32, #blocked0> + %20 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked0}>> %21 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>> - %22 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>> - %23 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1> - %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %25 = tt.expand_dims %22 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked2}>>) -> tensor<1x64xi32, #blocked2> - %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %27 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %28 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1> - %29 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %30 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %31 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %32 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked2> - %33 = arith.muli %0, %arg23 : i32 - %34 = tt.addptr %arg11, %33 : !tt.ptr, i32 - %35 = tt.addptr %arg10, %33 : !tt.ptr, i32 - %36 = arith.muli %arg24, %c128_i32 : i32 - %37 = arith.index_cast %36 : i32 to index - %38 = tt.splat %35 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %39 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #blocked3> - %40 = tt.splat %34 : (!tt.ptr) -> tensor<128x!tt.ptr, #blocked0> - %41 = arith.muli %arg14, %c128_i32 : i32 - %42 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked1> - %43 = tt.splat %41 : (i32) -> tensor<128x64xi32, #blocked2> - %44 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - %45 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1> - scf.for %arg25 = %c0 to %13 step %c1 { - %46 = arith.index_cast %arg25 : index to i32 - %47 = arith.muli %46, %c128_i32 : i32 - %48 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %49 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>> - %50 = tt.splat %47 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %51 = arith.addi %48, %15 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> - %52 = arith.addi %50, %17 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>> - %53 = tt.expand_dims %51 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>) -> tensor<128x1xi32, #blocked1> - %54 = tt.expand_dims %52 {axis = 1 : i32} : 
(tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked2}>>) -> tensor<128x1xi32, #blocked2> - %55 = arith.muli %53, %28 : tensor<128x1xi32, #blocked1> - %56 = tt.broadcast %55 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %57 = arith.addi %56, %24 : tensor<128x64xi32, #blocked1> - %58 = tt.addptr %29, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %59 = tt.load %58 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %60 = arith.muli %53, %19 : tensor<128x1xi32, #blocked1> - %61 = tt.broadcast %60 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1> - %62 = arith.addi %61, %24 : tensor<128x64xi32, #blocked1> - %63 = tt.addptr %30, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %64 = tt.load %63 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1> - %65 = arith.index_cast %47 : i32 to index - %66 = triton_gpu.convert_layout %59 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %67 = tt.trans %66 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %68 = arith.addi %49, %16 : tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>> - %69 = tt.expand_dims %68 {axis = 0 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 0, parent = #blocked3}>>) -> tensor<1x128xi32, #blocked3> - %70 = tt.broadcast %69 : (tensor<1x128xi32, #blocked3>) -> tensor<128x128xi32, #blocked3> - %71 = triton_gpu.convert_layout %64 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0> - %72 = tt.trans %71 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1> - %73 = arith.muli %54, %20 : tensor<128x1xi32, #blocked2> - %74 = tt.broadcast %73 : (tensor<128x1xi32, #blocked2>) -> tensor<128x64xi32, #blocked2> - %75 = arith.addi %74, %26 : tensor<128x64xi32, #blocked2> - %76 = tt.addptr %32, %75 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2> - %77 = tt.addptr %27, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %78 = tt.addptr %31, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1> - %79 = triton_gpu.convert_layout %67 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %80 = triton_gpu.convert_layout %72 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> - %81 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared2> - %82 = triton_gpu.insert_slice_async %58, %81, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared2> - triton_gpu.async_wait {num = 0 : i32} - %83 = tensor.extract_slice %82[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared2> to tensor<128x64xf16, #shared2> - %84 = triton_gpu.convert_layout %83 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %85:5 = scf.for %arg26 = %65 to %37 step %c128 iter_args(%arg27 = %cst_0, %arg28 = %cst_0, %arg29 = %76, %arg30 = %77, %arg31 = %78) -> (tensor<128x64xf32, #mma0>, tensor<128x64xf32, #mma0>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) { - %92 = arith.index_cast %arg26 : index to i32 - %93 = tt.splat %92 : (i32) -> tensor<128xi32, #blocked0> - %94 = tt.splat %92 : (i32) -> tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> - %95 = arith.addi 
%93, %14 : tensor<128xi32, #blocked0> - %96 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared2> - %97 = triton_gpu.insert_slice_async %arg30, %96, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared2> - triton_gpu.async_wait {num = 0 : i32} - %98 = tensor.extract_slice %97[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared2> to tensor<128x64xf16, #shared2> - %99 = triton_gpu.convert_layout %98 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %100 = tt.dot %99, %79, %cst {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x128xf32, #mma1> - %101 = triton_gpu.convert_layout %100 : (tensor<128x128xf32, #mma1>) -> tensor<128x128xf32, #blocked3> - %102 = arith.addi %94, %18 : tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> - %103 = tt.expand_dims %102 {axis = 1 : i32} : (tensor<128xi32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>>) -> tensor<128x1xi32, #blocked3> - %104 = tt.broadcast %103 : (tensor<128x1xi32, #blocked3>) -> tensor<128x128xi32, #blocked3> - %105 = "triton_gpu.cmpi"(%104, %70) {predicate = 5 : i64} : (tensor<128x128xi32, #blocked3>, tensor<128x128xi32, #blocked3>) -> tensor<128x128xi1, #blocked3> - %106 = "triton_gpu.select"(%105, %101, %cst_1) : (tensor<128x128xi1, #blocked3>, tensor<128x128xf32, #blocked3>, tensor<128x128xf32, #blocked3>) -> tensor<128x128xf32, #blocked3> - %107 = tt.addptr %38, %95 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %108 = tt.load %107 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %109 = arith.mulf %106, %39 : tensor<128x128xf32, #blocked3> - %110 = triton_gpu.convert_layout %108 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>> - %111 = tt.expand_dims %110 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #blocked3}>>) -> tensor<128x1xf32, #blocked3> - %112 = tt.broadcast %111 : (tensor<128x1xf32, #blocked3>) -> tensor<128x128xf32, #blocked3> - %113 = arith.subf %109, %112 : tensor<128x128xf32, #blocked3> - %114 = math.exp %113 : tensor<128x128xf32, #blocked3> - %115 = arith.truncf %114 : tensor<128x128xf32, #blocked3> to tensor<128x128xf16, #blocked3> - %116 = triton_gpu.convert_layout %115 : (tensor<128x128xf16, #blocked3>) -> tensor<128x128xf16, #shared1> - %117 = tt.trans %116 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #shared0> - %118 = triton_gpu.convert_layout %117 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %119 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared2> - %120 = triton_gpu.insert_slice_async %arg31, %119, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared2> - triton_gpu.async_wait {num = 0 : i32} - %121 = tensor.extract_slice %120[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared2> to tensor<128x64xf16, #shared2> - %122 = triton_gpu.convert_layout %121 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %123 = tt.dot %118, %122, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<128x64xf16, 
#triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x64xf32, #mma0> - %124 = tt.addptr %40, %95 : tensor<128x!tt.ptr, #blocked0>, tensor<128xi32, #blocked0> - %125 = tt.load %124 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128xf32, #blocked0> - %126 = triton_gpu.convert_layout %125 : (tensor<128xf32, #blocked0>) -> tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma1}>> - %127 = tt.expand_dims %126 {axis = 1 : i32} : (tensor<128xf32, #triton_gpu.slice<{dim = 1, parent = #mma1}>>) -> tensor<128x1xf32, #mma1> - %128 = tt.broadcast %127 : (tensor<128x1xf32, #mma1>) -> tensor<128x128xf32, #mma1> - %129 = arith.subf %cst, %128 : tensor<128x128xf32, #mma1> - %130 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared2> - %131 = triton_gpu.insert_slice_async %arg31, %130, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared2> - triton_gpu.async_wait {num = 0 : i32} - %132 = tensor.extract_slice %131[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared2> to tensor<128x64xf16, #shared2> - %133 = triton_gpu.convert_layout %132 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> - %134 = tt.dot %133, %80, %129 {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x128xf32, #mma1> - %135 = triton_gpu.convert_layout %134 : (tensor<128x128xf32, #mma1>) -> tensor<128x128xf32, #blocked3> - %136 = arith.mulf %114, %135 : tensor<128x128xf32, #blocked3> - %137 = arith.mulf %136, %39 : tensor<128x128xf32, #blocked3> - %138 = arith.truncf %137 : tensor<128x128xf32, #blocked3> to tensor<128x128xf16, #blocked3> - %139 = triton_gpu.convert_layout %138 : (tensor<128x128xf16, #blocked3>) -> tensor<128x128xf16, #shared1> - %140 = tt.trans %139 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #shared0> - %141 = triton_gpu.convert_layout %140 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %142 = triton_gpu.alloc_tensor : tensor<1x128x64xf16, #shared2> - %143 = triton_gpu.insert_slice_async %arg30, %142, %c0_i32 {axis = 0 : i32, cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64x!tt.ptr, #blocked1> -> tensor<1x128x64xf16, #shared2> - triton_gpu.async_wait {num = 0 : i32} - %144 = tensor.extract_slice %143[0, 0, 0] [1, 128, 64] [1, 1, 1] : tensor<1x128x64xf16, #shared2> to tensor<128x64xf16, #shared2> - %145 = triton_gpu.convert_layout %144 : (tensor<128x64xf16, #shared2>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> - %146 = tt.dot %141, %145, %arg28 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x64xf32, #mma0> - %147 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf32, #blocked2> - %148 = triton_gpu.convert_layout %147 : (tensor<128x64xf32, #blocked2>) -> tensor<128x64xf32, #mma0> - %149 = triton_gpu.convert_layout %138 : (tensor<128x128xf16, #blocked3>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> - %150 = tt.dot %149, %84, %148 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x64xf32, 
#mma0>
-      %151 = triton_gpu.convert_layout %150 : (tensor<128x64xf32, #mma0>) -> tensor<128x64xf32, #blocked2>
-      tt.store %arg29, %151 : tensor<128x64xf32, #blocked2>
-      %152 = tt.addptr %arg29, %43 : tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64xi32, #blocked2>
-      %153 = tt.addptr %arg30, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
-      %154 = tt.addptr %arg31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
-      scf.yield %123, %146, %152, %153, %154 : tensor<128x64xf32, #mma0>, tensor<128x64xf32, #mma0>, tensor<128x64x!tt.ptr, #blocked2>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>
-    }
-    %86 = triton_gpu.convert_layout %85#1 : (tensor<128x64xf32, #mma0>) -> tensor<128x64xf32, #blocked1>
-    %87 = triton_gpu.convert_layout %85#0 : (tensor<128x64xf32, #mma0>) -> tensor<128x64xf32, #blocked1>
-    %88 = tt.addptr %44, %62 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
-    %89 = arith.truncf %87 : tensor<128x64xf32, #blocked1> to tensor<128x64xf16, #blocked1>
-    tt.store %88, %89 : tensor<128x64xf16, #blocked1>
-    %90 = tt.addptr %45, %57 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
-    %91 = arith.truncf %86 : tensor<128x64xf32, #blocked1> to tensor<128x64xf16, #blocked1>
-    tt.store %90, %91 : tensor<128x64xf16, #blocked1>
+    %22 = tt.broadcast %19 : (tensor<128x1xi32, #blocked0>) -> tensor<128x64xi32, #blocked0>
+    %23 = tt.expand_dims %20 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked0}>>) -> tensor<1x64xi32, #blocked0>
+    %24 = tt.broadcast %23 : (tensor<1x64xi32, #blocked0>) -> tensor<128x64xi32, #blocked0>
+    %25 = tt.expand_dims %21 {axis = 0 : i32} : (tensor<64xi32, #triton_gpu.slice<{dim = 0, parent = #blocked1}>>) -> tensor<1x64xi32, #blocked1>
+    %26 = tt.broadcast %25 : (tensor<1x64xi32, #blocked1>) -> tensor<128x64xi32, #blocked1>
+    %27 = arith.addi %22, %24 : tensor<128x64xi32, #blocked0>
+    %28 = tt.splat %6 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %29 = tt.splat %arg17 : (i32) -> tensor<128x1xi32, #blocked1>
+    %30 = tt.splat %7 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %31 = tt.splat %8 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %32 = tt.splat %9 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %33 = tt.splat %10 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked0>
+    %34 = tt.addptr %33, %27 : tensor<128x64x!tt.ptr, #blocked0>, tensor<128x64xi32, #blocked0>
+    %35 = arith.muli %16, %29 : tensor<128x1xi32, #blocked1>
+    %36 = tt.broadcast %35 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1>
+    %37 = arith.addi %36, %26 : tensor<128x64xi32, #blocked1>
+    %38 = tt.addptr %30, %37 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %39 = tt.load %38 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1>
+    %40 = arith.muli %16, %18 : tensor<128x1xi32, #blocked1>
+    %41 = tt.broadcast %40 : (tensor<128x1xi32, #blocked1>) -> tensor<128x64xi32, #blocked1>
+    %42 = arith.addi %41, %26 : tensor<128x64xi32, #blocked1>
+    %43 = tt.addptr %31, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %44 = tt.load %43 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1>
+    %45 = arith.muli %arg24, %c128_i32 : i32
+    %46 = arith.index_cast %45 : i32 to index
+    %47 = triton_gpu.convert_layout %39 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0>
+    %48 = tt.trans %47 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1>
+    %49 = tt.splat %arg3 : (f32) -> tensor<128x128xf32, #mma1>
+    %50 = triton_gpu.convert_layout %44 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #shared0>
+    %51 = tt.trans %50 : (tensor<128x64xf16, #shared0>) -> tensor<64x128xf16, #shared1>
+    %52 = arith.muli %arg14, %c128_i32 : i32
+    %53 = tt.splat %52 : (i32) -> tensor<128x64xi32, #blocked0>
+    %54 = tt.splat %52 : (i32) -> tensor<128x64xi32, #blocked1>
+    %55 = tt.addptr %28, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %56 = tt.addptr %32, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %57 = triton_gpu.convert_layout %48 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>>
+    %58 = triton_gpu.convert_layout %51 : (tensor<64x128xf16, #shared1>) -> tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>>
+    %59:5 = scf.for %arg25 = %c0 to %46 step %c128 iter_args(%arg26 = %cst_0, %arg27 = %cst_0, %arg28 = %34, %arg29 = %55, %arg30 = %56) -> (tensor<128x64xf32, #mma0>, tensor<128x64xf32, #mma0>, tensor<128x64x!tt.ptr, #blocked0>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>) {
+      %68 = tt.load %arg29 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1>
+      %69 = triton_gpu.convert_layout %68 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>>
+      %70 = tt.dot %69, %57, %cst {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x128xf32, #mma1>
+      %73 = tt.load %arg30 {cache = 1 : i32, evict = 1 : i32, isVolatile = false} : tensor<128x64xf16, #blocked1>
+      %74 = arith.truncf %70 : tensor<128x128xf32, #mma1> to tensor<128x128xf16, #mma1>
+      %75 = triton_gpu.convert_layout %74 : (tensor<128x128xf16, #mma1>) -> tensor<128x128xf16, #shared1>
+      %76 = tt.trans %75 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #shared0>
+      %77 = triton_gpu.convert_layout %76 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>>
+      %78 = triton_gpu.convert_layout %73 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>>
+      %79 = tt.dot %77, %78, %arg26 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x64xf32, #mma0>
+      %80 = triton_gpu.convert_layout %73 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>>
+      %81 = tt.dot %80, %58, %cst {allowTF32 = true} : tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1}>> * tensor<64x128xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1}>> -> tensor<128x128xf32, #mma1>
+      %83 = arith.mulf %70, %81 : tensor<128x128xf32, #mma1>
+      %84 = arith.mulf %83, %49 : tensor<128x128xf32, #mma1>
+      %85 = arith.truncf %84 : tensor<128x128xf32, #mma1> to tensor<128x128xf16, #mma1>
+      %86 = triton_gpu.convert_layout %85 : (tensor<128x128xf16, #mma1>) -> tensor<128x128xf16, #shared1>
+      %87 = tt.trans %86 : (tensor<128x128xf16, #shared1>) -> tensor<128x128xf16, #shared0>
+      %88 = triton_gpu.convert_layout %87 : (tensor<128x128xf16, #shared0>) -> tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>>
+      %89 = triton_gpu.convert_layout %68 : (tensor<128x64xf16, #blocked1>) -> tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>>
+      %90 = tt.dot %88, %89, %arg27 {allowTF32 = true} : tensor<128x128xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma0}>> * tensor<128x64xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #mma0}>> -> tensor<128x64xf32, #mma0>
+      %91 = tt.addptr %arg28, %53 : tensor<128x64x!tt.ptr, #blocked0>, tensor<128x64xi32, #blocked0>
+      %92 = tt.addptr %arg29, %54 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+      %93 = tt.addptr %arg30, %54 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+      scf.yield %79, %arg27, %arg28, %arg29, %arg30 : tensor<128x64xf32, #mma0>, tensor<128x64xf32, #mma0>, tensor<128x64x!tt.ptr, #blocked0>, tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64x!tt.ptr, #blocked1>
+    }
+    %60 = triton_gpu.convert_layout %59#1 : (tensor<128x64xf32, #mma0>) -> tensor<128x64xf32, #blocked1>
+    %61 = triton_gpu.convert_layout %59#0 : (tensor<128x64xf32, #mma0>) -> tensor<128x64xf32, #blocked1>
+    %62 = tt.splat %12 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %63 = tt.splat %11 : (!tt.ptr) -> tensor<128x64x!tt.ptr, #blocked1>
+    %64 = tt.addptr %62, %42 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %65 = arith.truncf %61 : tensor<128x64xf32, #blocked1> to tensor<128x64xf16, #blocked1>
+    tt.store %64, %65 : tensor<128x64xf16, #blocked1>
+    %66 = tt.addptr %63, %37 : tensor<128x64x!tt.ptr, #blocked1>, tensor<128x64xi32, #blocked1>
+    %67 = arith.truncf %60 : tensor<128x64xf32, #blocked1> to tensor<128x64xf16, #blocked1>
+    tt.store %66, %67 : tensor<128x64xf16, #blocked1>
     return
   }
 }
\ No newline at end of file
diff --git a/python/triton/compiler.py b/python/triton/compiler.py
index 30e9c059a..3da6d3929 100644
--- a/python/triton/compiler.py
+++ b/python/triton/compiler.py
@@ -905,7 +905,7 @@ def ttir_to_ttgir(mod, num_warps, num_stages, compute_capability):
     pm.add_licm_pass()
     pm.add_tritongpu_combine_pass(compute_capability)
     pm.add_cse_pass()
-    pm.add_tritongpu_optimize_load_convert_pass()
+    # pm.add_tritongpu_optimize_load_convert_pass()
     pm.run(mod)
     return mod
diff --git a/python/tutorials/06-fused-attention.py b/python/tutorials/06-fused-attention.py
index 0af1f8fb0..0097da9ec 100644
--- a/python/tutorials/06-fused-attention.py
+++ b/python/tutorials/06-fused-attention.py
@@ -133,65 +133,68 @@ def _bwd_kernel(
     DQ += off_z * stride_qz + off_h * stride_qh
     DK += off_z * stride_qz + off_h * stride_qh
     DV += off_z * stride_qz + off_h * stride_qh
-    for start_n in range(0, num_block):
-        lo = start_n * BLOCK_M
-        # initialize row/col offsets
-        offs_qm = lo + tl.arange(0, BLOCK_M)
-        offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
-        offs_m = tl.arange(0, BLOCK_N)
-        offs_k = tl.arange(0, BLOCK_DMODEL)
-        # initialize pointers to value-like data
-        q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
-        v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        # pointer to row-wise quantities in value-like data
-        D_ptrs = D + off_hz * N_CTX
-        m_ptrs = M + off_hz * N_CTX
-        # initialize dv amd dk
-        dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-        dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
-        # k and v stay in SRAM throughout
-        k = tl.load(k_ptrs)
-        v = tl.load(v_ptrs)
-        # loop over rows
-        for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
-            offs_m_curr = start_m + offs_m
-            # load q, k, v, do on-chip
-            q = tl.load(q_ptrs)
-            # recompute p = softmax(qk, dim=-1).T
-            # NOTE: `do` is pre-divided by `l`; no normalization here
-            qk = tl.dot(q, tl.trans(k))
-            qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
-            m = tl.load(m_ptrs + offs_m_curr)
-            p = tl.exp(qk * sm_scale - m[:, None])
-            # compute dv
-            do = tl.load(do_ptrs)
-            dv += tl.dot(tl.trans(p.to(tl.float16)), do)
-            # compute dp = dot(v, do)
-            Di = tl.load(D_ptrs + offs_m_curr)
-            dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
-            dp += tl.dot(do, tl.trans(v))
-            # compute ds = p * (dp - delta[:, None])
-            ds = p * dp * sm_scale
-            # compute dk = dot(ds.T, q)
-            dk += tl.dot(tl.trans(ds.to(tl.float16)), q)
-            # compute dq
-            dq = tl.load(dq_ptrs)
-            dq += tl.dot(ds.to(tl.float16), k)
-            tl.store(dq_ptrs, dq)
-            # increment pointers
-            dq_ptrs += BLOCK_M * stride_qm
-            q_ptrs += BLOCK_M * stride_qm
-            do_ptrs += BLOCK_M * stride_qm
-        # write-back
-        dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
-        dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
-        tl.store(dv_ptrs, dv)
-        tl.store(dk_ptrs, dk)
+    # for start_n in range(0, num_block):
+    start_n = 0
+    lo = start_n * BLOCK_M
+    # initialize row/col offsets
+    offs_qm = lo + tl.arange(0, BLOCK_M)
+    offs_n = start_n * BLOCK_M + tl.arange(0, BLOCK_M)
+    offs_m = tl.arange(0, BLOCK_N)
+    offs_k = tl.arange(0, BLOCK_DMODEL)
+    # initialize pointers to value-like data
+    q_ptrs = Q + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+    k_ptrs = K + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
+    v_ptrs = V + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+    do_ptrs = DO + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+    dq_ptrs = DQ + (offs_qm[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+    # pointer to row-wise quantities in value-like data
+    # D_ptrs = D + off_hz * N_CTX
+    # m_ptrs = M + off_hz * N_CTX
+    # initialize dv amd dk
+    dv = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    dk = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    # k and v stay in SRAM throughout
+    k = tl.load(k_ptrs)
+    v = tl.load(v_ptrs)
+    # loop over rows
+    for start_m in range(lo, num_block * BLOCK_M, BLOCK_M):
+        # offs_m_curr = start_m + offs_m
+        # load q, k, v, do on-chip
+        q = tl.load(q_ptrs)
+        # recompute p = softmax(qk, dim=-1).T
+        # NOTE: `do` is pre-divided by `l`; no normalization here
+        qk = tl.dot(q, tl.trans(k))
+        # qk = tl.where(offs_m_curr[:, None] >= (offs_n[None, :]), qk, float("-inf"))
+        # m = tl.load(m_ptrs + offs_m_curr)
+        # p = tl.exp(qk * sm_scale - m[:, None])
+        p = qk * sm_scale
+        # compute dv
+        do = tl.load(do_ptrs)
+        dv += tl.dot(tl.trans(p.to(tl.float16)), do)
+        # # compute dp = dot(v, do)
+        # Di = tl.load(D_ptrs + offs_m_curr)
+        # dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) - Di[:, None]
+        dp = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
+        dp += tl.dot(do, tl.trans(v))
+        # compute ds = p * (dp - delta[:, None])
+        ds = p * dp * sm_scale
+        # # compute dk = dot(ds.T, q)
+        dk += tl.dot(tl.trans(ds.to(tl.float16)), q)
+        # # compute dq
+        # dq = tl.load(dq_ptrs)
+        # dq += tl.dot(ds.to(tl.float16), k)
+        # tl.store(dq_ptrs, dq)
+        # increment pointers
+        dq_ptrs += BLOCK_M * stride_qm
+        q_ptrs += BLOCK_M * stride_qm
+        do_ptrs += BLOCK_M * stride_qm
+    # write-back
+    dv_ptrs = DV + (offs_n[:, None] * stride_qm + offs_k[None, :] * stride_qk)
+    dk_ptrs = DK + (offs_n[:, None] * stride_kn + offs_k[None, :] * stride_kk)
+    tl.store(dv_ptrs, dv)
+    tl.store(dk_ptrs, dk)
 
 
-_bwd_kernel = triton.compile("./bwd.ttgir", num_warps=8)
+_bwd_kernel = triton.compile("./bwd.ptx", num_warps=8, shared=32768)
 # _fwd_kernel = triton.compile("./fails.ptx", num_warps=4, shared=18432)
 empty = torch.empty(128, device="cuda")
@@ -285,6 +288,7 @@ class _attention(torch.autograd.Function):
         #     num_stages=1,
         # )
         # print(pgm.asm["ttgir"])
+        exit(1)
         return dq, dk, dv, None
@@ -330,7 +334,7 @@ BATCH, N_HEADS, N_CTX, D_HEAD = 4, 48, 4096, 64
 # vary seq length for fixed head and batch=4
 configs = [triton.testing.Benchmark(
     x_names=['N_CTX'],
-    x_vals=[2**i for i in range(10, 16)],
+    x_vals=[2**i for i in range(10, 11)],
     line_arg='provider',
     line_vals=['triton'],
     line_names=['Triton'],
@@ -338,7 +342,7 @@ configs = [triton.testing.Benchmark(
     ylabel='ms',
     plot_name=f'fused-attention-batch{BATCH}-head{N_HEADS}-d{D_HEAD}-{mode}',
     args={'H': N_HEADS, 'BATCH': BATCH, 'D_HEAD': D_HEAD, 'dtype': torch.float16, 'mode': mode}
-) for mode in ['fwd']]
+) for mode in ['bwd']]
 
 
 @triton.testing.perf_report(configs)
@@ -375,4 +379,4 @@ def bench_flash_attention(BATCH, H, N_CTX, D_HEAD, mode, provider, dtype=torch.f
     ms = triton.testing.do_bench(fn, percentiles=None, warmup=warmup, rep=rep)
     return ms
 
-# bench_flash_attention.run(save_path='.', print_data=True)
\ No newline at end of file
+bench_flash_attention.run(save_path='.', print_data=True)
\ No newline at end of file