diff --git a/python/bwd.ptx b/python/bwd.ptx index 9b0bd53f7..dc40696f9 100644 --- a/python/bwd.ptx +++ b/python/bwd.ptx @@ -38,1995 +38,2727 @@ ) .maxntid 256, 1, 1 { - .reg .pred %p<27>; - .reg .b16 %h<257>; - .reg .b32 %r<3848>; - .reg .b32 %hh<321>; - .reg .f32 %f<547>; - .reg .b64 %rd<70>; + .reg .pred %p<111>; + .reg .b16 %h<193>; + .reg .b32 %r<6177>; + .reg .b32 %hh<65>; + .reg .f32 %f<973>; + .reg .b64 %rd<139>; + ld.param.u32 %r380, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24]; + setp.lt.s32 %p1, %r380, 1; + @%p1 bra LBB0_6; + ld.param.u32 %r379, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17]; + ld.param.u32 %r378, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14]; + ld.param.u64 %rd55, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5]; + ld.param.f32 %f195, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3]; + ld.param.u64 %rd54, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0]; mov.u32 %r1, %tid.x; - ld.param.u64 %rd28, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1]; - and.b32 %r447, %r1, 31; - ld.param.u64 %rd29, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2]; - shr.u32 %r2, %r1, 5; - bfe.u32 %r448, %r1, 3, 2; - shr.u32 %r449, %r1, 3; - and.b32 %r450, %r449, 124; - or.b32 %r3, %r450, %r448; - ld.param.u64 %rd31, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7]; - ld.param.u64 %rd32, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8]; - and.b32 %r4, %r1, 7; - shl.b32 %r5, %r4, 3; - ld.param.u32 %r451, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12]; - shl.b32 %r6, %r2, 4; - ld.param.u32 %r452, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13]; - ld.param.u32 %r453, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14]; - bfe.u32 %r8, %r1, 2, 3; - shl.b32 %r454, %r1, 1; - ld.param.u32 %r455, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17]; - and.b32 %r10, %r454, 6; - mov.u32 %r456, %ctaid.x; - ld.param.u32 %r458, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22]; - div.s32 %r460, %r456, %r458; - mul.lo.s32 %r461, %r460, %r458; - sub.s32 %r462, %r456, %r461; - ld.param.u32 %r463, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24]; - mul.lo.s32 %r464, %r460, %r451; - mad.lo.s32 %r465, %r462, %r452, %r464; - mul.wide.s32 %rd33, %r465, 2; - add.s64 %rd35, %rd28, %rd33; - add.s64 %rd36, %rd29, %rd33; - add.s64 %rd1, %rd31, %rd33; - add.s64 %rd2, %rd32, %rd33; - shl.b32 %r466, %r455, 5; - mad.lo.s32 %r467, %r3, %r455, %r5; - add.s32 %r468, %r467, %r466; - add.s32 %r469, %r468, %r466; - add.s32 %r470, %r469, %r466; - cvt.s64.s32 %rd3, %r467; - mul.wide.s32 %rd38, %r467, 2; - add.s64 %rd19, %rd35, %rd38; - cvt.s64.s32 %rd4, %r468; - mul.wide.s32 %rd39, %r468, 2; - add.s64 %rd20, %rd35, %rd39; - cvt.s64.s32 %rd5, %r469; - mul.wide.s32 %rd40, %r469, 2; - add.s64 %rd21, %rd35, %rd40; - cvt.s64.s32 %rd6, %r470; - mul.wide.s32 %rd41, %r470, 2; - add.s64 %rd22, %rd35, %rd41; - mov.pred %p19, -1; - @%p19 ld.global.v4.b32 { %r95, %r96, %r97, %r98 }, [ %rd19 + 0 ]; - @%p19 ld.global.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd20 + 0 ]; - @%p19 ld.global.v4.b32 { %r103, %r104, %r105, %r106 }, [ %rd21 + 0 ]; - @%p19 ld.global.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd22 + 0 ]; - shl.b32 %r471, %r453, 5; - mad.lo.s32 %r472, %r3, %r453, %r5; - add.s32 %r473, %r472, %r471; - add.s32 %r474, %r473, %r471; - add.s32 %r475, %r474, %r471; - cvt.s64.s32 %rd7, %r472; - mul.wide.s32 %rd42, %r472, 2; - add.s64 %rd23, %rd36, %rd42; - cvt.s64.s32 %rd8, %r473; - mul.wide.s32 %rd43, %r473, 2; - add.s64 %rd24, %rd36, %rd43; - cvt.s64.s32 %rd9, %r474; - mul.wide.s32 %rd44, %r474, 2; - add.s64 %rd25, %rd36, %rd44; - cvt.s64.s32 %rd10, %r475; - mul.wide.s32 %rd45, %r475, 2; - add.s64 %rd26, %rd36, %rd45; - @%p19 ld.global.v4.b32 { %r111, %r112, %r113, %r114 }, [ %rd23 + 0 ]; - @%p19 ld.global.v4.b32 { %r115, %r116, %r117, %r118 }, [ %rd24 + 0 ]; - @%p19 ld.global.v4.b32 { %r119, %r120, %r121, %r122 }, [ %rd25 + 0 ]; - @%p19 ld.global.v4.b32 { %r123, %r124, %r125, %r126 }, [ %rd26 + 0 ]; - shl.b32 %r11, %r463, 7; - mov.b32 {%h1, %h2}, %r95; - shl.b32 %r12, %r3, 6; - or.b32 %r476, %r12, %r5; + ld.param.u64 %rd56, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1]; + ld.param.u64 %rd57, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2]; + bfe.u32 %r2, %r1, 5, 2; + and.b32 %r3, %r1, 127; + bfe.u32 %r4, %r1, 3, 2; + ld.param.u64 %rd58, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6]; + shr.u32 %r381, %r1, 3; + ld.param.u64 %rd59, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7]; + and.b32 %r382, %r381, 124; + ld.param.u64 %rd60, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8]; + or.b32 %r5, %r382, %r4; + add.s32 %r6, %r5, 32; + ld.param.u64 %rd61, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10]; + add.s32 %r7, %r5, 64; + ld.param.u64 %rd62, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11]; + add.s32 %r8, %r5, 96; + ld.param.u32 %r383, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12]; + shl.b32 %r384, %r1, 1; + ld.param.u32 %r385, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13]; + and.b32 %r9, %r384, 6; + bfe.u32 %r10, %r1, 4, 1; + shr.u32 %r386, %r1, 4; + and.b32 %r387, %r386, 126; + or.b32 %r11, %r387, %r10; + add.s32 %r12, %r11, 16; + add.s32 %r13, %r11, 32; + add.s32 %r14, %r11, 48; + shr.u32 %r388, %r1, 1; + ld.param.u32 %r389, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22]; + and.b32 %r15, %r388, 112; + ld.param.u32 %r390, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23]; + bfe.u32 %r16, %r1, 2, 3; + or.b32 %r17, %r15, %r16; + or.b32 %r18, %r17, 8; + and.b32 %r19, %r1, 7; + shl.b32 %r20, %r19, 3; + shl.b32 %r391, %r1, 2; + and.b32 %r21, %r391, 60; + mov.u32 %r392, %ctaid.x; + div.s32 %r395, %r392, %r389; + mul.lo.s32 %r396, %r395, %r389; + sub.s32 %r397, %r392, %r396; + mul.lo.s32 %r398, %r395, %r383; + mad.lo.s32 %r399, %r397, %r385, %r398; + cvt.s64.s32 %rd1, %r399; + mul.wide.s32 %rd63, %r399, 2; + add.s64 %rd2, %rd56, %rd63; + add.s64 %rd3, %rd57, %rd63; + mul.wide.s32 %rd64, %r399, 4; + add.s64 %rd4, %rd58, %rd64; + add.s64 %rd5, %rd59, %rd63; + add.s64 %rd6, %rd60, %rd63; + mul.lo.s32 %r400, %r392, %r390; + mul.wide.s32 %rd65, %r400, 4; + add.s64 %rd7, %rd62, %rd65; + add.s64 %rd8, %rd61, %rd65; + shl.b32 %r22, %r380, 7; + shl.b32 %r23, %r378, 7; + and.b32 %r402, %r5, 7; + xor.b32 %r403, %r402, %r19; + shl.b32 %r404, %r5, 7; + shl.b32 %r405, %r403, 4; + or.b32 %r406, %r405, %r404; + mov.u32 %r407, global_smem; + add.s32 %r24, %r407, %r406; + shl.b32 %r408, %r6, 7; + or.b32 %r409, %r408, %r405; + add.s32 %r25, %r407, %r409; + shl.b32 %r410, %r7, 7; + or.b32 %r411, %r410, %r405; + add.s32 %r26, %r407, %r411; + shl.b32 %r412, %r8, 7; + or.b32 %r413, %r412, %r405; + add.s32 %r27, %r407, %r413; + add.s32 %r414, %r407, 16384; + add.s32 %r28, %r414, %r406; + add.s32 %r29, %r414, %r409; + add.s32 %r30, %r414, %r411; + add.s32 %r31, %r414, %r413; + add.s32 %r415, %r407, 32768; + add.s32 %r32, %r415, %r406; + add.s32 %r33, %r415, %r409; + add.s32 %r34, %r415, %r411; + add.s32 %r35, %r415, %r413; + and.b32 %r416, %r4, 1; + shl.b32 %r417, %r416, 3; + or.b32 %r418, %r417, %r15; + or.b32 %r419, %r418, %r19; + xor.b32 %r420, %r10, %r19; + shl.b32 %r421, %r419, 6; + shl.b32 %r422, %r420, 3; + or.b32 %r423, %r421, %r422; + shl.b32 %r424, %r423, 1; + add.s32 %r704, %r415, %r424; + or.b32 %r425, %r10, 2; + xor.b32 %r426, %r425, %r19; + shl.b32 %r427, %r426, 3; + or.b32 %r428, %r421, %r427; + shl.b32 %r429, %r428, 1; + add.s32 %r709, %r415, %r429; + or.b32 %r430, %r10, 4; + xor.b32 %r431, %r430, %r19; + shl.b32 %r432, %r431, 3; + or.b32 %r433, %r421, %r432; + shl.b32 %r434, %r433, 1; + add.s32 %r714, %r415, %r434; + or.b32 %r435, %r10, 6; + xor.b32 %r436, %r435, %r19; + shl.b32 %r437, %r436, 3; + or.b32 %r438, %r421, %r437; + shl.b32 %r439, %r438, 1; + add.s32 %r719, %r415, %r439; + shl.b32 %r440, %r10, 3; + or.b32 %r441, %r440, %r19; + xor.b32 %r442, %r416, %r19; + shl.b32 %r443, %r442, 4; + shl.b32 %r444, %r441, 7; + or.b32 %r445, %r443, %r444; + add.s32 %r724, %r407, %r445; + or.b32 %r446, %r4, 2; + xor.b32 %r447, %r446, %r19; + shl.b32 %r448, %r447, 4; + or.b32 %r449, %r448, %r444; + add.s32 %r729, %r407, %r449; + or.b32 %r450, %r416, 4; + xor.b32 %r451, %r450, %r19; + shl.b32 %r452, %r451, 4; + or.b32 %r453, %r452, %r444; + add.s32 %r734, %r407, %r453; + or.b32 %r454, %r4, 6; + xor.b32 %r455, %r454, %r19; + shl.b32 %r456, %r455, 4; + or.b32 %r457, %r456, %r444; + add.s32 %r739, %r407, %r457; + add.s32 %r744, %r724, 2048; + add.s32 %r749, %r729, 2048; + add.s32 %r754, %r734, 2048; + add.s32 %r759, %r739, 2048; + add.s32 %r764, %r724, 4096; + add.s32 %r769, %r729, 4096; + add.s32 %r774, %r734, 4096; + add.s32 %r779, %r739, 4096; + add.s32 %r784, %r724, 6144; + add.s32 %r789, %r729, 6144; + add.s32 %r794, %r734, 6144; + add.s32 %r799, %r739, 6144; + add.s32 %r804, %r724, 8192; + add.s32 %r809, %r729, 8192; + add.s32 %r814, %r734, 8192; + add.s32 %r819, %r739, 8192; + add.s32 %r824, %r724, 10240; + add.s32 %r829, %r729, 10240; + add.s32 %r834, %r734, 10240; + add.s32 %r839, %r739, 10240; + add.s32 %r844, %r724, 12288; + add.s32 %r849, %r729, 12288; + add.s32 %r854, %r734, 12288; + add.s32 %r859, %r739, 12288; + add.s32 %r864, %r724, 14336; + add.s32 %r869, %r729, 14336; + add.s32 %r874, %r734, 14336; + add.s32 %r879, %r739, 14336; + shl.b32 %r458, %r3, 2; + add.s32 %r459, %r407, 49152; + add.s32 %r72, %r459, %r458; + shl.b32 %r460, %r17, 2; + add.s32 %r73, %r459, %r460; + add.s32 %r461, %r16, %r15; + shl.b32 %r462, %r461, 2; + add.s32 %r74, %r459, %r462; + add.s32 %r75, %r459, %r406; + add.s32 %r76, %r459, %r409; + add.s32 %r77, %r459, %r411; + add.s32 %r78, %r459, %r413; + shl.b32 %r463, %r17, 7; + shl.b32 %r464, %r16, 3; + or.b32 %r465, %r464, %r9; + or.b32 %r466, %r463, %r465; + shl.b32 %r467, %r466, 1; + add.s32 %r468, %r407, 65536; + add.s32 %r79, %r468, %r467; + shl.b32 %r469, %r18, 7; + or.b32 %r470, %r469, %r465; + shl.b32 %r471, %r470, 1; + add.s32 %r80, %r468, %r471; + xor.b32 %r472, %r466, 8; + shl.b32 %r473, %r472, 1; + add.s32 %r81, %r468, %r473; + xor.b32 %r474, %r470, 8; + shl.b32 %r475, %r474, 1; + add.s32 %r82, %r468, %r475; + xor.b32 %r476, %r466, 16; shl.b32 %r477, %r476, 1; - mov.u32 %r478, global_smem; - add.s32 %r479, %r478, %r477; - st.shared.b16 [%r479], %h1; - st.shared.b16 [%r479+2], %h2; - mov.b32 {%h3, %h4}, %r96; - st.shared.b16 [%r479+4], %h3; - st.shared.b16 [%r479+6], %h4; - mov.b32 {%h5, %h6}, %r97; - st.shared.b16 [%r479+8], %h5; - st.shared.b16 [%r479+10], %h6; - mov.b32 {%h7, %h8}, %r98; - st.shared.b16 [%r479+12], %h7; - st.shared.b16 [%r479+14], %h8; - mov.b32 {%h9, %h10}, %r99; - add.s32 %r13, %r12, 2048; - or.b32 %r480, %r13, %r5; + add.s32 %r83, %r468, %r477; + xor.b32 %r478, %r470, 16; + shl.b32 %r479, %r478, 1; + add.s32 %r84, %r468, %r479; + xor.b32 %r480, %r466, 24; shl.b32 %r481, %r480, 1; - add.s32 %r482, %r478, %r481; - st.shared.b16 [%r482], %h9; - st.shared.b16 [%r482+2], %h10; - mov.b32 {%h11, %h12}, %r100; - st.shared.b16 [%r482+4], %h11; - st.shared.b16 [%r482+6], %h12; - mov.b32 {%h13, %h14}, %r101; - st.shared.b16 [%r482+8], %h13; - st.shared.b16 [%r482+10], %h14; - mov.b32 {%h15, %h16}, %r102; - st.shared.b16 [%r482+12], %h15; - st.shared.b16 [%r482+14], %h16; - mov.b32 {%h17, %h18}, %r103; - add.s32 %r14, %r12, 4096; - or.b32 %r483, %r14, %r5; - shl.b32 %r484, %r483, 1; - add.s32 %r485, %r478, %r484; - st.shared.b16 [%r485], %h17; - st.shared.b16 [%r485+2], %h18; - mov.b32 {%h19, %h20}, %r104; - st.shared.b16 [%r485+4], %h19; - st.shared.b16 [%r485+6], %h20; - mov.b32 {%h21, %h22}, %r105; - st.shared.b16 [%r485+8], %h21; - st.shared.b16 [%r485+10], %h22; - mov.b32 {%h23, %h24}, %r106; - st.shared.b16 [%r485+12], %h23; - st.shared.b16 [%r485+14], %h24; - mov.b32 {%h25, %h26}, %r107; - add.s32 %r15, %r12, 6144; - or.b32 %r486, %r15, %r5; + add.s32 %r85, %r468, %r481; + xor.b32 %r482, %r470, 24; + shl.b32 %r483, %r482, 1; + add.s32 %r86, %r468, %r483; + xor.b32 %r484, %r466, 32; + shl.b32 %r485, %r484, 1; + add.s32 %r87, %r468, %r485; + xor.b32 %r486, %r470, 32; shl.b32 %r487, %r486, 1; - add.s32 %r488, %r478, %r487; - st.shared.b16 [%r488], %h25; - st.shared.b16 [%r488+2], %h26; - mov.b32 {%h27, %h28}, %r108; - st.shared.b16 [%r488+4], %h27; - st.shared.b16 [%r488+6], %h28; - mov.b32 {%h29, %h30}, %r109; - st.shared.b16 [%r488+8], %h29; - st.shared.b16 [%r488+10], %h30; - mov.b32 {%h31, %h32}, %r110; - st.shared.b16 [%r488+12], %h31; - st.shared.b16 [%r488+14], %h32; - bar.sync 0; - mov.b32 {%h33, %h34}, %r111; - add.s32 %r489, %r478, 16384; - add.s32 %r490, %r489, %r477; - st.shared.b16 [%r490], %h33; - st.shared.b16 [%r490+2], %h34; - mov.b32 {%h35, %h36}, %r112; - st.shared.b16 [%r490+4], %h35; - st.shared.b16 [%r490+6], %h36; - mov.b32 {%h37, %h38}, %r113; - st.shared.b16 [%r490+8], %h37; - st.shared.b16 [%r490+10], %h38; - mov.b32 {%h39, %h40}, %r114; - st.shared.b16 [%r490+12], %h39; - st.shared.b16 [%r490+14], %h40; - mov.b32 {%h41, %h42}, %r115; - add.s32 %r491, %r489, %r481; - st.shared.b16 [%r491], %h41; - st.shared.b16 [%r491+2], %h42; - mov.b32 {%h43, %h44}, %r116; - st.shared.b16 [%r491+4], %h43; - st.shared.b16 [%r491+6], %h44; - mov.b32 {%h45, %h46}, %r117; - st.shared.b16 [%r491+8], %h45; - st.shared.b16 [%r491+10], %h46; - mov.b32 {%h47, %h48}, %r118; - st.shared.b16 [%r491+12], %h47; - st.shared.b16 [%r491+14], %h48; - mov.b32 {%h49, %h50}, %r119; - add.s32 %r492, %r489, %r484; - st.shared.b16 [%r492], %h49; - st.shared.b16 [%r492+2], %h50; - mov.b32 {%h51, %h52}, %r120; - st.shared.b16 [%r492+4], %h51; - st.shared.b16 [%r492+6], %h52; - mov.b32 {%h53, %h54}, %r121; - st.shared.b16 [%r492+8], %h53; - st.shared.b16 [%r492+10], %h54; - mov.b32 {%h55, %h56}, %r122; - st.shared.b16 [%r492+12], %h55; - st.shared.b16 [%r492+14], %h56; - mov.b32 {%h57, %h58}, %r123; - add.s32 %r493, %r489, %r487; - st.shared.b16 [%r493], %h57; - st.shared.b16 [%r493+2], %h58; - mov.b32 {%h59, %h60}, %r124; - st.shared.b16 [%r493+4], %h59; - st.shared.b16 [%r493+6], %h60; - mov.b32 {%h61, %h62}, %r125; - st.shared.b16 [%r493+8], %h61; - st.shared.b16 [%r493+10], %h62; - mov.b32 {%h63, %h64}, %r126; - st.shared.b16 [%r493+12], %h63; - st.shared.b16 [%r493+14], %h64; - bar.sync 0; - bfe.u32 %r16, %r447, 3, 1; - bfe.u32 %r17, %r1, 4, 1; - shl.b32 %r18, %r17, 3; - or.b32 %r494, %r18, %r4; - shl.b32 %r495, %r494, 6; - shl.b32 %r19, %r16, 3; - or.b32 %r496, %r495, %r19; + add.s32 %r88, %r468, %r487; + xor.b32 %r488, %r466, 40; + shl.b32 %r489, %r488, 1; + add.s32 %r89, %r468, %r489; + xor.b32 %r490, %r470, 40; + shl.b32 %r491, %r490, 1; + add.s32 %r90, %r468, %r491; + xor.b32 %r492, %r466, 48; + shl.b32 %r493, %r492, 1; + add.s32 %r91, %r468, %r493; + xor.b32 %r494, %r470, 48; + shl.b32 %r495, %r494, 1; + add.s32 %r92, %r468, %r495; + xor.b32 %r496, %r466, 56; shl.b32 %r497, %r496, 1; - add.s32 %r131, %r478, %r497; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3591, %r3592, %r3593, %r3594 }, [ %r131 + 0 ]; - and.b32 %r498, %r1, 8; - or.b32 %r499, %r495, %r498; - shl.b32 %r500, %r499, 1; - add.s32 %r501, %r478, %r500; - add.s32 %r136, %r501, 32; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3623, %r3624, %r3625, %r3626 }, [ %r136 + 0 ]; - add.s32 %r141, %r131, 64; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3655, %r3656, %r3657, %r3658 }, [ %r141 + 0 ]; - add.s32 %r146, %r501, 96; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3687, %r3688, %r3689, %r3690 }, [ %r146 + 0 ]; - add.s32 %r151, %r131, 2048; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3595, %r3596, %r3597, %r3598 }, [ %r151 + 0 ]; - add.s32 %r156, %r501, 2080; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3627, %r3628, %r3629, %r3630 }, [ %r156 + 0 ]; - add.s32 %r161, %r131, 2112; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3659, %r3660, %r3661, %r3662 }, [ %r161 + 0 ]; - add.s32 %r166, %r501, 2144; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3691, %r3692, %r3693, %r3694 }, [ %r166 + 0 ]; - add.s32 %r171, %r131, 4096; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3599, %r3600, %r3601, %r3602 }, [ %r171 + 0 ]; - add.s32 %r176, %r501, 4128; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3631, %r3632, %r3633, %r3634 }, [ %r176 + 0 ]; - add.s32 %r181, %r131, 4160; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3663, %r3664, %r3665, %r3666 }, [ %r181 + 0 ]; - add.s32 %r186, %r501, 4192; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3695, %r3696, %r3697, %r3698 }, [ %r186 + 0 ]; - add.s32 %r191, %r131, 6144; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3603, %r3604, %r3605, %r3606 }, [ %r191 + 0 ]; - add.s32 %r196, %r501, 6176; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3635, %r3636, %r3637, %r3638 }, [ %r196 + 0 ]; - add.s32 %r201, %r131, 6208; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3667, %r3668, %r3669, %r3670 }, [ %r201 + 0 ]; - add.s32 %r206, %r501, 6240; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3699, %r3700, %r3701, %r3702 }, [ %r206 + 0 ]; - add.s32 %r211, %r131, 8192; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3607, %r3608, %r3609, %r3610 }, [ %r211 + 0 ]; - add.s32 %r216, %r501, 8224; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3639, %r3640, %r3641, %r3642 }, [ %r216 + 0 ]; - add.s32 %r221, %r131, 8256; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3671, %r3672, %r3673, %r3674 }, [ %r221 + 0 ]; - add.s32 %r226, %r501, 8288; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3703, %r3704, %r3705, %r3706 }, [ %r226 + 0 ]; - add.s32 %r231, %r131, 10240; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3611, %r3612, %r3613, %r3614 }, [ %r231 + 0 ]; - add.s32 %r236, %r501, 10272; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3643, %r3644, %r3645, %r3646 }, [ %r236 + 0 ]; - add.s32 %r241, %r131, 10304; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3675, %r3676, %r3677, %r3678 }, [ %r241 + 0 ]; - add.s32 %r246, %r501, 10336; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3707, %r3708, %r3709, %r3710 }, [ %r246 + 0 ]; - add.s32 %r251, %r131, 12288; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3615, %r3616, %r3617, %r3618 }, [ %r251 + 0 ]; - add.s32 %r256, %r501, 12320; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3647, %r3648, %r3649, %r3650 }, [ %r256 + 0 ]; - add.s32 %r261, %r131, 12352; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3679, %r3680, %r3681, %r3682 }, [ %r261 + 0 ]; - add.s32 %r266, %r501, 12384; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3711, %r3712, %r3713, %r3714 }, [ %r266 + 0 ]; - add.s32 %r271, %r131, 14336; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3619, %r3620, %r3621, %r3622 }, [ %r271 + 0 ]; - add.s32 %r276, %r501, 14368; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3651, %r3652, %r3653, %r3654 }, [ %r276 + 0 ]; - add.s32 %r281, %r131, 14400; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3683, %r3684, %r3685, %r3686 }, [ %r281 + 0 ]; - add.s32 %r286, %r501, 14432; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3715, %r3716, %r3717, %r3718 }, [ %r286 + 0 ]; - add.s32 %r291, %r489, %r497; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3719, %r3720, %r3721, %r3722 }, [ %r291 + 0 ]; - add.s32 %r502, %r489, %r500; - add.s32 %r296, %r502, 32; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3751, %r3752, %r3753, %r3754 }, [ %r296 + 0 ]; - add.s32 %r301, %r291, 64; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3783, %r3784, %r3785, %r3786 }, [ %r301 + 0 ]; - add.s32 %r306, %r502, 96; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3815, %r3816, %r3817, %r3818 }, [ %r306 + 0 ]; - add.s32 %r311, %r291, 2048; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3723, %r3724, %r3725, %r3726 }, [ %r311 + 0 ]; - add.s32 %r316, %r502, 2080; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3755, %r3756, %r3757, %r3758 }, [ %r316 + 0 ]; - add.s32 %r321, %r291, 2112; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3787, %r3788, %r3789, %r3790 }, [ %r321 + 0 ]; - add.s32 %r326, %r502, 2144; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3819, %r3820, %r3821, %r3822 }, [ %r326 + 0 ]; - add.s32 %r331, %r291, 4096; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3727, %r3728, %r3729, %r3730 }, [ %r331 + 0 ]; - add.s32 %r336, %r502, 4128; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3759, %r3760, %r3761, %r3762 }, [ %r336 + 0 ]; - add.s32 %r341, %r291, 4160; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3791, %r3792, %r3793, %r3794 }, [ %r341 + 0 ]; - add.s32 %r346, %r502, 4192; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3823, %r3824, %r3825, %r3826 }, [ %r346 + 0 ]; - add.s32 %r351, %r291, 6144; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3731, %r3732, %r3733, %r3734 }, [ %r351 + 0 ]; - add.s32 %r356, %r502, 6176; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3763, %r3764, %r3765, %r3766 }, [ %r356 + 0 ]; - add.s32 %r361, %r291, 6208; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3795, %r3796, %r3797, %r3798 }, [ %r361 + 0 ]; - add.s32 %r366, %r502, 6240; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3827, %r3828, %r3829, %r3830 }, [ %r366 + 0 ]; - add.s32 %r371, %r291, 8192; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3735, %r3736, %r3737, %r3738 }, [ %r371 + 0 ]; - add.s32 %r376, %r502, 8224; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3767, %r3768, %r3769, %r3770 }, [ %r376 + 0 ]; - add.s32 %r381, %r291, 8256; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3799, %r3800, %r3801, %r3802 }, [ %r381 + 0 ]; - add.s32 %r386, %r502, 8288; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3831, %r3832, %r3833, %r3834 }, [ %r386 + 0 ]; - add.s32 %r391, %r291, 10240; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3739, %r3740, %r3741, %r3742 }, [ %r391 + 0 ]; - add.s32 %r396, %r502, 10272; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3771, %r3772, %r3773, %r3774 }, [ %r396 + 0 ]; - add.s32 %r401, %r291, 10304; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3803, %r3804, %r3805, %r3806 }, [ %r401 + 0 ]; - add.s32 %r406, %r502, 10336; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3835, %r3836, %r3837, %r3838 }, [ %r406 + 0 ]; - add.s32 %r411, %r291, 12288; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3743, %r3744, %r3745, %r3746 }, [ %r411 + 0 ]; - add.s32 %r416, %r502, 12320; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3775, %r3776, %r3777, %r3778 }, [ %r416 + 0 ]; - add.s32 %r421, %r291, 12352; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3807, %r3808, %r3809, %r3810 }, [ %r421 + 0 ]; - add.s32 %r426, %r502, 12384; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3839, %r3840, %r3841, %r3842 }, [ %r426 + 0 ]; - add.s32 %r431, %r291, 14336; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3747, %r3748, %r3749, %r3750 }, [ %r431 + 0 ]; - add.s32 %r436, %r502, 14368; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3779, %r3780, %r3781, %r3782 }, [ %r436 + 0 ]; - add.s32 %r441, %r291, 14400; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3811, %r3812, %r3813, %r3814 }, [ %r441 + 0 ]; - add.s32 %r446, %r502, 14432; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3843, %r3844, %r3845, %r3846 }, [ %r446 + 0 ]; - setp.lt.s32 %p9, %r11, 1; - mov.f32 %f129, 0f00000000; - mov.f32 %f515, %f129; - mov.f32 %f516, %f129; - mov.f32 %f517, %f129; - mov.f32 %f518, %f129; - mov.f32 %f519, %f129; - mov.f32 %f520, %f129; - mov.f32 %f521, %f129; - mov.f32 %f522, %f129; - mov.f32 %f523, %f129; - mov.f32 %f524, %f129; - mov.f32 %f525, %f129; - mov.f32 %f526, %f129; - mov.f32 %f527, %f129; - mov.f32 %f528, %f129; - mov.f32 %f529, %f129; - mov.f32 %f530, %f129; - mov.f32 %f531, %f129; - mov.f32 %f532, %f129; - mov.f32 %f533, %f129; - mov.f32 %f534, %f129; - mov.f32 %f535, %f129; - mov.f32 %f536, %f129; - mov.f32 %f537, %f129; - mov.f32 %f538, %f129; - mov.f32 %f539, %f129; - mov.f32 %f540, %f129; - mov.f32 %f541, %f129; - mov.f32 %f542, %f129; - mov.f32 %f543, %f129; - mov.f32 %f544, %f129; - mov.f32 %f545, %f129; - mov.f32 %f546, %f129; - @%p9 bra LBB0_3; - ld.param.f32 %f97, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3]; - ld.param.u64 %rd27, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0]; - ld.param.u64 %rd30, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5]; - and.b32 %r7, %r6, 112; - or.b32 %r9, %r7, %r8; - add.s64 %rd34, %rd27, %rd33; - add.s64 %rd37, %rd30, %rd33; - add.s64 %rd46, %rd34, %rd42; - add.s64 %rd47, %rd34, %rd43; - add.s64 %rd48, %rd34, %rd44; - add.s64 %rd49, %rd34, %rd45; - add.s64 %rd50, %rd37, %rd42; - add.s64 %rd51, %rd37, %rd43; - add.s64 %rd52, %rd37, %rd44; - add.s64 %rd53, %rd37, %rd45; - mov.b32 %hh1, %r3591; - mov.b32 %hh2, %r3592; - mov.b32 %hh3, %r3593; - mov.b32 %hh4, %r3594; - mov.b32 %hh5, %r3623; - mov.b32 %hh6, %r3624; - mov.b32 %hh7, %r3625; - mov.b32 %hh8, %r3626; - mov.b32 %hh9, %r3655; - mov.b32 %hh10, %r3656; - mov.b32 %hh11, %r3657; - mov.b32 %hh12, %r3658; - mov.b32 %hh13, %r3687; - mov.b32 %hh14, %r3688; - mov.b32 %hh15, %r3689; - mov.b32 %hh16, %r3690; - mov.b32 %hh17, %r3595; - mov.b32 %hh18, %r3596; - mov.b32 %hh19, %r3597; - mov.b32 %hh20, %r3598; - mov.b32 %hh21, %r3627; - mov.b32 %hh22, %r3628; - mov.b32 %hh23, %r3629; - mov.b32 %hh24, %r3630; - mov.b32 %hh25, %r3659; - mov.b32 %hh26, %r3660; - mov.b32 %hh27, %r3661; - mov.b32 %hh28, %r3662; - mov.b32 %hh29, %r3691; - mov.b32 %hh30, %r3692; - mov.b32 %hh31, %r3693; - mov.b32 %hh32, %r3694; - mov.b32 %hh33, %r3599; - mov.b32 %hh34, %r3600; - mov.b32 %hh35, %r3601; - mov.b32 %hh36, %r3602; - mov.b32 %hh37, %r3631; - mov.b32 %hh38, %r3632; - mov.b32 %hh39, %r3633; - mov.b32 %hh40, %r3634; - mov.b32 %hh41, %r3663; - mov.b32 %hh42, %r3664; - mov.b32 %hh43, %r3665; - mov.b32 %hh44, %r3666; - mov.b32 %hh45, %r3695; - mov.b32 %hh46, %r3696; - mov.b32 %hh47, %r3697; - mov.b32 %hh48, %r3698; - mov.b32 %hh49, %r3603; - mov.b32 %hh50, %r3604; - mov.b32 %hh51, %r3605; - mov.b32 %hh52, %r3606; - mov.b32 %hh53, %r3635; - mov.b32 %hh54, %r3636; - mov.b32 %hh55, %r3637; - mov.b32 %hh56, %r3638; - mov.b32 %hh57, %r3667; - mov.b32 %hh58, %r3668; - mov.b32 %hh59, %r3669; - mov.b32 %hh60, %r3670; - mov.b32 %hh61, %r3699; - mov.b32 %hh62, %r3700; - mov.b32 %hh63, %r3701; - mov.b32 %hh64, %r3702; - mov.b32 %hh65, %r3607; - mov.b32 %hh66, %r3608; - mov.b32 %hh67, %r3609; - mov.b32 %hh68, %r3610; - mov.b32 %hh69, %r3639; - mov.b32 %hh70, %r3640; - mov.b32 %hh71, %r3641; - mov.b32 %hh72, %r3642; - mov.b32 %hh73, %r3671; - mov.b32 %hh74, %r3672; - mov.b32 %hh75, %r3673; - mov.b32 %hh76, %r3674; - mov.b32 %hh77, %r3703; - mov.b32 %hh78, %r3704; - mov.b32 %hh79, %r3705; - mov.b32 %hh80, %r3706; - mov.b32 %hh81, %r3611; - mov.b32 %hh82, %r3612; - mov.b32 %hh83, %r3613; - mov.b32 %hh84, %r3614; - mov.b32 %hh85, %r3643; - mov.b32 %hh86, %r3644; - mov.b32 %hh87, %r3645; - mov.b32 %hh88, %r3646; - mov.b32 %hh89, %r3675; - mov.b32 %hh90, %r3676; - mov.b32 %hh91, %r3677; - mov.b32 %hh92, %r3678; - mov.b32 %hh93, %r3707; - mov.b32 %hh94, %r3708; - mov.b32 %hh95, %r3709; - mov.b32 %hh96, %r3710; - mov.b32 %hh97, %r3615; - mov.b32 %hh98, %r3616; - mov.b32 %hh99, %r3617; - mov.b32 %hh100, %r3618; - mov.b32 %hh101, %r3647; - mov.b32 %hh102, %r3648; - mov.b32 %hh103, %r3649; - mov.b32 %hh104, %r3650; - mov.b32 %hh105, %r3679; - mov.b32 %hh106, %r3680; - mov.b32 %hh107, %r3681; - mov.b32 %hh108, %r3682; - mov.b32 %hh109, %r3711; - mov.b32 %hh110, %r3712; - mov.b32 %hh111, %r3713; - mov.b32 %hh112, %r3714; - mov.b32 %hh113, %r3619; - mov.b32 %hh114, %r3620; - mov.b32 %hh115, %r3621; - mov.b32 %hh116, %r3622; - mov.b32 %hh117, %r3651; - mov.b32 %hh118, %r3652; - mov.b32 %hh119, %r3653; - mov.b32 %hh120, %r3654; - mov.b32 %hh121, %r3683; - mov.b32 %hh122, %r3684; - mov.b32 %hh123, %r3685; - mov.b32 %hh124, %r3686; - mov.b32 %hh125, %r3715; - mov.b32 %hh126, %r3716; - mov.b32 %hh127, %r3717; - mov.b32 %hh128, %r3718; - mov.b32 %hh129, %r3719; - mov.b32 %hh130, %r3720; - mov.b32 %hh131, %r3721; - mov.b32 %hh132, %r3722; - mov.b32 %hh133, %r3751; - mov.b32 %hh134, %r3752; - mov.b32 %hh135, %r3753; - mov.b32 %hh136, %r3754; - mov.b32 %hh137, %r3783; - mov.b32 %hh138, %r3784; - mov.b32 %hh139, %r3785; - mov.b32 %hh140, %r3786; - mov.b32 %hh141, %r3815; - mov.b32 %hh142, %r3816; - mov.b32 %hh143, %r3817; - mov.b32 %hh144, %r3818; - mov.b32 %hh145, %r3723; - mov.b32 %hh146, %r3724; - mov.b32 %hh147, %r3725; - mov.b32 %hh148, %r3726; - mov.b32 %hh149, %r3755; - mov.b32 %hh150, %r3756; - mov.b32 %hh151, %r3757; - mov.b32 %hh152, %r3758; - mov.b32 %hh153, %r3787; - mov.b32 %hh154, %r3788; - mov.b32 %hh155, %r3789; - mov.b32 %hh156, %r3790; - mov.b32 %hh157, %r3819; - mov.b32 %hh158, %r3820; - mov.b32 %hh159, %r3821; - mov.b32 %hh160, %r3822; - mov.b32 %hh161, %r3727; - mov.b32 %hh162, %r3728; - mov.b32 %hh163, %r3729; - mov.b32 %hh164, %r3730; - mov.b32 %hh165, %r3759; - mov.b32 %hh166, %r3760; - mov.b32 %hh167, %r3761; - mov.b32 %hh168, %r3762; - mov.b32 %hh169, %r3791; - mov.b32 %hh170, %r3792; - mov.b32 %hh171, %r3793; - mov.b32 %hh172, %r3794; - mov.b32 %hh173, %r3823; - mov.b32 %hh174, %r3824; - mov.b32 %hh175, %r3825; - mov.b32 %hh176, %r3826; - mov.b32 %hh177, %r3731; - mov.b32 %hh178, %r3732; - mov.b32 %hh179, %r3733; - mov.b32 %hh180, %r3734; - mov.b32 %hh181, %r3763; - mov.b32 %hh182, %r3764; - mov.b32 %hh183, %r3765; - mov.b32 %hh184, %r3766; - mov.b32 %hh185, %r3795; - mov.b32 %hh186, %r3796; - mov.b32 %hh187, %r3797; - mov.b32 %hh188, %r3798; - mov.b32 %hh189, %r3827; - mov.b32 %hh190, %r3828; - mov.b32 %hh191, %r3829; - mov.b32 %hh192, %r3830; - mov.b32 %hh193, %r3735; - mov.b32 %hh194, %r3736; - mov.b32 %hh195, %r3737; - mov.b32 %hh196, %r3738; - mov.b32 %hh197, %r3767; - mov.b32 %hh198, %r3768; - mov.b32 %hh199, %r3769; - mov.b32 %hh200, %r3770; - mov.b32 %hh201, %r3799; - mov.b32 %hh202, %r3800; - mov.b32 %hh203, %r3801; - mov.b32 %hh204, %r3802; - mov.b32 %hh205, %r3831; - mov.b32 %hh206, %r3832; - mov.b32 %hh207, %r3833; - mov.b32 %hh208, %r3834; - mov.b32 %hh209, %r3739; - mov.b32 %hh210, %r3740; - mov.b32 %hh211, %r3741; - mov.b32 %hh212, %r3742; - mov.b32 %hh213, %r3771; - mov.b32 %hh214, %r3772; - mov.b32 %hh215, %r3773; - mov.b32 %hh216, %r3774; - mov.b32 %hh217, %r3803; - mov.b32 %hh218, %r3804; - mov.b32 %hh219, %r3805; - mov.b32 %hh220, %r3806; - mov.b32 %hh221, %r3835; - mov.b32 %hh222, %r3836; - mov.b32 %hh223, %r3837; - mov.b32 %hh224, %r3838; - mov.b32 %hh225, %r3743; - mov.b32 %hh226, %r3744; - mov.b32 %hh227, %r3745; - mov.b32 %hh228, %r3746; - mov.b32 %hh229, %r3775; - mov.b32 %hh230, %r3776; - mov.b32 %hh231, %r3777; - mov.b32 %hh232, %r3778; - mov.b32 %hh233, %r3807; - mov.b32 %hh234, %r3808; - mov.b32 %hh235, %r3809; - mov.b32 %hh236, %r3810; - mov.b32 %hh237, %r3839; - mov.b32 %hh238, %r3840; - mov.b32 %hh239, %r3841; - mov.b32 %hh240, %r3842; - mov.b32 %hh241, %r3747; - mov.b32 %hh242, %r3748; - mov.b32 %hh243, %r3749; - mov.b32 %hh244, %r3750; - mov.b32 %hh245, %r3779; - mov.b32 %hh246, %r3780; - mov.b32 %hh247, %r3781; - mov.b32 %hh248, %r3782; - mov.b32 %hh249, %r3811; - mov.b32 %hh250, %r3812; - mov.b32 %hh251, %r3813; - mov.b32 %hh252, %r3814; - mov.b32 %hh253, %r3843; - mov.b32 %hh254, %r3844; - mov.b32 %hh255, %r3845; - mov.b32 %hh256, %r3846; - and.b32 %r504, %r3, 7; - xor.b32 %r505, %r504, %r4; - shl.b32 %r506, %r505, 3; - or.b32 %r507, %r506, %r12; - shl.b32 %r508, %r507, 1; - add.s32 %r20, %r478, %r508; - or.b32 %r510, %r506, %r13; + add.s32 %r93, %r468, %r497; + xor.b32 %r498, %r470, 56; + shl.b32 %r499, %r498, 1; + add.s32 %r94, %r468, %r499; + xor.b32 %r500, %r466, 72; + shl.b32 %r501, %r500, 1; + add.s32 %r97, %r468, %r501; + xor.b32 %r502, %r470, 72; + shl.b32 %r503, %r502, 1; + add.s32 %r98, %r468, %r503; + xor.b32 %r504, %r466, 80; + shl.b32 %r505, %r504, 1; + add.s32 %r99, %r468, %r505; + xor.b32 %r506, %r470, 80; + shl.b32 %r507, %r506, 1; + add.s32 %r100, %r468, %r507; + xor.b32 %r508, %r466, 88; + shl.b32 %r509, %r508, 1; + add.s32 %r101, %r468, %r509; + xor.b32 %r510, %r470, 88; shl.b32 %r511, %r510, 1; - add.s32 %r21, %r478, %r511; - or.b32 %r512, %r506, %r14; + add.s32 %r102, %r468, %r511; + xor.b32 %r512, %r466, 96; shl.b32 %r513, %r512, 1; - add.s32 %r22, %r478, %r513; - or.b32 %r514, %r506, %r15; + add.s32 %r103, %r468, %r513; + xor.b32 %r514, %r470, 96; shl.b32 %r515, %r514, 1; - add.s32 %r23, %r478, %r515; - or.b32 %r516, %r19, %r4; - or.b32 %r517, %r516, %r7; - xor.b32 %r518, %r17, %r4; - shl.b32 %r519, %r518, 4; - shl.b32 %r520, %r517, 7; - or.b32 %r521, %r520, %r519; - add.s32 %r599, %r478, %r521; - or.b32 %r522, %r17, 2; - xor.b32 %r523, %r522, %r4; - shl.b32 %r524, %r523, 4; - or.b32 %r525, %r524, %r520; - add.s32 %r604, %r478, %r525; - or.b32 %r526, %r17, 4; - xor.b32 %r527, %r526, %r4; - shl.b32 %r528, %r527, 4; - or.b32 %r529, %r528, %r520; - add.s32 %r609, %r478, %r529; - or.b32 %r530, %r17, 6; - xor.b32 %r531, %r530, %r4; - shl.b32 %r532, %r531, 4; - or.b32 %r533, %r532, %r520; - add.s32 %r614, %r478, %r533; - shl.b32 %r534, %r9, 1; - shl.b32 %r535, %r10, 8; - or.b32 %r536, %r535, %r534; - add.s32 %r28, %r478, %r536; - or.b32 %r537, %r536, 256; - add.s32 %r30, %r478, %r537; - add.s32 %r31, %r28, 2048; - or.b32 %r538, %r536, 2304; - add.s32 %r32, %r478, %r538; - add.s32 %r33, %r28, 4096; - or.b32 %r539, %r536, 4352; - add.s32 %r34, %r478, %r539; - add.s32 %r35, %r28, 6144; - or.b32 %r540, %r536, 6400; - add.s32 %r36, %r478, %r540; - add.s32 %r37, %r28, 8192; - or.b32 %r541, %r536, 8448; - add.s32 %r38, %r478, %r541; - add.s32 %r39, %r28, 10240; - or.b32 %r542, %r536, 10496; - add.s32 %r40, %r478, %r542; - add.s32 %r41, %r28, 12288; - or.b32 %r543, %r536, 12544; - add.s32 %r42, %r478, %r543; - add.s32 %r43, %r28, 14336; - or.b32 %r544, %r536, 14592; - add.s32 %r44, %r478, %r544; - add.s32 %r45, %r28, 16384; - or.b32 %r545, %r536, 16640; - add.s32 %r46, %r478, %r545; - add.s32 %r47, %r28, 18432; - or.b32 %r546, %r536, 18688; - add.s32 %r48, %r478, %r546; - add.s32 %r49, %r28, 20480; - or.b32 %r547, %r536, 20736; - add.s32 %r50, %r478, %r547; - add.s32 %r51, %r28, 22528; - or.b32 %r548, %r536, 22784; - add.s32 %r52, %r478, %r548; - add.s32 %r53, %r28, 24576; - or.b32 %r549, %r536, 24832; - add.s32 %r54, %r478, %r549; - add.s32 %r55, %r28, 26624; - or.b32 %r550, %r536, 26880; - add.s32 %r56, %r478, %r550; - add.s32 %r57, %r28, 28672; - or.b32 %r551, %r536, 28928; - add.s32 %r58, %r478, %r551; - add.s32 %r59, %r28, 30720; - or.b32 %r552, %r536, 30976; - add.s32 %r60, %r478, %r552; - shl.b32 %r553, %r2, 1; - and.b32 %r554, %r553, 6; - or.b32 %r555, %r554, %r16; - shl.b32 %r556, %r555, 10; - shl.b32 %r557, %r4, 7; - or.b32 %r558, %r556, %r557; - or.b32 %r559, %r558, %r18; - shl.b32 %r560, %r559, 1; - add.s32 %r1531, %r478, %r560; - shl.b32 %r561, %r522, 4; - shl.b32 %r562, %r558, 1; - or.b32 %r563, %r561, %r562; - add.s32 %r1536, %r478, %r563; - shl.b32 %r564, %r526, 4; - or.b32 %r565, %r564, %r562; - add.s32 %r1541, %r478, %r565; - shl.b32 %r566, %r530, 4; - or.b32 %r567, %r566, %r562; - add.s32 %r1546, %r478, %r567; - add.s32 %r1551, %r1531, 128; - add.s32 %r1556, %r1531, 160; - add.s32 %r1561, %r1531, 192; - add.s32 %r1566, %r1531, 224; - add.s32 %r1571, %r1531, 16384; - add.s32 %r1576, %r1536, 16384; - add.s32 %r1581, %r1541, 16384; - add.s32 %r1586, %r1546, 16384; - add.s32 %r1591, %r1531, 16512; - add.s32 %r1596, %r1531, 16544; - add.s32 %r1601, %r1531, 16576; - add.s32 %r1606, %r1531, 16608; - bfe.u32 %r568, %r1, 7, 1; - shl.b32 %r569, %r17, 1; - or.b32 %r570, %r569, %r568; - xor.b32 %r571, %r570, %r4; - shl.b32 %r572, %r571, 4; - shl.b32 %r573, %r516, 7; - or.b32 %r574, %r572, %r573; - add.s32 %r1611, %r478, %r574; - add.s32 %r1616, %r1611, 2048; - add.s32 %r1621, %r1611, 4096; - add.s32 %r1626, %r1611, 6144; - add.s32 %r1631, %r1611, 8192; - add.s32 %r1636, %r1611, 10240; - add.s32 %r1641, %r1611, 12288; - add.s32 %r1646, %r1611, 14336; - or.b32 %r575, %r570, 4; - xor.b32 %r576, %r575, %r4; - shl.b32 %r577, %r576, 4; - or.b32 %r578, %r577, %r573; - add.s32 %r1651, %r478, %r578; - add.s32 %r1656, %r1651, 2048; - add.s32 %r1661, %r1651, 4096; - add.s32 %r1666, %r1651, 6144; - add.s32 %r1671, %r1651, 8192; - add.s32 %r1676, %r1651, 10240; - add.s32 %r1681, %r1651, 12288; - add.s32 %r1686, %r1651, 14336; - mov.f32 %f515, 0f00000000; - mov.u32 %r625, 0; - mov.f32 %f516, %f515; - mov.f32 %f517, %f515; - mov.f32 %f518, %f515; - mov.f32 %f519, %f515; - mov.f32 %f520, %f515; - mov.f32 %f521, %f515; - mov.f32 %f522, %f515; - mov.f32 %f523, %f515; - mov.f32 %f524, %f515; - mov.f32 %f525, %f515; - mov.f32 %f526, %f515; - mov.f32 %f527, %f515; - mov.f32 %f528, %f515; - mov.f32 %f529, %f515; - mov.f32 %f530, %f515; - mov.f32 %f531, %f515; - mov.f32 %f532, %f515; - mov.f32 %f533, %f515; - mov.f32 %f534, %f515; - mov.f32 %f535, %f515; - mov.f32 %f536, %f515; - mov.f32 %f537, %f515; - mov.f32 %f538, %f515; - mov.f32 %f539, %f515; - mov.f32 %f540, %f515; - mov.f32 %f541, %f515; - mov.f32 %f542, %f515; - mov.f32 %f543, %f515; - mov.f32 %f544, %f515; - mov.f32 %f545, %f515; - mov.f32 %f546, %f515; - mov.u32 %r3847, %r625; + add.s32 %r104, %r468, %r515; + xor.b32 %r516, %r466, 104; + shl.b32 %r517, %r516, 1; + add.s32 %r105, %r468, %r517; + xor.b32 %r518, %r470, 104; + shl.b32 %r519, %r518, 1; + add.s32 %r106, %r468, %r519; + xor.b32 %r520, %r466, 112; + shl.b32 %r521, %r520, 1; + add.s32 %r107, %r468, %r521; + xor.b32 %r522, %r470, 112; + shl.b32 %r523, %r522, 1; + add.s32 %r108, %r468, %r523; + xor.b32 %r524, %r466, 120; + shl.b32 %r525, %r524, 1; + add.s32 %r109, %r468, %r525; + xor.b32 %r526, %r470, 120; + shl.b32 %r527, %r526, 1; + add.s32 %r110, %r468, %r527; + shl.b32 %r528, %r2, 1; + or.b32 %r529, %r528, %r416; + xor.b32 %r530, %r529, %r19; + shl.b32 %r531, %r530, 4; + shl.b32 %r532, %r441, 8; + or.b32 %r533, %r531, %r532; + add.s32 %r1797, %r468, %r533; + add.s32 %r1802, %r1797, 4096; + add.s32 %r1807, %r1797, 8192; + add.s32 %r1812, %r1797, 12288; + add.s32 %r1817, %r1797, 16384; + add.s32 %r1822, %r1797, 20480; + add.s32 %r1827, %r1797, 24576; + add.s32 %r1832, %r1797, 28672; + or.b32 %r534, %r529, 8; + xor.b32 %r535, %r534, %r19; + shl.b32 %r536, %r535, 4; + or.b32 %r537, %r536, %r532; + add.s32 %r1837, %r468, %r537; + add.s32 %r1842, %r1837, 4096; + add.s32 %r1847, %r1837, 8192; + add.s32 %r1852, %r1837, 12288; + add.s32 %r1857, %r1837, 16384; + add.s32 %r1862, %r1837, 20480; + add.s32 %r1867, %r1837, 24576; + add.s32 %r1872, %r1837, 28672; + bfe.u32 %r538, %r1, 7, 1; + shl.b32 %r539, %r10, 1; + or.b32 %r540, %r539, %r538; + xor.b32 %r541, %r540, %r19; + shl.b32 %r542, %r416, 9; + shl.b32 %r543, %r19, 6; + or.b32 %r544, %r542, %r543; + shl.b32 %r545, %r541, 4; + shl.b32 %r546, %r544, 1; + or.b32 %r547, %r545, %r546; + add.s32 %r1877, %r459, %r547; + add.s32 %r1882, %r1877, 2048; + add.s32 %r1887, %r1877, 4096; + add.s32 %r1892, %r1877, 6144; + add.s32 %r1897, %r1877, 8192; + add.s32 %r1902, %r1877, 10240; + add.s32 %r1907, %r1877, 12288; + add.s32 %r1912, %r1877, 14336; + or.b32 %r548, %r540, 4; + xor.b32 %r549, %r548, %r19; + shl.b32 %r550, %r549, 4; + or.b32 %r551, %r550, %r546; + add.s32 %r1917, %r459, %r551; + add.s32 %r1922, %r1917, 2048; + add.s32 %r1927, %r1917, 4096; + add.s32 %r1932, %r1917, 6144; + add.s32 %r1937, %r1917, 8192; + add.s32 %r1942, %r1917, 10240; + add.s32 %r1947, %r1917, 12288; + add.s32 %r1952, %r1917, 14336; + add.s32 %r143, %r468, %r458; + add.s32 %r144, %r468, %r460; + add.s32 %r145, %r468, %r462; + add.s32 %r2854, %r459, %r424; + add.s32 %r2859, %r459, %r429; + add.s32 %r2864, %r459, %r434; + add.s32 %r2869, %r459, %r439; + add.s32 %r2874, %r414, %r445; + add.s32 %r2879, %r414, %r449; + add.s32 %r2884, %r414, %r453; + add.s32 %r2889, %r414, %r457; + add.s32 %r2894, %r2874, 2048; + add.s32 %r2899, %r2879, 2048; + add.s32 %r2904, %r2884, 2048; + add.s32 %r2909, %r2889, 2048; + add.s32 %r2914, %r2874, 4096; + add.s32 %r2919, %r2879, 4096; + add.s32 %r2924, %r2884, 4096; + add.s32 %r2929, %r2889, 4096; + add.s32 %r2934, %r2874, 6144; + add.s32 %r2939, %r2879, 6144; + add.s32 %r2944, %r2884, 6144; + add.s32 %r2949, %r2889, 6144; + add.s32 %r2954, %r2874, 8192; + add.s32 %r2959, %r2879, 8192; + add.s32 %r2964, %r2884, 8192; + add.s32 %r2969, %r2889, 8192; + add.s32 %r2974, %r2874, 10240; + add.s32 %r2979, %r2879, 10240; + add.s32 %r2984, %r2884, 10240; + add.s32 %r2989, %r2889, 10240; + add.s32 %r2994, %r2874, 12288; + add.s32 %r2999, %r2879, 12288; + add.s32 %r3004, %r2884, 12288; + add.s32 %r3009, %r2889, 12288; + add.s32 %r3014, %r2874, 14336; + add.s32 %r3019, %r2879, 14336; + add.s32 %r3024, %r2884, 14336; + add.s32 %r3029, %r2889, 14336; + add.s32 %r552, %r407, 50176; + add.s32 %r182, %r552, %r467; + add.s32 %r183, %r552, %r471; + add.s32 %r184, %r552, %r473; + add.s32 %r185, %r552, %r475; + add.s32 %r186, %r552, %r477; + add.s32 %r187, %r552, %r479; + add.s32 %r188, %r552, %r481; + add.s32 %r189, %r552, %r483; + add.s32 %r190, %r552, %r485; + add.s32 %r191, %r552, %r487; + add.s32 %r192, %r552, %r489; + add.s32 %r193, %r552, %r491; + add.s32 %r194, %r552, %r493; + add.s32 %r195, %r552, %r495; + add.s32 %r196, %r552, %r497; + add.s32 %r197, %r552, %r499; + add.s32 %r200, %r552, %r501; + add.s32 %r201, %r552, %r503; + add.s32 %r202, %r552, %r505; + add.s32 %r203, %r552, %r507; + add.s32 %r204, %r552, %r509; + add.s32 %r205, %r552, %r511; + add.s32 %r206, %r552, %r513; + add.s32 %r207, %r552, %r515; + add.s32 %r208, %r552, %r517; + add.s32 %r209, %r552, %r519; + add.s32 %r210, %r552, %r521; + add.s32 %r211, %r552, %r523; + add.s32 %r212, %r552, %r525; + add.s32 %r213, %r552, %r527; + add.s32 %r3930, %r552, %r533; + add.s32 %r3935, %r3930, 4096; + add.s32 %r3940, %r3930, 8192; + add.s32 %r3945, %r3930, 12288; + add.s32 %r3950, %r3930, 16384; + add.s32 %r3955, %r3930, 20480; + add.s32 %r3960, %r3930, 24576; + add.s32 %r3965, %r3930, 28672; + add.s32 %r3970, %r552, %r537; + add.s32 %r3975, %r3970, 4096; + add.s32 %r3980, %r3970, 8192; + add.s32 %r3985, %r3970, 12288; + add.s32 %r3990, %r3970, 16384; + add.s32 %r3995, %r3970, 20480; + add.s32 %r4000, %r3970, 24576; + add.s32 %r4005, %r3970, 28672; + add.s32 %r4010, %r415, %r547; + add.s32 %r4015, %r4010, 2048; + add.s32 %r4020, %r4010, 4096; + add.s32 %r4025, %r4010, 6144; + add.s32 %r4030, %r4010, 8192; + add.s32 %r4035, %r4010, 10240; + add.s32 %r4040, %r4010, 12288; + add.s32 %r4045, %r4010, 14336; + add.s32 %r4050, %r415, %r551; + add.s32 %r4055, %r4050, 2048; + add.s32 %r4060, %r4050, 4096; + add.s32 %r4065, %r4050, 6144; + add.s32 %r4070, %r4050, 8192; + add.s32 %r4075, %r4050, 10240; + add.s32 %r4080, %r4050, 12288; + add.s32 %r4085, %r4050, 14336; + mad.lo.s32 %r553, %r11, 68, %r21; + shl.b32 %r554, %r553, 2; + add.s32 %r246, %r415, %r554; + shl.b32 %r555, %r2, 4; + or.b32 %r556, %r555, %r16; + and.b32 %r558, %r386, 56; + or.b32 %r559, %r9, %r558; + mad.lo.s32 %r560, %r556, 68, %r559; + shl.b32 %r561, %r560, 2; + add.s32 %r247, %r415, %r561; + or.b32 %r562, %r556, 8; + mad.lo.s32 %r563, %r562, 68, %r559; + shl.b32 %r564, %r563, 2; + add.s32 %r248, %r415, %r564; + shl.b32 %r565, %r529, 10; + shl.b32 %r566, %r19, 7; + or.b32 %r567, %r565, %r566; + or.b32 %r568, %r567, %r422; + shl.b32 %r569, %r568, 1; + add.s32 %r5018, %r552, %r569; + or.b32 %r570, %r567, %r427; + shl.b32 %r571, %r570, 1; + add.s32 %r5023, %r552, %r571; + or.b32 %r572, %r567, %r432; + shl.b32 %r573, %r572, 1; + add.s32 %r5028, %r552, %r573; + or.b32 %r574, %r567, %r437; + shl.b32 %r575, %r574, 1; + add.s32 %r5033, %r552, %r575; + or.b32 %r576, %r10, 8; + xor.b32 %r577, %r576, %r19; + shl.b32 %r578, %r577, 4; + shl.b32 %r579, %r567, 1; + or.b32 %r580, %r578, %r579; + add.s32 %r5038, %r552, %r580; + or.b32 %r581, %r10, 10; + xor.b32 %r582, %r581, %r19; + shl.b32 %r583, %r582, 4; + or.b32 %r584, %r583, %r579; + add.s32 %r5043, %r552, %r584; + or.b32 %r585, %r10, 12; + xor.b32 %r586, %r585, %r19; + shl.b32 %r587, %r586, 4; + or.b32 %r588, %r587, %r579; + add.s32 %r5048, %r552, %r588; + or.b32 %r589, %r10, 14; + xor.b32 %r590, %r589, %r19; + shl.b32 %r591, %r590, 4; + or.b32 %r592, %r591, %r579; + add.s32 %r5053, %r552, %r592; + add.s32 %r5058, %r5018, 16384; + add.s32 %r5063, %r5023, 16384; + add.s32 %r5068, %r5028, 16384; + add.s32 %r5073, %r5033, 16384; + add.s32 %r5078, %r5038, 16384; + add.s32 %r5083, %r5043, 16384; + add.s32 %r5088, %r5048, 16384; + add.s32 %r5093, %r5053, 16384; + add.s32 %r5098, %r407, %r547; + add.s32 %r5103, %r5098, 2048; + add.s32 %r5108, %r5098, 4096; + add.s32 %r5113, %r5098, 6144; + add.s32 %r5118, %r5098, 8192; + add.s32 %r5123, %r5098, 10240; + add.s32 %r5128, %r5098, 12288; + add.s32 %r5133, %r5098, 14336; + add.s32 %r5138, %r407, %r551; + add.s32 %r5143, %r5138, 2048; + add.s32 %r5148, %r5138, 4096; + add.s32 %r5153, %r5138, 6144; + add.s32 %r5158, %r5138, 8192; + add.s32 %r5163, %r5138, 10240; + add.s32 %r5168, %r5138, 12288; + add.s32 %r5173, %r5138, 14336; + mad.lo.s32 %r593, %r556, 72, %r559; + shl.b32 %r594, %r593, 1; + add.s32 %r281, %r407, %r594; + add.s32 %r282, %r281, 1152; + mad.lo.s32 %r595, %r5, 72, %r20; + shl.b32 %r596, %r595, 1; + add.s32 %r283, %r407, %r596; + shl.b64 %rd9, %rd1, 1; + mad.lo.s32 %r6174, %r378, %r8, %r20; + mul.wide.s32 %rd10, %r23, 2; + mad.lo.s32 %r6173, %r378, %r7, %r20; + mad.lo.s32 %r6172, %r378, %r6, %r20; + mul.wide.s32 %rd11, %r23, 4; + mad.lo.s32 %r6171, %r378, %r14, %r21; + mad.lo.s32 %r6170, %r378, %r13, %r21; + mad.lo.s32 %r6169, %r378, %r12, %r21; + mov.u32 %r6175, 0; + mov.pred %p102, -1; + mov.f32 %f227, 0f00000000; + bra.uni LBB0_2; +LBB0_5: + add.s64 %rd113, %rd6, %rd129; + shl.b64 %rd122, %rd26, 1; + add.s64 %rd114, %rd6, %rd122; + shl.b64 %rd123, %rd27, 1; + add.s64 %rd115, %rd6, %rd123; + shl.b64 %rd124, %rd28, 1; + add.s64 %rd116, %rd6, %rd124; + cvt.rn.f16.f32 %h129, %f909; + cvt.rn.f16.f32 %h130, %f910; + cvt.rn.f16.f32 %h131, %f911; + cvt.rn.f16.f32 %h132, %f912; + cvt.rn.f16.f32 %h133, %f913; + cvt.rn.f16.f32 %h134, %f914; + cvt.rn.f16.f32 %h135, %f915; + cvt.rn.f16.f32 %h136, %f916; + cvt.rn.f16.f32 %h137, %f917; + cvt.rn.f16.f32 %h138, %f918; + cvt.rn.f16.f32 %h139, %f919; + cvt.rn.f16.f32 %h140, %f920; + cvt.rn.f16.f32 %h141, %f921; + cvt.rn.f16.f32 %h142, %f922; + cvt.rn.f16.f32 %h143, %f923; + cvt.rn.f16.f32 %h144, %f924; + cvt.rn.f16.f32 %h145, %f925; + cvt.rn.f16.f32 %h146, %f926; + cvt.rn.f16.f32 %h147, %f927; + cvt.rn.f16.f32 %h148, %f928; + cvt.rn.f16.f32 %h149, %f929; + cvt.rn.f16.f32 %h150, %f930; + cvt.rn.f16.f32 %h151, %f931; + cvt.rn.f16.f32 %h152, %f932; + cvt.rn.f16.f32 %h153, %f933; + cvt.rn.f16.f32 %h154, %f934; + cvt.rn.f16.f32 %h155, %f935; + cvt.rn.f16.f32 %h156, %f936; + cvt.rn.f16.f32 %h157, %f937; + cvt.rn.f16.f32 %h158, %f938; + cvt.rn.f16.f32 %h159, %f939; + cvt.rn.f16.f32 %h160, %f940; + st.shared.v2.b16 [%r281], {%h129, %h130}; + st.shared.v2.b16 [%r282], {%h131, %h132}; + st.shared.v2.b16 [%r281+32], {%h133, %h134}; + st.shared.v2.b16 [%r282+32], {%h135, %h136}; + st.shared.v2.b16 [%r281+64], {%h137, %h138}; + st.shared.v2.b16 [%r282+64], {%h139, %h140}; + st.shared.v2.b16 [%r281+96], {%h141, %h142}; + st.shared.v2.b16 [%r282+96], {%h143, %h144}; + bar.sync 0; + ld.shared.v4.u32 {%r6137, %r6138, %r6139, %r6140}, [%r283]; + ld.shared.v4.u32 {%r6141, %r6142, %r6143, %r6144}, [%r283+4608]; + bar.sync 0; + st.shared.v2.b16 [%r281], {%h145, %h146}; + st.shared.v2.b16 [%r282], {%h147, %h148}; + st.shared.v2.b16 [%r281+32], {%h149, %h150}; + st.shared.v2.b16 [%r282+32], {%h151, %h152}; + st.shared.v2.b16 [%r281+64], {%h153, %h154}; + st.shared.v2.b16 [%r282+64], {%h155, %h156}; + st.shared.v2.b16 [%r281+96], {%h157, %h158}; + st.shared.v2.b16 [%r282+96], {%h159, %h160}; + bar.sync 0; + ld.shared.v4.u32 {%r6145, %r6146, %r6147, %r6148}, [%r283]; + ld.shared.v4.u32 {%r6149, %r6150, %r6151, %r6152}, [%r283+4608]; + @%p102 st.global.v4.b32 [ %rd113 + 0 ], { %r6137, %r6138, %r6139, %r6140 }; + @%p102 st.global.v4.b32 [ %rd114 + 0 ], { %r6141, %r6142, %r6143, %r6144 }; + @%p102 st.global.v4.b32 [ %rd115 + 0 ], { %r6145, %r6146, %r6147, %r6148 }; + @%p102 st.global.v4.b32 [ %rd116 + 0 ], { %r6149, %r6150, %r6151, %r6152 }; + shl.b64 %rd125, %rd21, 1; + add.s64 %rd117, %rd5, %rd125; + shl.b64 %rd126, %rd22, 1; + add.s64 %rd118, %rd5, %rd126; + shl.b64 %rd127, %rd23, 1; + add.s64 %rd119, %rd5, %rd127; + shl.b64 %rd128, %rd24, 1; + add.s64 %rd120, %rd5, %rd128; + cvt.rn.f16.f32 %h161, %f941; + cvt.rn.f16.f32 %h162, %f942; + cvt.rn.f16.f32 %h163, %f943; + cvt.rn.f16.f32 %h164, %f944; + cvt.rn.f16.f32 %h165, %f945; + cvt.rn.f16.f32 %h166, %f946; + cvt.rn.f16.f32 %h167, %f947; + cvt.rn.f16.f32 %h168, %f948; + cvt.rn.f16.f32 %h169, %f949; + cvt.rn.f16.f32 %h170, %f950; + cvt.rn.f16.f32 %h171, %f951; + cvt.rn.f16.f32 %h172, %f952; + cvt.rn.f16.f32 %h173, %f953; + cvt.rn.f16.f32 %h174, %f954; + cvt.rn.f16.f32 %h175, %f955; + cvt.rn.f16.f32 %h176, %f956; + cvt.rn.f16.f32 %h177, %f957; + cvt.rn.f16.f32 %h178, %f958; + cvt.rn.f16.f32 %h179, %f959; + cvt.rn.f16.f32 %h180, %f960; + cvt.rn.f16.f32 %h181, %f961; + cvt.rn.f16.f32 %h182, %f962; + cvt.rn.f16.f32 %h183, %f963; + cvt.rn.f16.f32 %h184, %f964; + cvt.rn.f16.f32 %h185, %f965; + cvt.rn.f16.f32 %h186, %f966; + cvt.rn.f16.f32 %h187, %f967; + cvt.rn.f16.f32 %h188, %f968; + cvt.rn.f16.f32 %h189, %f969; + cvt.rn.f16.f32 %h190, %f970; + cvt.rn.f16.f32 %h191, %f971; + cvt.rn.f16.f32 %h192, %f972; + bar.sync 0; + st.shared.v2.b16 [%r281], {%h161, %h162}; + st.shared.v2.b16 [%r282], {%h163, %h164}; + st.shared.v2.b16 [%r281+32], {%h165, %h166}; + st.shared.v2.b16 [%r282+32], {%h167, %h168}; + st.shared.v2.b16 [%r281+64], {%h169, %h170}; + st.shared.v2.b16 [%r282+64], {%h171, %h172}; + st.shared.v2.b16 [%r281+96], {%h173, %h174}; + st.shared.v2.b16 [%r282+96], {%h175, %h176}; + bar.sync 0; + ld.shared.v4.u32 {%r6153, %r6154, %r6155, %r6156}, [%r283]; + ld.shared.v4.u32 {%r6157, %r6158, %r6159, %r6160}, [%r283+4608]; + bar.sync 0; + st.shared.v2.b16 [%r281], {%h177, %h178}; + st.shared.v2.b16 [%r282], {%h179, %h180}; + st.shared.v2.b16 [%r281+32], {%h181, %h182}; + st.shared.v2.b16 [%r282+32], {%h183, %h184}; + st.shared.v2.b16 [%r281+64], {%h185, %h186}; + st.shared.v2.b16 [%r282+64], {%h187, %h188}; + st.shared.v2.b16 [%r281+96], {%h189, %h190}; + st.shared.v2.b16 [%r282+96], {%h191, %h192}; + bar.sync 0; + ld.shared.v4.u32 {%r6161, %r6162, %r6163, %r6164}, [%r283]; + ld.shared.v4.u32 {%r6165, %r6166, %r6167, %r6168}, [%r283+4608]; + @%p102 st.global.v4.b32 [ %rd117 + 0 ], { %r6153, %r6154, %r6155, %r6156 }; + @%p102 st.global.v4.b32 [ %rd118 + 0 ], { %r6157, %r6158, %r6159, %r6160 }; + @%p102 st.global.v4.b32 [ %rd119 + 0 ], { %r6161, %r6162, %r6163, %r6164 }; + @%p102 st.global.v4.b32 [ %rd120 + 0 ], { %r6165, %r6166, %r6167, %r6168 }; + add.s32 %r6175, %r6175, 1; + add.s32 %r6174, %r6174, %r23; + add.s32 %r6173, %r6173, %r23; + add.s32 %r6172, %r6172, %r23; + add.s32 %r6171, %r6171, %r23; + add.s32 %r6170, %r6170, %r23; + add.s32 %r6169, %r6169, %r23; + setp.lt.s32 %p110, %r6175, %r380; + @%p110 bra LBB0_2; + bra.uni LBB0_6; LBB0_2: - @%p19 ld.global.v4.b32 { %r3499, %r3500, %r3501, %r3502 }, [ %rd46 + 0 ]; - mov.b32 %hh257, %r3499; - mov.b32 %hh258, %r3500; - mov.b32 %hh259, %r3501; - mov.b32 %hh260, %r3502; - @%p19 ld.global.v4.b32 { %r3503, %r3504, %r3505, %r3506 }, [ %rd47 + 0 ]; - mov.b32 %hh261, %r3503; - mov.b32 %hh262, %r3504; - mov.b32 %hh263, %r3505; - mov.b32 %hh264, %r3506; - @%p19 ld.global.v4.b32 { %r3507, %r3508, %r3509, %r3510 }, [ %rd48 + 0 ]; - mov.b32 %hh265, %r3507; - mov.b32 %hh266, %r3508; - mov.b32 %hh267, %r3509; - mov.b32 %hh268, %r3510; - @%p19 ld.global.v4.b32 { %r3511, %r3512, %r3513, %r3514 }, [ %rd49 + 0 ]; - mov.b32 %hh269, %r3511; - mov.b32 %hh270, %r3512; - mov.b32 %hh271, %r3513; - mov.b32 %hh272, %r3514; + shl.b32 %r6176, %r6175, 7; + or.b32 %r629, %r6176, %r5; + add.s32 %r630, %r6176, %r6; + add.s32 %r631, %r6176, %r7; + add.s32 %r632, %r6176, %r8; + mad.lo.s32 %r633, %r629, %r379, %r20; + mad.lo.s32 %r634, %r630, %r379, %r20; + mad.lo.s32 %r635, %r631, %r379, %r20; + mad.lo.s32 %r636, %r632, %r379, %r20; + cvt.s64.s32 %rd21, %r633; + mul.wide.s32 %rd77, %r633, 2; + add.s64 %rd66, %rd2, %rd77; + cvt.s64.s32 %rd22, %r634; + mul.wide.s32 %rd78, %r634, 2; + add.s64 %rd67, %rd2, %rd78; + cvt.s64.s32 %rd23, %r635; + mul.wide.s32 %rd79, %r635, 2; + add.s64 %rd68, %rd2, %rd79; + cvt.s64.s32 %rd24, %r636; + mul.wide.s32 %rd80, %r636, 2; + add.s64 %rd69, %rd2, %rd80; + @%p102 ld.global.v4.b32 { %r641, %r642, %r643, %r644 }, [ %rd66 + 0 ]; + mov.b32 %hh1, %r641; + mov.b32 %hh2, %r642; + mov.b32 %hh3, %r643; + mov.b32 %hh4, %r644; + @%p102 ld.global.v4.b32 { %r645, %r646, %r647, %r648 }, [ %rd67 + 0 ]; + mov.b32 %hh5, %r645; + mov.b32 %hh6, %r646; + mov.b32 %hh7, %r647; + mov.b32 %hh8, %r648; + @%p102 ld.global.v4.b32 { %r649, %r650, %r651, %r652 }, [ %rd68 + 0 ]; + mov.b32 %hh9, %r649; + mov.b32 %hh10, %r650; + mov.b32 %hh11, %r651; + mov.b32 %hh12, %r652; + @%p102 ld.global.v4.b32 { %r653, %r654, %r655, %r656 }, [ %rd69 + 0 ]; + mov.b32 %hh13, %r653; + mov.b32 %hh14, %r654; + mov.b32 %hh15, %r655; + mov.b32 %hh16, %r656; + mad.lo.s32 %r637, %r629, %r378, %r20; + mad.lo.s32 %r638, %r630, %r378, %r20; + mad.lo.s32 %r639, %r631, %r378, %r20; + mad.lo.s32 %r640, %r632, %r378, %r20; + cvt.s64.s32 %rd25, %r637; + mul.wide.s32 %rd81, %r637, 2; + add.s64 %rd70, %rd3, %rd81; + cvt.s64.s32 %rd26, %r638; + mul.wide.s32 %rd82, %r638, 2; + add.s64 %rd71, %rd3, %rd82; + cvt.s64.s32 %rd27, %r639; + mul.wide.s32 %rd83, %r639, 2; + add.s64 %rd72, %rd3, %rd83; + cvt.s64.s32 %rd28, %r640; + mul.wide.s32 %rd84, %r640, 2; + add.s64 %rd73, %rd3, %rd84; + @%p102 ld.global.v4.b32 { %r657, %r658, %r659, %r660 }, [ %rd70 + 0 ]; + mov.b32 %hh17, %r657; + mov.b32 %hh18, %r658; + mov.b32 %hh19, %r659; + mov.b32 %hh20, %r660; + @%p102 ld.global.v4.b32 { %r661, %r662, %r663, %r664 }, [ %rd71 + 0 ]; + mov.b32 %hh21, %r661; + mov.b32 %hh22, %r662; + mov.b32 %hh23, %r663; + mov.b32 %hh24, %r664; + @%p102 ld.global.v4.b32 { %r665, %r666, %r667, %r668 }, [ %rd72 + 0 ]; + mov.b32 %hh25, %r665; + mov.b32 %hh26, %r666; + mov.b32 %hh27, %r667; + mov.b32 %hh28, %r668; + @%p102 ld.global.v4.b32 { %r669, %r670, %r671, %r672 }, [ %rd73 + 0 ]; + mov.b32 %hh29, %r669; + mov.b32 %hh30, %r670; + mov.b32 %hh31, %r671; + mov.b32 %hh32, %r672; bar.sync 0; - st.shared.v4.b32 [%r20], {%r3499, %r3500, %r3501, %r3502}; - st.shared.v4.b32 [%r21], {%r3503, %r3504, %r3505, %r3506}; - st.shared.v4.b32 [%r22], {%r3507, %r3508, %r3509, %r3510}; - st.shared.v4.b32 [%r23], {%r3511, %r3512, %r3513, %r3514}; + st.shared.v4.b32 [%r24], {%r641, %r642, %r643, %r644}; + st.shared.v4.b32 [%r25], {%r645, %r646, %r647, %r648}; + st.shared.v4.b32 [%r26], {%r649, %r650, %r651, %r652}; + st.shared.v4.b32 [%r27], {%r653, %r654, %r655, %r656}; bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r619, %r620, %r621, %r622 }, [ %r599 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r843, %r844, %r845, %r846 }, [ %r604 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1067, %r1068, %r1069, %r1070 }, [ %r609 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1291, %r1292, %r1293, %r1294 }, [ %r614 + 0 ]; - mov.u32 %r839, %r625; - mov.u32 %r840, %r625; - mov.u32 %r841, %r625; - mov.u32 %r842, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r619, %r620, %r621, %r622 }, { %r3591, %r3592 }, { %r839, %r840, %r841, %r842 }; - mov.u32 %r853, %r625; - mov.u32 %r854, %r625; - mov.u32 %r855, %r625; - mov.u32 %r856, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r619, %r620, %r621, %r622 }, { %r3593, %r3594 }, { %r853, %r854, %r855, %r856 }; - mov.u32 %r867, %r625; - mov.u32 %r868, %r625; - mov.u32 %r869, %r625; - mov.u32 %r870, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r619, %r620, %r621, %r622 }, { %r3595, %r3596 }, { %r867, %r868, %r869, %r870 }; - mov.u32 %r881, %r625; - mov.u32 %r882, %r625; - mov.u32 %r883, %r625; - mov.u32 %r884, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r619, %r620, %r621, %r622 }, { %r3597, %r3598 }, { %r881, %r882, %r883, %r884 }; - mov.u32 %r895, %r625; - mov.u32 %r896, %r625; - mov.u32 %r897, %r625; - mov.u32 %r898, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r619, %r620, %r621, %r622 }, { %r3599, %r3600 }, { %r895, %r896, %r897, %r898 }; - mov.u32 %r909, %r625; - mov.u32 %r910, %r625; - mov.u32 %r911, %r625; - mov.u32 %r912, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r619, %r620, %r621, %r622 }, { %r3601, %r3602 }, { %r909, %r910, %r911, %r912 }; - mov.u32 %r923, %r625; - mov.u32 %r924, %r625; - mov.u32 %r925, %r625; - mov.u32 %r926, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r619, %r620, %r621, %r622 }, { %r3603, %r3604 }, { %r923, %r924, %r925, %r926 }; - mov.u32 %r937, %r625; - mov.u32 %r938, %r625; - mov.u32 %r939, %r625; - mov.u32 %r940, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r619, %r620, %r621, %r622 }, { %r3605, %r3606 }, { %r937, %r938, %r939, %r940 }; - mov.u32 %r951, %r625; - mov.u32 %r952, %r625; - mov.u32 %r953, %r625; - mov.u32 %r954, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r619, %r620, %r621, %r622 }, { %r3607, %r3608 }, { %r951, %r952, %r953, %r954 }; - mov.u32 %r965, %r625; - mov.u32 %r966, %r625; - mov.u32 %r967, %r625; - mov.u32 %r968, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r619, %r620, %r621, %r622 }, { %r3609, %r3610 }, { %r965, %r966, %r967, %r968 }; - mov.u32 %r979, %r625; - mov.u32 %r980, %r625; - mov.u32 %r981, %r625; - mov.u32 %r982, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r619, %r620, %r621, %r622 }, { %r3611, %r3612 }, { %r979, %r980, %r981, %r982 }; - mov.u32 %r993, %r625; - mov.u32 %r994, %r625; - mov.u32 %r995, %r625; - mov.u32 %r996, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r619, %r620, %r621, %r622 }, { %r3613, %r3614 }, { %r993, %r994, %r995, %r996 }; - mov.u32 %r1007, %r625; - mov.u32 %r1008, %r625; - mov.u32 %r1009, %r625; - mov.u32 %r1010, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r619, %r620, %r621, %r622 }, { %r3615, %r3616 }, { %r1007, %r1008, %r1009, %r1010 }; - mov.u32 %r1021, %r625; - mov.u32 %r1022, %r625; - mov.u32 %r1023, %r625; - mov.u32 %r1024, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r619, %r620, %r621, %r622 }, { %r3617, %r3618 }, { %r1021, %r1022, %r1023, %r1024 }; - mov.u32 %r1035, %r625; - mov.u32 %r1036, %r625; - mov.u32 %r1037, %r625; - mov.u32 %r1038, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r619, %r620, %r621, %r622 }, { %r3619, %r3620 }, { %r1035, %r1036, %r1037, %r1038 }; - mov.u32 %r1049, %r625; - mov.u32 %r1050, %r625; - mov.u32 %r1051, %r625; - mov.u32 %r1052, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r619, %r620, %r621, %r622 }, { %r3621, %r3622 }, { %r1049, %r1050, %r1051, %r1052 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r843, %r844, %r845, %r846 }, { %r3623, %r3624 }, { %r839, %r840, %r841, %r842 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r843, %r844, %r845, %r846 }, { %r3625, %r3626 }, { %r853, %r854, %r855, %r856 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r843, %r844, %r845, %r846 }, { %r3627, %r3628 }, { %r867, %r868, %r869, %r870 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r843, %r844, %r845, %r846 }, { %r3629, %r3630 }, { %r881, %r882, %r883, %r884 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r843, %r844, %r845, %r846 }, { %r3631, %r3632 }, { %r895, %r896, %r897, %r898 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r843, %r844, %r845, %r846 }, { %r3633, %r3634 }, { %r909, %r910, %r911, %r912 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r843, %r844, %r845, %r846 }, { %r3635, %r3636 }, { %r923, %r924, %r925, %r926 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r843, %r844, %r845, %r846 }, { %r3637, %r3638 }, { %r937, %r938, %r939, %r940 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r843, %r844, %r845, %r846 }, { %r3639, %r3640 }, { %r951, %r952, %r953, %r954 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r843, %r844, %r845, %r846 }, { %r3641, %r3642 }, { %r965, %r966, %r967, %r968 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r843, %r844, %r845, %r846 }, { %r3643, %r3644 }, { %r979, %r980, %r981, %r982 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r843, %r844, %r845, %r846 }, { %r3645, %r3646 }, { %r993, %r994, %r995, %r996 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r843, %r844, %r845, %r846 }, { %r3647, %r3648 }, { %r1007, %r1008, %r1009, %r1010 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r843, %r844, %r845, %r846 }, { %r3649, %r3650 }, { %r1021, %r1022, %r1023, %r1024 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r843, %r844, %r845, %r846 }, { %r3651, %r3652 }, { %r1035, %r1036, %r1037, %r1038 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r843, %r844, %r845, %r846 }, { %r3653, %r3654 }, { %r1049, %r1050, %r1051, %r1052 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3655, %r3656 }, { %r839, %r840, %r841, %r842 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3657, %r3658 }, { %r853, %r854, %r855, %r856 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3659, %r3660 }, { %r867, %r868, %r869, %r870 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3661, %r3662 }, { %r881, %r882, %r883, %r884 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3663, %r3664 }, { %r895, %r896, %r897, %r898 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3665, %r3666 }, { %r909, %r910, %r911, %r912 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3667, %r3668 }, { %r923, %r924, %r925, %r926 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3669, %r3670 }, { %r937, %r938, %r939, %r940 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3671, %r3672 }, { %r951, %r952, %r953, %r954 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3673, %r3674 }, { %r965, %r966, %r967, %r968 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3675, %r3676 }, { %r979, %r980, %r981, %r982 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3677, %r3678 }, { %r993, %r994, %r995, %r996 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3679, %r3680 }, { %r1007, %r1008, %r1009, %r1010 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3681, %r3682 }, { %r1021, %r1022, %r1023, %r1024 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3683, %r3684 }, { %r1035, %r1036, %r1037, %r1038 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3685, %r3686 }, { %r1049, %r1050, %r1051, %r1052 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3687, %r3688 }, { %r839, %r840, %r841, %r842 }; - mov.b32 %f162, %r842; - mov.b32 %f163, %r841; - mov.b32 %f164, %r840; - mov.b32 %f165, %r839; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3689, %r3690 }, { %r853, %r854, %r855, %r856 }; - mov.b32 %f166, %r856; - mov.b32 %f167, %r855; - mov.b32 %f168, %r854; - mov.b32 %f169, %r853; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3691, %r3692 }, { %r867, %r868, %r869, %r870 }; - mov.b32 %f170, %r870; - mov.b32 %f171, %r869; - mov.b32 %f172, %r868; - mov.b32 %f173, %r867; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3693, %r3694 }, { %r881, %r882, %r883, %r884 }; - mov.b32 %f174, %r884; - mov.b32 %f175, %r883; - mov.b32 %f176, %r882; - mov.b32 %f177, %r881; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3695, %r3696 }, { %r895, %r896, %r897, %r898 }; - mov.b32 %f178, %r898; - mov.b32 %f179, %r897; - mov.b32 %f180, %r896; - mov.b32 %f181, %r895; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3697, %r3698 }, { %r909, %r910, %r911, %r912 }; - mov.b32 %f182, %r912; - mov.b32 %f183, %r911; - mov.b32 %f184, %r910; - mov.b32 %f185, %r909; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3699, %r3700 }, { %r923, %r924, %r925, %r926 }; - mov.b32 %f186, %r926; - mov.b32 %f187, %r925; - mov.b32 %f188, %r924; - mov.b32 %f189, %r923; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3701, %r3702 }, { %r937, %r938, %r939, %r940 }; - mov.b32 %f190, %r940; - mov.b32 %f191, %r939; - mov.b32 %f192, %r938; - mov.b32 %f193, %r937; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3703, %r3704 }, { %r951, %r952, %r953, %r954 }; - mov.b32 %f194, %r954; - mov.b32 %f195, %r953; - mov.b32 %f196, %r952; - mov.b32 %f197, %r951; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3705, %r3706 }, { %r965, %r966, %r967, %r968 }; - mov.b32 %f198, %r968; - mov.b32 %f199, %r967; - mov.b32 %f200, %r966; - mov.b32 %f201, %r965; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3707, %r3708 }, { %r979, %r980, %r981, %r982 }; - mov.b32 %f202, %r982; - mov.b32 %f203, %r981; - mov.b32 %f204, %r980; - mov.b32 %f205, %r979; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3709, %r3710 }, { %r993, %r994, %r995, %r996 }; - mov.b32 %f206, %r996; - mov.b32 %f207, %r995; - mov.b32 %f208, %r994; - mov.b32 %f209, %r993; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3711, %r3712 }, { %r1007, %r1008, %r1009, %r1010 }; - mov.b32 %f210, %r1010; - mov.b32 %f211, %r1009; - mov.b32 %f212, %r1008; - mov.b32 %f213, %r1007; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3713, %r3714 }, { %r1021, %r1022, %r1023, %r1024 }; - mov.b32 %f214, %r1024; - mov.b32 %f215, %r1023; - mov.b32 %f216, %r1022; - mov.b32 %f217, %r1021; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3715, %r3716 }, { %r1035, %r1036, %r1037, %r1038 }; - mov.b32 %f218, %r1038; - mov.b32 %f219, %r1037; - mov.b32 %f220, %r1036; - mov.b32 %f221, %r1035; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3717, %r3718 }, { %r1049, %r1050, %r1051, %r1052 }; - mov.b32 %f222, %r1052; - mov.b32 %f223, %r1051; - mov.b32 %f224, %r1050; - mov.b32 %f225, %r1049; - @%p19 ld.global.v4.b32 { %r3531, %r3532, %r3533, %r3534 }, [ %rd50 + 0 ]; - mov.b32 %hh273, %r3531; - mov.b32 %hh274, %r3532; - mov.b32 %hh275, %r3533; - mov.b32 %hh276, %r3534; - @%p19 ld.global.v4.b32 { %r3535, %r3536, %r3537, %r3538 }, [ %rd51 + 0 ]; - mov.b32 %hh277, %r3535; - mov.b32 %hh278, %r3536; - mov.b32 %hh279, %r3537; - mov.b32 %hh280, %r3538; - @%p19 ld.global.v4.b32 { %r3539, %r3540, %r3541, %r3542 }, [ %rd52 + 0 ]; - mov.b32 %hh281, %r3539; - mov.b32 %hh282, %r3540; - mov.b32 %hh283, %r3541; - mov.b32 %hh284, %r3542; - @%p19 ld.global.v4.b32 { %r3543, %r3544, %r3545, %r3546 }, [ %rd53 + 0 ]; - mov.b32 %hh285, %r3543; - mov.b32 %hh286, %r3544; - mov.b32 %hh287, %r3545; - mov.b32 %hh288, %r3546; - cvt.rn.f16.f32 %h65, %f165; - cvt.rn.f16.f32 %h66, %f164; - cvt.rn.f16.f32 %h67, %f163; - cvt.rn.f16.f32 %h68, %f162; - cvt.rn.f16.f32 %h69, %f169; - cvt.rn.f16.f32 %h70, %f168; - cvt.rn.f16.f32 %h71, %f167; - cvt.rn.f16.f32 %h72, %f166; - cvt.rn.f16.f32 %h73, %f173; - cvt.rn.f16.f32 %h74, %f172; - cvt.rn.f16.f32 %h75, %f171; - cvt.rn.f16.f32 %h76, %f170; - cvt.rn.f16.f32 %h77, %f177; - cvt.rn.f16.f32 %h78, %f176; - cvt.rn.f16.f32 %h79, %f175; - cvt.rn.f16.f32 %h80, %f174; - cvt.rn.f16.f32 %h81, %f181; - cvt.rn.f16.f32 %h82, %f180; - cvt.rn.f16.f32 %h83, %f179; - cvt.rn.f16.f32 %h84, %f178; - cvt.rn.f16.f32 %h85, %f185; - cvt.rn.f16.f32 %h86, %f184; - cvt.rn.f16.f32 %h87, %f183; - cvt.rn.f16.f32 %h88, %f182; - cvt.rn.f16.f32 %h89, %f189; - cvt.rn.f16.f32 %h90, %f188; - cvt.rn.f16.f32 %h91, %f187; - cvt.rn.f16.f32 %h92, %f186; - cvt.rn.f16.f32 %h93, %f193; - cvt.rn.f16.f32 %h94, %f192; - cvt.rn.f16.f32 %h95, %f191; - cvt.rn.f16.f32 %h96, %f190; - cvt.rn.f16.f32 %h97, %f197; - cvt.rn.f16.f32 %h98, %f196; - cvt.rn.f16.f32 %h99, %f195; - cvt.rn.f16.f32 %h100, %f194; - cvt.rn.f16.f32 %h101, %f201; - cvt.rn.f16.f32 %h102, %f200; - cvt.rn.f16.f32 %h103, %f199; - cvt.rn.f16.f32 %h104, %f198; - cvt.rn.f16.f32 %h105, %f205; - cvt.rn.f16.f32 %h106, %f204; - cvt.rn.f16.f32 %h107, %f203; - cvt.rn.f16.f32 %h108, %f202; - cvt.rn.f16.f32 %h109, %f209; - cvt.rn.f16.f32 %h110, %f208; - cvt.rn.f16.f32 %h111, %f207; - cvt.rn.f16.f32 %h112, %f206; - cvt.rn.f16.f32 %h113, %f213; - cvt.rn.f16.f32 %h114, %f212; - cvt.rn.f16.f32 %h115, %f211; - cvt.rn.f16.f32 %h116, %f210; - cvt.rn.f16.f32 %h117, %f217; - cvt.rn.f16.f32 %h118, %f216; - cvt.rn.f16.f32 %h119, %f215; - cvt.rn.f16.f32 %h120, %f214; - cvt.rn.f16.f32 %h121, %f221; - cvt.rn.f16.f32 %h122, %f220; - cvt.rn.f16.f32 %h123, %f219; - cvt.rn.f16.f32 %h124, %f218; - cvt.rn.f16.f32 %h125, %f225; - cvt.rn.f16.f32 %h126, %f224; - cvt.rn.f16.f32 %h127, %f223; - cvt.rn.f16.f32 %h128, %f222; + st.shared.v4.b32 [%r28], {%r657, %r658, %r659, %r660}; + st.shared.v4.b32 [%r29], {%r661, %r662, %r663, %r664}; + st.shared.v4.b32 [%r30], {%r665, %r666, %r667, %r668}; + st.shared.v4.b32 [%r31], {%r669, %r670, %r671, %r672}; bar.sync 0; - st.shared.b16 [%r28], %h65; - st.shared.b16 [%r28+256], %h66; - st.shared.b16 [%r28+16], %h67; - st.shared.b16 [%r30+16], %h68; - st.shared.b16 [%r28+2048], %h69; - st.shared.b16 [%r28+2304], %h70; - st.shared.b16 [%r31+16], %h71; - st.shared.b16 [%r32+16], %h72; - st.shared.b16 [%r28+4096], %h73; - st.shared.b16 [%r28+4352], %h74; - st.shared.b16 [%r33+16], %h75; - st.shared.b16 [%r34+16], %h76; - st.shared.b16 [%r28+6144], %h77; - st.shared.b16 [%r28+6400], %h78; - st.shared.b16 [%r35+16], %h79; - st.shared.b16 [%r36+16], %h80; - st.shared.b16 [%r28+8192], %h81; - st.shared.b16 [%r28+8448], %h82; - st.shared.b16 [%r37+16], %h83; - st.shared.b16 [%r38+16], %h84; - st.shared.b16 [%r28+10240], %h85; - st.shared.b16 [%r28+10496], %h86; - st.shared.b16 [%r39+16], %h87; - st.shared.b16 [%r40+16], %h88; - st.shared.b16 [%r28+12288], %h89; - st.shared.b16 [%r28+12544], %h90; - st.shared.b16 [%r41+16], %h91; - st.shared.b16 [%r42+16], %h92; - st.shared.b16 [%r28+14336], %h93; - st.shared.b16 [%r28+14592], %h94; - st.shared.b16 [%r43+16], %h95; - st.shared.b16 [%r44+16], %h96; - st.shared.b16 [%r28+16384], %h97; - st.shared.b16 [%r28+16640], %h98; - st.shared.b16 [%r45+16], %h99; - st.shared.b16 [%r46+16], %h100; - st.shared.b16 [%r28+18432], %h101; - st.shared.b16 [%r28+18688], %h102; - st.shared.b16 [%r47+16], %h103; - st.shared.b16 [%r48+16], %h104; - st.shared.b16 [%r28+20480], %h105; - st.shared.b16 [%r28+20736], %h106; - st.shared.b16 [%r49+16], %h107; - st.shared.b16 [%r50+16], %h108; - st.shared.b16 [%r28+22528], %h109; - st.shared.b16 [%r28+22784], %h110; - st.shared.b16 [%r51+16], %h111; - st.shared.b16 [%r52+16], %h112; - st.shared.b16 [%r28+24576], %h113; - st.shared.b16 [%r28+24832], %h114; - st.shared.b16 [%r53+16], %h115; - st.shared.b16 [%r54+16], %h116; - st.shared.b16 [%r28+26624], %h117; - st.shared.b16 [%r28+26880], %h118; - st.shared.b16 [%r55+16], %h119; - st.shared.b16 [%r56+16], %h120; - st.shared.b16 [%r28+28672], %h121; - st.shared.b16 [%r28+28928], %h122; - st.shared.b16 [%r57+16], %h123; - st.shared.b16 [%r58+16], %h124; - st.shared.b16 [%r28+30720], %h125; - st.shared.b16 [%r28+30976], %h126; - st.shared.b16 [%r59+16], %h127; - st.shared.b16 [%r60+16], %h128; + setp.ge.s32 %p10, %r6176, %r22; + shl.b64 %rd129, %rd25, 1; + mov.f32 %f909, %f227; + mov.f32 %f910, %f227; + mov.f32 %f911, %f227; + mov.f32 %f912, %f227; + mov.f32 %f913, %f227; + mov.f32 %f914, %f227; + mov.f32 %f915, %f227; + mov.f32 %f916, %f227; + mov.f32 %f917, %f227; + mov.f32 %f918, %f227; + mov.f32 %f919, %f227; + mov.f32 %f920, %f227; + mov.f32 %f921, %f227; + mov.f32 %f922, %f227; + mov.f32 %f923, %f227; + mov.f32 %f924, %f227; + mov.f32 %f925, %f227; + mov.f32 %f926, %f227; + mov.f32 %f927, %f227; + mov.f32 %f928, %f227; + mov.f32 %f929, %f227; + mov.f32 %f930, %f227; + mov.f32 %f931, %f227; + mov.f32 %f932, %f227; + mov.f32 %f933, %f227; + mov.f32 %f934, %f227; + mov.f32 %f935, %f227; + mov.f32 %f936, %f227; + mov.f32 %f937, %f227; + mov.f32 %f938, %f227; + mov.f32 %f939, %f227; + mov.f32 %f940, %f227; + mov.f32 %f941, %f227; + mov.f32 %f942, %f227; + mov.f32 %f943, %f227; + mov.f32 %f944, %f227; + mov.f32 %f945, %f227; + mov.f32 %f946, %f227; + mov.f32 %f947, %f227; + mov.f32 %f948, %f227; + mov.f32 %f949, %f227; + mov.f32 %f950, %f227; + mov.f32 %f951, %f227; + mov.f32 %f952, %f227; + mov.f32 %f953, %f227; + mov.f32 %f954, %f227; + mov.f32 %f955, %f227; + mov.f32 %f956, %f227; + mov.f32 %f957, %f227; + mov.f32 %f958, %f227; + mov.f32 %f959, %f227; + mov.f32 %f960, %f227; + mov.f32 %f961, %f227; + mov.f32 %f962, %f227; + mov.f32 %f963, %f227; + mov.f32 %f964, %f227; + mov.f32 %f965, %f227; + mov.f32 %f966, %f227; + mov.f32 %f967, %f227; + mov.f32 %f968, %f227; + mov.f32 %f969, %f227; + mov.f32 %f970, %f227; + mov.f32 %f971, %f227; + mov.f32 %f972, %f227; + @%p10 bra LBB0_5; + mul.wide.s32 %rd74, %r6174, 2; + add.s64 %rd138, %rd55, %rd74; + mul.wide.s32 %rd75, %r6173, 2; + add.s64 %rd137, %rd55, %rd75; + mul.wide.s32 %rd76, %r6172, 2; + add.s64 %rd136, %rd55, %rd76; + add.s64 %rd134, %rd54, %rd74; + add.s64 %rd133, %rd54, %rd75; + add.s64 %rd132, %rd54, %rd76; + mul.wide.s32 %rd18, %r6171, 4; + mul.wide.s32 %rd19, %r6170, 4; + mul.wide.s32 %rd20, %r6169, 4; + or.b32 %r300, %r6176, %r11; + or.b32 %r301, %r6176, %r9; + or.b32 %r302, %r301, 1; + or.b32 %r303, %r301, 8; + or.b32 %r304, %r301, 9; + or.b32 %r336, %r301, 64; + or.b32 %r335, %r301, 65; + or.b32 %r332, %r301, 72; + or.b32 %r331, %r301, 73; + or.b32 %r328, %r301, 80; + or.b32 %r327, %r301, 81; + or.b32 %r324, %r301, 88; + or.b32 %r323, %r301, 89; + or.b32 %r320, %r301, 96; + or.b32 %r319, %r301, 97; + or.b32 %r316, %r301, 104; + or.b32 %r315, %r301, 105; + or.b32 %r312, %r301, 112; + or.b32 %r311, %r301, 113; + or.b32 %r308, %r301, 120; + or.b32 %r307, %r301, 121; + or.b32 %r352, %r301, 32; + or.b32 %r351, %r301, 33; + or.b32 %r348, %r301, 40; + or.b32 %r347, %r301, 41; + or.b32 %r344, %r301, 48; + or.b32 %r343, %r301, 49; + or.b32 %r340, %r301, 56; + or.b32 %r339, %r301, 57; + or.b32 %r360, %r301, 16; + or.b32 %r359, %r301, 17; + or.b32 %r356, %r301, 24; + or.b32 %r355, %r301, 25; + add.s32 %r673, %r300, 112; + mul.lo.s32 %r674, %r673, %r378; + add.s32 %r675, %r674, %r21; + shl.b32 %r676, %r378, 4; + sub.s32 %r677, %r674, %r676; + add.s32 %r678, %r677, %r21; + sub.s32 %r679, %r677, %r676; + add.s32 %r680, %r679, %r21; + sub.s32 %r681, %r679, %r676; + add.s32 %r682, %r681, %r21; + mad.lo.s32 %r683, %r300, %r378, %r21; + add.s64 %rd135, %rd55, %rd129; + add.s64 %rd131, %rd54, %rd129; + mul.wide.s32 %rd31, %r675, 4; + mul.wide.s32 %rd32, %r678, 4; + mul.wide.s32 %rd33, %r680, 4; + mul.wide.s32 %rd34, %r682, 4; + mul.wide.s32 %rd35, %r683, 4; + mov.f32 %f259, 0f00000000; + mov.u64 %rd130, %rd4; + mov.f32 %f941, %f259; + mov.f32 %f942, %f259; + mov.f32 %f943, %f259; + mov.f32 %f944, %f259; + mov.f32 %f945, %f259; + mov.f32 %f946, %f259; + mov.f32 %f947, %f259; + mov.f32 %f948, %f259; + mov.f32 %f949, %f259; + mov.f32 %f950, %f259; + mov.f32 %f951, %f259; + mov.f32 %f952, %f259; + mov.f32 %f953, %f259; + mov.f32 %f954, %f259; + mov.f32 %f955, %f259; + mov.f32 %f956, %f259; + mov.f32 %f957, %f259; + mov.f32 %f958, %f259; + mov.f32 %f959, %f259; + mov.f32 %f960, %f259; + mov.f32 %f961, %f259; + mov.f32 %f962, %f259; + mov.f32 %f963, %f259; + mov.f32 %f964, %f259; + mov.f32 %f965, %f259; + mov.f32 %f966, %f259; + mov.f32 %f967, %f259; + mov.f32 %f968, %f259; + mov.f32 %f969, %f259; + mov.f32 %f970, %f259; + mov.f32 %f971, %f259; + mov.f32 %f972, %f259; + mov.f32 %f909, %f259; + mov.f32 %f910, %f259; + mov.f32 %f911, %f259; + mov.f32 %f912, %f259; + mov.f32 %f913, %f259; + mov.f32 %f914, %f259; + mov.f32 %f915, %f259; + mov.f32 %f916, %f259; + mov.f32 %f917, %f259; + mov.f32 %f918, %f259; + mov.f32 %f919, %f259; + mov.f32 %f920, %f259; + mov.f32 %f921, %f259; + mov.f32 %f922, %f259; + mov.f32 %f923, %f259; + mov.f32 %f924, %f259; + mov.f32 %f925, %f259; + mov.f32 %f926, %f259; + mov.f32 %f927, %f259; + mov.f32 %f928, %f259; + mov.f32 %f929, %f259; + mov.f32 %f930, %f259; + mov.f32 %f931, %f259; + mov.f32 %f932, %f259; + mov.f32 %f933, %f259; + mov.f32 %f934, %f259; + mov.f32 %f935, %f259; + mov.f32 %f936, %f259; + mov.f32 %f937, %f259; + mov.f32 %f938, %f259; + mov.f32 %f939, %f259; + mov.f32 %f940, %f259; +LBB0_4: + add.s64 %rd94, %rd138, %rd9; + add.s64 %rd93, %rd137, %rd9; + add.s64 %rd92, %rd136, %rd9; + add.s64 %rd91, %rd135, %rd9; + add.s64 %rd89, %rd134, %rd9; + add.s64 %rd88, %rd133, %rd9; + add.s64 %rd87, %rd132, %rd9; + add.s64 %rd86, %rd131, %rd9; + add.s64 %rd103, %rd130, %rd31; + add.s64 %rd102, %rd130, %rd32; + add.s64 %rd101, %rd130, %rd33; + add.s64 %rd100, %rd130, %rd34; + add.s64 %rd99, %rd130, %rd18; + add.s64 %rd98, %rd130, %rd19; + add.s64 %rd97, %rd130, %rd20; + add.s64 %rd96, %rd130, %rd35; + or.b32 %r6102, %r6176, %r3; + @%p102 ld.global.v4.b32 { %r6103, %r6104, %r6105, %r6106 }, [ %rd86 + 0 ]; + mov.b32 %hh33, %r6103; + mov.b32 %hh34, %r6104; + mov.b32 %hh35, %r6105; + mov.b32 %hh36, %r6106; + @%p102 ld.global.v4.b32 { %r6107, %r6108, %r6109, %r6110 }, [ %rd87 + 0 ]; + mov.b32 %hh37, %r6107; + mov.b32 %hh38, %r6108; + mov.b32 %hh39, %r6109; + mov.b32 %hh40, %r6110; + @%p102 ld.global.v4.b32 { %r6111, %r6112, %r6113, %r6114 }, [ %rd88 + 0 ]; + mov.b32 %hh41, %r6111; + mov.b32 %hh42, %r6112; + mov.b32 %hh43, %r6113; + mov.b32 %hh44, %r6114; + @%p102 ld.global.v4.b32 { %r6115, %r6116, %r6117, %r6118 }, [ %rd89 + 0 ]; + mov.b32 %hh45, %r6115; + mov.b32 %hh46, %r6116; + mov.b32 %hh47, %r6117; + mov.b32 %hh48, %r6118; bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1691, %r1692, %r1693, %r1694 }, [ %r1531 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1803, %r1804, %r1805, %r1806 }, [ %r1536 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1915, %r1916, %r1917, %r1918 }, [ %r1541 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2027, %r2028, %r2029, %r2030 }, [ %r1546 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2139, %r2140, %r2141, %r2142 }, [ %r1551 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2251, %r2252, %r2253, %r2254 }, [ %r1556 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2363, %r2364, %r2365, %r2366 }, [ %r1561 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2475, %r2476, %r2477, %r2478 }, [ %r1566 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1747, %r1748, %r1749, %r1750 }, [ %r1571 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1859, %r1860, %r1861, %r1862 }, [ %r1576 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1971, %r1972, %r1973, %r1974 }, [ %r1581 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2083, %r2084, %r2085, %r2086 }, [ %r1586 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2195, %r2196, %r2197, %r2198 }, [ %r1591 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2307, %r2308, %r2309, %r2310 }, [ %r1596 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2419, %r2420, %r2421, %r2422 }, [ %r1601 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2531, %r2532, %r2533, %r2534 }, [ %r1606 + 0 ]; + st.shared.v4.b32 [%r32], {%r6103, %r6104, %r6105, %r6106}; + st.shared.v4.b32 [%r33], {%r6107, %r6108, %r6109, %r6110}; + st.shared.v4.b32 [%r34], {%r6111, %r6112, %r6113, %r6114}; + st.shared.v4.b32 [%r35], {%r6115, %r6116, %r6117, %r6118}; bar.sync 0; - st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534}; - st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538}; - st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542}; - st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546}; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r884, %r885, %r886, %r887 }, [ %r704 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1108, %r1109, %r1110, %r1111 }, [ %r709 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1332, %r1333, %r1334, %r1335 }, [ %r714 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1556, %r1557, %r1558, %r1559 }, [ %r719 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r720, %r721, %r722, %r723 }, [ %r724 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r725, %r726, %r727, %r728 }, [ %r729 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r730, %r731, %r732, %r733 }, [ %r734 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r735, %r736, %r737, %r738 }, [ %r739 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r740, %r741, %r742, %r743 }, [ %r744 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r745, %r746, %r747, %r748 }, [ %r749 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r750, %r751, %r752, %r753 }, [ %r754 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r755, %r756, %r757, %r758 }, [ %r759 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r760, %r761, %r762, %r763 }, [ %r764 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r765, %r766, %r767, %r768 }, [ %r769 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r770, %r771, %r772, %r773 }, [ %r774 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r775, %r776, %r777, %r778 }, [ %r779 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r780, %r781, %r782, %r783 }, [ %r784 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r785, %r786, %r787, %r788 }, [ %r789 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r790, %r791, %r792, %r793 }, [ %r794 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r795, %r796, %r797, %r798 }, [ %r799 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r800, %r801, %r802, %r803 }, [ %r804 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r805, %r806, %r807, %r808 }, [ %r809 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r810, %r811, %r812, %r813 }, [ %r814 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r815, %r816, %r817, %r818 }, [ %r819 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r820, %r821, %r822, %r823 }, [ %r824 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r825, %r826, %r827, %r828 }, [ %r829 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r830, %r831, %r832, %r833 }, [ %r834 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r835, %r836, %r837, %r838 }, [ %r839 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r840, %r841, %r842, %r843 }, [ %r844 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r845, %r846, %r847, %r848 }, [ %r849 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r850, %r851, %r852, %r853 }, [ %r854 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r855, %r856, %r857, %r858 }, [ %r859 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r860, %r861, %r862, %r863 }, [ %r864 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r865, %r866, %r867, %r868 }, [ %r869 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r870, %r871, %r872, %r873 }, [ %r874 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r875, %r876, %r877, %r878 }, [ %r879 + 0 ]; + mov.u32 %r1317, 0; + mov.u32 %r1104, %r1317; + mov.u32 %r1105, %r1317; + mov.u32 %r1106, %r1317; + mov.u32 %r1107, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r884, %r885, %r886, %r887 }, { %r720, %r721 }, { %r1104, %r1105, %r1106, %r1107 }; + mov.u32 %r1118, %r1317; + mov.u32 %r1119, %r1317; + mov.u32 %r1120, %r1317; + mov.u32 %r1121, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r884, %r885, %r886, %r887 }, { %r722, %r723 }, { %r1118, %r1119, %r1120, %r1121 }; + mov.u32 %r1132, %r1317; + mov.u32 %r1133, %r1317; + mov.u32 %r1134, %r1317; + mov.u32 %r1135, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r884, %r885, %r886, %r887 }, { %r740, %r741 }, { %r1132, %r1133, %r1134, %r1135 }; + mov.u32 %r1146, %r1317; + mov.u32 %r1147, %r1317; + mov.u32 %r1148, %r1317; + mov.u32 %r1149, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r884, %r885, %r886, %r887 }, { %r742, %r743 }, { %r1146, %r1147, %r1148, %r1149 }; + mov.u32 %r1160, %r1317; + mov.u32 %r1161, %r1317; + mov.u32 %r1162, %r1317; + mov.u32 %r1163, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r884, %r885, %r886, %r887 }, { %r760, %r761 }, { %r1160, %r1161, %r1162, %r1163 }; + mov.u32 %r1174, %r1317; + mov.u32 %r1175, %r1317; + mov.u32 %r1176, %r1317; + mov.u32 %r1177, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r884, %r885, %r886, %r887 }, { %r762, %r763 }, { %r1174, %r1175, %r1176, %r1177 }; + mov.u32 %r1188, %r1317; + mov.u32 %r1189, %r1317; + mov.u32 %r1190, %r1317; + mov.u32 %r1191, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r884, %r885, %r886, %r887 }, { %r780, %r781 }, { %r1188, %r1189, %r1190, %r1191 }; + mov.u32 %r1202, %r1317; + mov.u32 %r1203, %r1317; + mov.u32 %r1204, %r1317; + mov.u32 %r1205, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r884, %r885, %r886, %r887 }, { %r782, %r783 }, { %r1202, %r1203, %r1204, %r1205 }; + mov.u32 %r1216, %r1317; + mov.u32 %r1217, %r1317; + mov.u32 %r1218, %r1317; + mov.u32 %r1219, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r884, %r885, %r886, %r887 }, { %r800, %r801 }, { %r1216, %r1217, %r1218, %r1219 }; + mov.u32 %r1230, %r1317; + mov.u32 %r1231, %r1317; + mov.u32 %r1232, %r1317; + mov.u32 %r1233, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r884, %r885, %r886, %r887 }, { %r802, %r803 }, { %r1230, %r1231, %r1232, %r1233 }; + mov.u32 %r1244, %r1317; + mov.u32 %r1245, %r1317; + mov.u32 %r1246, %r1317; + mov.u32 %r1247, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r884, %r885, %r886, %r887 }, { %r820, %r821 }, { %r1244, %r1245, %r1246, %r1247 }; + mov.u32 %r1258, %r1317; + mov.u32 %r1259, %r1317; + mov.u32 %r1260, %r1317; + mov.u32 %r1261, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r884, %r885, %r886, %r887 }, { %r822, %r823 }, { %r1258, %r1259, %r1260, %r1261 }; + mov.u32 %r1272, %r1317; + mov.u32 %r1273, %r1317; + mov.u32 %r1274, %r1317; + mov.u32 %r1275, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r884, %r885, %r886, %r887 }, { %r840, %r841 }, { %r1272, %r1273, %r1274, %r1275 }; + mov.u32 %r1286, %r1317; + mov.u32 %r1287, %r1317; + mov.u32 %r1288, %r1317; + mov.u32 %r1289, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r884, %r885, %r886, %r887 }, { %r842, %r843 }, { %r1286, %r1287, %r1288, %r1289 }; + mov.u32 %r1300, %r1317; + mov.u32 %r1301, %r1317; + mov.u32 %r1302, %r1317; + mov.u32 %r1303, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r884, %r885, %r886, %r887 }, { %r860, %r861 }, { %r1300, %r1301, %r1302, %r1303 }; + mov.u32 %r1314, %r1317; + mov.u32 %r1315, %r1317; + mov.u32 %r1316, %r1317; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r884, %r885, %r886, %r887 }, { %r862, %r863 }, { %r1314, %r1315, %r1316, %r1317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1108, %r1109, %r1110, %r1111 }, { %r725, %r726 }, { %r1104, %r1105, %r1106, %r1107 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1108, %r1109, %r1110, %r1111 }, { %r727, %r728 }, { %r1118, %r1119, %r1120, %r1121 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1108, %r1109, %r1110, %r1111 }, { %r745, %r746 }, { %r1132, %r1133, %r1134, %r1135 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1108, %r1109, %r1110, %r1111 }, { %r747, %r748 }, { %r1146, %r1147, %r1148, %r1149 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1108, %r1109, %r1110, %r1111 }, { %r765, %r766 }, { %r1160, %r1161, %r1162, %r1163 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1108, %r1109, %r1110, %r1111 }, { %r767, %r768 }, { %r1174, %r1175, %r1176, %r1177 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1108, %r1109, %r1110, %r1111 }, { %r785, %r786 }, { %r1188, %r1189, %r1190, %r1191 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1108, %r1109, %r1110, %r1111 }, { %r787, %r788 }, { %r1202, %r1203, %r1204, %r1205 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1108, %r1109, %r1110, %r1111 }, { %r805, %r806 }, { %r1216, %r1217, %r1218, %r1219 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1108, %r1109, %r1110, %r1111 }, { %r807, %r808 }, { %r1230, %r1231, %r1232, %r1233 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1108, %r1109, %r1110, %r1111 }, { %r825, %r826 }, { %r1244, %r1245, %r1246, %r1247 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1108, %r1109, %r1110, %r1111 }, { %r827, %r828 }, { %r1258, %r1259, %r1260, %r1261 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1108, %r1109, %r1110, %r1111 }, { %r845, %r846 }, { %r1272, %r1273, %r1274, %r1275 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1108, %r1109, %r1110, %r1111 }, { %r847, %r848 }, { %r1286, %r1287, %r1288, %r1289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1108, %r1109, %r1110, %r1111 }, { %r865, %r866 }, { %r1300, %r1301, %r1302, %r1303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1108, %r1109, %r1110, %r1111 }, { %r867, %r868 }, { %r1314, %r1315, %r1316, %r1317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1332, %r1333, %r1334, %r1335 }, { %r730, %r731 }, { %r1104, %r1105, %r1106, %r1107 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1332, %r1333, %r1334, %r1335 }, { %r732, %r733 }, { %r1118, %r1119, %r1120, %r1121 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1332, %r1333, %r1334, %r1335 }, { %r750, %r751 }, { %r1132, %r1133, %r1134, %r1135 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1332, %r1333, %r1334, %r1335 }, { %r752, %r753 }, { %r1146, %r1147, %r1148, %r1149 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1332, %r1333, %r1334, %r1335 }, { %r770, %r771 }, { %r1160, %r1161, %r1162, %r1163 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1332, %r1333, %r1334, %r1335 }, { %r772, %r773 }, { %r1174, %r1175, %r1176, %r1177 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1332, %r1333, %r1334, %r1335 }, { %r790, %r791 }, { %r1188, %r1189, %r1190, %r1191 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1332, %r1333, %r1334, %r1335 }, { %r792, %r793 }, { %r1202, %r1203, %r1204, %r1205 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1332, %r1333, %r1334, %r1335 }, { %r810, %r811 }, { %r1216, %r1217, %r1218, %r1219 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1332, %r1333, %r1334, %r1335 }, { %r812, %r813 }, { %r1230, %r1231, %r1232, %r1233 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1332, %r1333, %r1334, %r1335 }, { %r830, %r831 }, { %r1244, %r1245, %r1246, %r1247 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1332, %r1333, %r1334, %r1335 }, { %r832, %r833 }, { %r1258, %r1259, %r1260, %r1261 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1332, %r1333, %r1334, %r1335 }, { %r850, %r851 }, { %r1272, %r1273, %r1274, %r1275 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1332, %r1333, %r1334, %r1335 }, { %r852, %r853 }, { %r1286, %r1287, %r1288, %r1289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1332, %r1333, %r1334, %r1335 }, { %r870, %r871 }, { %r1300, %r1301, %r1302, %r1303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1332, %r1333, %r1334, %r1335 }, { %r872, %r873 }, { %r1314, %r1315, %r1316, %r1317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1556, %r1557, %r1558, %r1559 }, { %r735, %r736 }, { %r1104, %r1105, %r1106, %r1107 }; + mov.b32 %f388, %r1107; + mov.b32 %f389, %r1106; + mov.b32 %f390, %r1105; + mov.b32 %f391, %r1104; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1556, %r1557, %r1558, %r1559 }, { %r737, %r738 }, { %r1118, %r1119, %r1120, %r1121 }; + mov.b32 %f392, %r1121; + mov.b32 %f393, %r1120; + mov.b32 %f394, %r1119; + mov.b32 %f395, %r1118; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1556, %r1557, %r1558, %r1559 }, { %r755, %r756 }, { %r1132, %r1133, %r1134, %r1135 }; + mov.b32 %f396, %r1135; + mov.b32 %f397, %r1134; + mov.b32 %f398, %r1133; + mov.b32 %f399, %r1132; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1556, %r1557, %r1558, %r1559 }, { %r757, %r758 }, { %r1146, %r1147, %r1148, %r1149 }; + mov.b32 %f400, %r1149; + mov.b32 %f401, %r1148; + mov.b32 %f402, %r1147; + mov.b32 %f403, %r1146; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1556, %r1557, %r1558, %r1559 }, { %r775, %r776 }, { %r1160, %r1161, %r1162, %r1163 }; + mov.b32 %f404, %r1163; + mov.b32 %f405, %r1162; + mov.b32 %f406, %r1161; + mov.b32 %f407, %r1160; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1556, %r1557, %r1558, %r1559 }, { %r777, %r778 }, { %r1174, %r1175, %r1176, %r1177 }; + mov.b32 %f408, %r1177; + mov.b32 %f409, %r1176; + mov.b32 %f410, %r1175; + mov.b32 %f411, %r1174; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1556, %r1557, %r1558, %r1559 }, { %r795, %r796 }, { %r1188, %r1189, %r1190, %r1191 }; + mov.b32 %f412, %r1191; + mov.b32 %f413, %r1190; + mov.b32 %f414, %r1189; + mov.b32 %f415, %r1188; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1556, %r1557, %r1558, %r1559 }, { %r797, %r798 }, { %r1202, %r1203, %r1204, %r1205 }; + mov.b32 %f416, %r1205; + mov.b32 %f417, %r1204; + mov.b32 %f418, %r1203; + mov.b32 %f419, %r1202; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1556, %r1557, %r1558, %r1559 }, { %r815, %r816 }, { %r1216, %r1217, %r1218, %r1219 }; + mov.b32 %f420, %r1219; + mov.b32 %f421, %r1218; + mov.b32 %f422, %r1217; + mov.b32 %f423, %r1216; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1556, %r1557, %r1558, %r1559 }, { %r817, %r818 }, { %r1230, %r1231, %r1232, %r1233 }; + mov.b32 %f424, %r1233; + mov.b32 %f425, %r1232; + mov.b32 %f426, %r1231; + mov.b32 %f427, %r1230; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1556, %r1557, %r1558, %r1559 }, { %r835, %r836 }, { %r1244, %r1245, %r1246, %r1247 }; + mov.b32 %f428, %r1247; + mov.b32 %f429, %r1246; + mov.b32 %f430, %r1245; + mov.b32 %f431, %r1244; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1556, %r1557, %r1558, %r1559 }, { %r837, %r838 }, { %r1258, %r1259, %r1260, %r1261 }; + mov.b32 %f432, %r1261; + mov.b32 %f433, %r1260; + mov.b32 %f434, %r1259; + mov.b32 %f435, %r1258; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1556, %r1557, %r1558, %r1559 }, { %r855, %r856 }, { %r1272, %r1273, %r1274, %r1275 }; + mov.b32 %f436, %r1275; + mov.b32 %f437, %r1274; + mov.b32 %f438, %r1273; + mov.b32 %f439, %r1272; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1556, %r1557, %r1558, %r1559 }, { %r857, %r858 }, { %r1286, %r1287, %r1288, %r1289 }; + mov.b32 %f440, %r1289; + mov.b32 %f441, %r1288; + mov.b32 %f442, %r1287; + mov.b32 %f443, %r1286; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1556, %r1557, %r1558, %r1559 }, { %r875, %r876 }, { %r1300, %r1301, %r1302, %r1303 }; + mov.b32 %f444, %r1303; + mov.b32 %f445, %r1302; + mov.b32 %f446, %r1301; + mov.b32 %f447, %r1300; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1556, %r1557, %r1558, %r1559 }, { %r877, %r878 }, { %r1314, %r1315, %r1316, %r1317 }; + mov.b32 %f448, %r1317; + mov.b32 %f449, %r1316; + mov.b32 %f450, %r1315; + mov.b32 %f451, %r1314; + or.b32 %r6119, %r6176, %r17; + or.b32 %r6120, %r6176, %r18; + setp.lt.s32 %p37, %r6120, %r307; + setp.lt.s32 %p38, %r6120, %r308; + setp.lt.s32 %p39, %r6119, %r307; + setp.lt.s32 %p40, %r6119, %r308; + setp.lt.s32 %p41, %r6120, %r311; + setp.lt.s32 %p42, %r6120, %r312; + setp.lt.s32 %p43, %r6119, %r311; + setp.lt.s32 %p44, %r6119, %r312; + setp.lt.s32 %p45, %r6120, %r315; + setp.lt.s32 %p46, %r6120, %r316; + setp.lt.s32 %p47, %r6119, %r315; + setp.lt.s32 %p48, %r6119, %r316; + setp.lt.s32 %p49, %r6120, %r319; + setp.lt.s32 %p50, %r6120, %r320; + setp.lt.s32 %p51, %r6119, %r319; + setp.lt.s32 %p52, %r6119, %r320; + setp.lt.s32 %p53, %r6120, %r323; + setp.lt.s32 %p54, %r6120, %r324; + setp.lt.s32 %p55, %r6119, %r323; + setp.lt.s32 %p56, %r6119, %r324; + setp.lt.s32 %p57, %r6120, %r327; + setp.lt.s32 %p58, %r6120, %r328; + setp.lt.s32 %p59, %r6119, %r327; + setp.lt.s32 %p60, %r6119, %r328; + setp.lt.s32 %p61, %r6120, %r331; + setp.lt.s32 %p62, %r6120, %r332; + setp.lt.s32 %p63, %r6119, %r331; + setp.lt.s32 %p64, %r6119, %r332; + setp.lt.s32 %p65, %r6120, %r335; + setp.lt.s32 %p66, %r6120, %r336; + setp.lt.s32 %p67, %r6119, %r335; + setp.lt.s32 %p68, %r6119, %r336; + setp.lt.s32 %p69, %r6120, %r339; + setp.lt.s32 %p70, %r6120, %r340; + setp.lt.s32 %p71, %r6119, %r339; + setp.lt.s32 %p72, %r6119, %r340; + setp.lt.s32 %p73, %r6120, %r343; + setp.lt.s32 %p74, %r6120, %r344; + setp.lt.s32 %p75, %r6119, %r343; + setp.lt.s32 %p76, %r6119, %r344; + setp.lt.s32 %p77, %r6120, %r347; + setp.lt.s32 %p78, %r6120, %r348; + setp.lt.s32 %p79, %r6119, %r347; + setp.lt.s32 %p80, %r6119, %r348; + setp.lt.s32 %p81, %r6120, %r351; + setp.lt.s32 %p82, %r6120, %r352; + setp.lt.s32 %p83, %r6119, %r351; + setp.lt.s32 %p84, %r6119, %r352; + setp.lt.s32 %p85, %r6120, %r355; + setp.lt.s32 %p86, %r6120, %r356; + setp.lt.s32 %p87, %r6119, %r355; + setp.lt.s32 %p88, %r6119, %r356; + setp.lt.s32 %p89, %r6120, %r359; + setp.lt.s32 %p90, %r6120, %r360; + setp.lt.s32 %p91, %r6119, %r359; + setp.lt.s32 %p92, %r6119, %r360; + setp.lt.s32 %p93, %r6120, %r304; + setp.lt.s32 %p94, %r6120, %r303; + setp.lt.s32 %p95, %r6119, %r304; + setp.lt.s32 %p96, %r6119, %r303; + setp.lt.s32 %p97, %r6120, %r302; + setp.lt.s32 %p98, %r6120, %r301; + setp.lt.s32 %p99, %r6119, %r302; + setp.lt.s32 %p100, %r6119, %r301; + selp.f32 %f452, 0fFF800000, %f391, %p100; + selp.f32 %f453, 0fFF800000, %f390, %p99; + selp.f32 %f454, 0fFF800000, %f389, %p98; + selp.f32 %f455, 0fFF800000, %f388, %p97; + selp.f32 %f456, 0fFF800000, %f395, %p96; + selp.f32 %f457, 0fFF800000, %f394, %p95; + selp.f32 %f458, 0fFF800000, %f393, %p94; + selp.f32 %f459, 0fFF800000, %f392, %p93; + selp.f32 %f460, 0fFF800000, %f399, %p92; + selp.f32 %f461, 0fFF800000, %f398, %p91; + selp.f32 %f462, 0fFF800000, %f397, %p90; + selp.f32 %f463, 0fFF800000, %f396, %p89; + selp.f32 %f464, 0fFF800000, %f403, %p88; + selp.f32 %f465, 0fFF800000, %f402, %p87; + selp.f32 %f466, 0fFF800000, %f401, %p86; + selp.f32 %f467, 0fFF800000, %f400, %p85; + selp.f32 %f468, 0fFF800000, %f407, %p84; + selp.f32 %f469, 0fFF800000, %f406, %p83; + selp.f32 %f470, 0fFF800000, %f405, %p82; + selp.f32 %f471, 0fFF800000, %f404, %p81; + selp.f32 %f472, 0fFF800000, %f411, %p80; + selp.f32 %f473, 0fFF800000, %f410, %p79; + selp.f32 %f474, 0fFF800000, %f409, %p78; + selp.f32 %f475, 0fFF800000, %f408, %p77; + selp.f32 %f476, 0fFF800000, %f415, %p76; + selp.f32 %f477, 0fFF800000, %f414, %p75; + selp.f32 %f478, 0fFF800000, %f413, %p74; + selp.f32 %f479, 0fFF800000, %f412, %p73; + selp.f32 %f480, 0fFF800000, %f419, %p72; + selp.f32 %f481, 0fFF800000, %f418, %p71; + selp.f32 %f482, 0fFF800000, %f417, %p70; + selp.f32 %f483, 0fFF800000, %f416, %p69; + selp.f32 %f484, 0fFF800000, %f423, %p68; + selp.f32 %f485, 0fFF800000, %f422, %p67; + selp.f32 %f486, 0fFF800000, %f421, %p66; + selp.f32 %f487, 0fFF800000, %f420, %p65; + selp.f32 %f488, 0fFF800000, %f427, %p64; + selp.f32 %f489, 0fFF800000, %f426, %p63; + selp.f32 %f490, 0fFF800000, %f425, %p62; + selp.f32 %f491, 0fFF800000, %f424, %p61; + selp.f32 %f492, 0fFF800000, %f431, %p60; + selp.f32 %f493, 0fFF800000, %f430, %p59; + selp.f32 %f494, 0fFF800000, %f429, %p58; + selp.f32 %f495, 0fFF800000, %f428, %p57; + selp.f32 %f496, 0fFF800000, %f435, %p56; + selp.f32 %f497, 0fFF800000, %f434, %p55; + selp.f32 %f498, 0fFF800000, %f433, %p54; + selp.f32 %f499, 0fFF800000, %f432, %p53; + selp.f32 %f500, 0fFF800000, %f439, %p52; + selp.f32 %f501, 0fFF800000, %f438, %p51; + selp.f32 %f502, 0fFF800000, %f437, %p50; + selp.f32 %f503, 0fFF800000, %f436, %p49; + selp.f32 %f504, 0fFF800000, %f443, %p48; + selp.f32 %f505, 0fFF800000, %f442, %p47; + selp.f32 %f506, 0fFF800000, %f441, %p46; + selp.f32 %f507, 0fFF800000, %f440, %p45; + selp.f32 %f508, 0fFF800000, %f447, %p44; + selp.f32 %f509, 0fFF800000, %f446, %p43; + selp.f32 %f510, 0fFF800000, %f445, %p42; + selp.f32 %f511, 0fFF800000, %f444, %p41; + selp.f32 %f512, 0fFF800000, %f451, %p40; + selp.f32 %f513, 0fFF800000, %f450, %p39; + selp.f32 %f514, 0fFF800000, %f449, %p38; + selp.f32 %f515, 0fFF800000, %f448, %p37; + mul.wide.s32 %rd112, %r6102, 4; + add.s64 %rd90, %rd8, %rd112; + @%p102 ld.global.b32 { %r1776 }, [ %rd90 + 0 ]; + st.shared.u32 [%r72], %r1776; bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1695, %r1696, %r1709, %r1710 }, [ %r1611 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1807, %r1808, %r1821, %r1822 }, [ %r1616 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1919, %r1920, %r1933, %r1934 }, [ %r1621 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2031, %r2032, %r2045, %r2046 }, [ %r1626 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2143, %r2144, %r2157, %r2158 }, [ %r1631 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2255, %r2256, %r2269, %r2270 }, [ %r1636 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2367, %r2368, %r2381, %r2382 }, [ %r1641 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2479, %r2480, %r2493, %r2494 }, [ %r1646 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1723, %r1724, %r1737, %r1738 }, [ %r1651 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1835, %r1836, %r1849, %r1850 }, [ %r1656 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1947, %r1948, %r1961, %r1962 }, [ %r1661 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2059, %r2060, %r2073, %r2074 }, [ %r1666 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2171, %r2172, %r2185, %r2186 }, [ %r1671 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2283, %r2284, %r2297, %r2298 }, [ %r1676 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2395, %r2396, %r2409, %r2410 }, [ %r1681 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2507, %r2508, %r2521, %r2522 }, [ %r1686 + 0 ]; - mov.b32 %r1799, %f515; - mov.b32 %r1800, %f516; - mov.b32 %r1801, %f517; - mov.b32 %r1802, %f518; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1695, %r1696 }, { %r1799, %r1800, %r1801, %r1802 }; - mov.b32 %r1813, %f519; - mov.b32 %r1814, %f520; - mov.b32 %r1815, %f521; - mov.b32 %r1816, %f522; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1709, %r1710 }, { %r1813, %r1814, %r1815, %r1816 }; - mov.b32 %r1827, %f523; - mov.b32 %r1828, %f524; - mov.b32 %r1829, %f525; - mov.b32 %r1830, %f526; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1723, %r1724 }, { %r1827, %r1828, %r1829, %r1830 }; - mov.b32 %r1841, %f527; - mov.b32 %r1842, %f528; - mov.b32 %r1843, %f529; - mov.b32 %r1844, %f530; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1737, %r1738 }, { %r1841, %r1842, %r1843, %r1844 }; - mov.b32 %r1855, %f531; - mov.b32 %r1856, %f532; - mov.b32 %r1857, %f533; - mov.b32 %r1858, %f534; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1695, %r1696 }, { %r1855, %r1856, %r1857, %r1858 }; - mov.b32 %r1869, %f535; - mov.b32 %r1870, %f536; - mov.b32 %r1871, %f537; - mov.b32 %r1872, %f538; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1709, %r1710 }, { %r1869, %r1870, %r1871, %r1872 }; - mov.b32 %r1883, %f539; - mov.b32 %r1884, %f540; - mov.b32 %r1885, %f541; - mov.b32 %r1886, %f542; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1723, %r1724 }, { %r1883, %r1884, %r1885, %r1886 }; - mov.b32 %r1897, %f543; - mov.b32 %r1898, %f544; - mov.b32 %r1899, %f545; - mov.b32 %r1900, %f546; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1737, %r1738 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1807, %r1808 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1821, %r1822 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1835, %r1836 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1849, %r1850 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1807, %r1808 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1821, %r1822 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1835, %r1836 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1849, %r1850 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1919, %r1920 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1933, %r1934 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1947, %r1948 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1961, %r1962 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1919, %r1920 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1933, %r1934 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1947, %r1948 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1961, %r1962 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2031, %r2032 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2045, %r2046 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2059, %r2060 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2073, %r2074 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2031, %r2032 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2045, %r2046 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2059, %r2060 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2073, %r2074 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2143, %r2144 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2157, %r2158 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2171, %r2172 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2185, %r2186 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2143, %r2144 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2157, %r2158 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2171, %r2172 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2185, %r2186 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2255, %r2256 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2269, %r2270 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2283, %r2284 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2297, %r2298 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2255, %r2256 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2269, %r2270 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2283, %r2284 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2297, %r2298 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2367, %r2368 }, { %r1799, %r1800, %r1801, %r1802 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2381, %r2382 }, { %r1813, %r1814, %r1815, %r1816 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2395, %r2396 }, { %r1827, %r1828, %r1829, %r1830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2409, %r2410 }, { %r1841, %r1842, %r1843, %r1844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2367, %r2368 }, { %r1855, %r1856, %r1857, %r1858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2381, %r2382 }, { %r1869, %r1870, %r1871, %r1872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2395, %r2396 }, { %r1883, %r1884, %r1885, %r1886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2409, %r2410 }, { %r1897, %r1898, %r1899, %r1900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2479, %r2480 }, { %r1799, %r1800, %r1801, %r1802 }; - mov.b32 %f518, %r1802; - mov.b32 %f517, %r1801; - mov.b32 %f516, %r1800; - mov.b32 %f515, %r1799; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2493, %r2494 }, { %r1813, %r1814, %r1815, %r1816 }; - mov.b32 %f522, %r1816; - mov.b32 %f521, %r1815; - mov.b32 %f520, %r1814; - mov.b32 %f519, %r1813; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2507, %r2508 }, { %r1827, %r1828, %r1829, %r1830 }; - mov.b32 %f526, %r1830; - mov.b32 %f525, %r1829; - mov.b32 %f524, %r1828; - mov.b32 %f523, %r1827; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2521, %r2522 }, { %r1841, %r1842, %r1843, %r1844 }; - mov.b32 %f530, %r1844; - mov.b32 %f529, %r1843; - mov.b32 %f528, %r1842; - mov.b32 %f527, %r1841; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2479, %r2480 }, { %r1855, %r1856, %r1857, %r1858 }; - mov.b32 %f534, %r1858; - mov.b32 %f533, %r1857; - mov.b32 %f532, %r1856; - mov.b32 %f531, %r1855; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2493, %r2494 }, { %r1869, %r1870, %r1871, %r1872 }; - mov.b32 %f538, %r1872; - mov.b32 %f537, %r1871; - mov.b32 %f536, %r1870; - mov.b32 %f535, %r1869; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2507, %r2508 }, { %r1883, %r1884, %r1885, %r1886 }; - mov.b32 %f542, %r1886; - mov.b32 %f541, %r1885; - mov.b32 %f540, %r1884; - mov.b32 %f539, %r1883; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2521, %r2522 }, { %r1897, %r1898, %r1899, %r1900 }; - mov.b32 %f546, %r1900; - mov.b32 %f545, %r1899; - mov.b32 %f544, %r1898; - mov.b32 %f543, %r1897; + ld.shared.f32 %f516, [%r73]; + ld.shared.f32 %f517, [%r74+32]; + neg.f32 %f518, %f516; + fma.rn.f32 %f519, %f452, %f195, %f518; + fma.rn.f32 %f520, %f453, %f195, %f518; + neg.f32 %f521, %f517; + fma.rn.f32 %f522, %f454, %f195, %f521; + fma.rn.f32 %f523, %f455, %f195, %f521; + fma.rn.f32 %f524, %f456, %f195, %f518; + fma.rn.f32 %f525, %f457, %f195, %f518; + fma.rn.f32 %f526, %f458, %f195, %f521; + fma.rn.f32 %f527, %f459, %f195, %f521; + fma.rn.f32 %f528, %f460, %f195, %f518; + fma.rn.f32 %f529, %f461, %f195, %f518; + fma.rn.f32 %f530, %f462, %f195, %f521; + fma.rn.f32 %f531, %f463, %f195, %f521; + fma.rn.f32 %f532, %f464, %f195, %f518; + fma.rn.f32 %f533, %f465, %f195, %f518; + fma.rn.f32 %f534, %f466, %f195, %f521; + fma.rn.f32 %f535, %f467, %f195, %f521; + fma.rn.f32 %f536, %f468, %f195, %f518; + fma.rn.f32 %f537, %f469, %f195, %f518; + fma.rn.f32 %f538, %f470, %f195, %f521; + fma.rn.f32 %f539, %f471, %f195, %f521; + fma.rn.f32 %f540, %f472, %f195, %f518; + fma.rn.f32 %f541, %f473, %f195, %f518; + fma.rn.f32 %f542, %f474, %f195, %f521; + fma.rn.f32 %f543, %f475, %f195, %f521; + fma.rn.f32 %f544, %f476, %f195, %f518; + fma.rn.f32 %f545, %f477, %f195, %f518; + fma.rn.f32 %f546, %f478, %f195, %f521; + fma.rn.f32 %f547, %f479, %f195, %f521; + fma.rn.f32 %f548, %f480, %f195, %f518; + fma.rn.f32 %f549, %f481, %f195, %f518; + fma.rn.f32 %f550, %f482, %f195, %f521; + fma.rn.f32 %f551, %f483, %f195, %f521; + fma.rn.f32 %f552, %f484, %f195, %f518; + fma.rn.f32 %f553, %f485, %f195, %f518; + fma.rn.f32 %f554, %f486, %f195, %f521; + fma.rn.f32 %f555, %f487, %f195, %f521; + fma.rn.f32 %f556, %f488, %f195, %f518; + fma.rn.f32 %f557, %f489, %f195, %f518; + fma.rn.f32 %f558, %f490, %f195, %f521; + fma.rn.f32 %f559, %f491, %f195, %f521; + fma.rn.f32 %f560, %f492, %f195, %f518; + fma.rn.f32 %f561, %f493, %f195, %f518; + fma.rn.f32 %f562, %f494, %f195, %f521; + fma.rn.f32 %f563, %f495, %f195, %f521; + fma.rn.f32 %f564, %f496, %f195, %f518; + fma.rn.f32 %f565, %f497, %f195, %f518; + fma.rn.f32 %f566, %f498, %f195, %f521; + fma.rn.f32 %f567, %f499, %f195, %f521; + fma.rn.f32 %f568, %f500, %f195, %f518; + fma.rn.f32 %f569, %f501, %f195, %f518; + fma.rn.f32 %f570, %f502, %f195, %f521; + fma.rn.f32 %f571, %f503, %f195, %f521; + fma.rn.f32 %f572, %f504, %f195, %f518; + fma.rn.f32 %f573, %f505, %f195, %f518; + fma.rn.f32 %f574, %f506, %f195, %f521; + fma.rn.f32 %f575, %f507, %f195, %f521; + fma.rn.f32 %f576, %f508, %f195, %f518; + fma.rn.f32 %f577, %f509, %f195, %f518; + fma.rn.f32 %f578, %f510, %f195, %f521; + fma.rn.f32 %f579, %f511, %f195, %f521; + fma.rn.f32 %f580, %f512, %f195, %f518; + fma.rn.f32 %f581, %f513, %f195, %f518; + fma.rn.f32 %f582, %f514, %f195, %f521; + fma.rn.f32 %f583, %f515, %f195, %f521; + mul.f32 %f261, %f519, 0f3FB8AA3B; + ex2.approx.f32 %f260, %f261; + mul.f32 %f263, %f520, 0f3FB8AA3B; + ex2.approx.f32 %f262, %f263; + mul.f32 %f265, %f522, 0f3FB8AA3B; + ex2.approx.f32 %f264, %f265; + mul.f32 %f267, %f523, 0f3FB8AA3B; + ex2.approx.f32 %f266, %f267; + mul.f32 %f269, %f524, 0f3FB8AA3B; + ex2.approx.f32 %f268, %f269; + mul.f32 %f271, %f525, 0f3FB8AA3B; + ex2.approx.f32 %f270, %f271; + mul.f32 %f273, %f526, 0f3FB8AA3B; + ex2.approx.f32 %f272, %f273; + mul.f32 %f275, %f527, 0f3FB8AA3B; + ex2.approx.f32 %f274, %f275; + mul.f32 %f277, %f528, 0f3FB8AA3B; + ex2.approx.f32 %f276, %f277; + mul.f32 %f279, %f529, 0f3FB8AA3B; + ex2.approx.f32 %f278, %f279; + mul.f32 %f281, %f530, 0f3FB8AA3B; + ex2.approx.f32 %f280, %f281; + mul.f32 %f283, %f531, 0f3FB8AA3B; + ex2.approx.f32 %f282, %f283; + mul.f32 %f285, %f532, 0f3FB8AA3B; + ex2.approx.f32 %f284, %f285; + mul.f32 %f287, %f533, 0f3FB8AA3B; + ex2.approx.f32 %f286, %f287; + mul.f32 %f289, %f534, 0f3FB8AA3B; + ex2.approx.f32 %f288, %f289; + mul.f32 %f291, %f535, 0f3FB8AA3B; + ex2.approx.f32 %f290, %f291; + mul.f32 %f293, %f536, 0f3FB8AA3B; + ex2.approx.f32 %f292, %f293; + mul.f32 %f295, %f537, 0f3FB8AA3B; + ex2.approx.f32 %f294, %f295; + mul.f32 %f297, %f538, 0f3FB8AA3B; + ex2.approx.f32 %f296, %f297; + mul.f32 %f299, %f539, 0f3FB8AA3B; + ex2.approx.f32 %f298, %f299; + mul.f32 %f301, %f540, 0f3FB8AA3B; + ex2.approx.f32 %f300, %f301; + mul.f32 %f303, %f541, 0f3FB8AA3B; + ex2.approx.f32 %f302, %f303; + mul.f32 %f305, %f542, 0f3FB8AA3B; + ex2.approx.f32 %f304, %f305; + mul.f32 %f307, %f543, 0f3FB8AA3B; + ex2.approx.f32 %f306, %f307; + mul.f32 %f309, %f544, 0f3FB8AA3B; + ex2.approx.f32 %f308, %f309; + mul.f32 %f311, %f545, 0f3FB8AA3B; + ex2.approx.f32 %f310, %f311; + mul.f32 %f313, %f546, 0f3FB8AA3B; + ex2.approx.f32 %f312, %f313; + mul.f32 %f315, %f547, 0f3FB8AA3B; + ex2.approx.f32 %f314, %f315; + mul.f32 %f317, %f548, 0f3FB8AA3B; + ex2.approx.f32 %f316, %f317; + mul.f32 %f319, %f549, 0f3FB8AA3B; + ex2.approx.f32 %f318, %f319; + mul.f32 %f321, %f550, 0f3FB8AA3B; + ex2.approx.f32 %f320, %f321; + mul.f32 %f323, %f551, 0f3FB8AA3B; + ex2.approx.f32 %f322, %f323; + mul.f32 %f325, %f552, 0f3FB8AA3B; + ex2.approx.f32 %f324, %f325; + mul.f32 %f327, %f553, 0f3FB8AA3B; + ex2.approx.f32 %f326, %f327; + mul.f32 %f329, %f554, 0f3FB8AA3B; + ex2.approx.f32 %f328, %f329; + mul.f32 %f331, %f555, 0f3FB8AA3B; + ex2.approx.f32 %f330, %f331; + mul.f32 %f333, %f556, 0f3FB8AA3B; + ex2.approx.f32 %f332, %f333; + mul.f32 %f335, %f557, 0f3FB8AA3B; + ex2.approx.f32 %f334, %f335; + mul.f32 %f337, %f558, 0f3FB8AA3B; + ex2.approx.f32 %f336, %f337; + mul.f32 %f339, %f559, 0f3FB8AA3B; + ex2.approx.f32 %f338, %f339; + mul.f32 %f341, %f560, 0f3FB8AA3B; + ex2.approx.f32 %f340, %f341; + mul.f32 %f343, %f561, 0f3FB8AA3B; + ex2.approx.f32 %f342, %f343; + mul.f32 %f345, %f562, 0f3FB8AA3B; + ex2.approx.f32 %f344, %f345; + mul.f32 %f347, %f563, 0f3FB8AA3B; + ex2.approx.f32 %f346, %f347; + mul.f32 %f349, %f564, 0f3FB8AA3B; + ex2.approx.f32 %f348, %f349; + mul.f32 %f351, %f565, 0f3FB8AA3B; + ex2.approx.f32 %f350, %f351; + mul.f32 %f353, %f566, 0f3FB8AA3B; + ex2.approx.f32 %f352, %f353; + mul.f32 %f355, %f567, 0f3FB8AA3B; + ex2.approx.f32 %f354, %f355; + mul.f32 %f357, %f568, 0f3FB8AA3B; + ex2.approx.f32 %f356, %f357; + mul.f32 %f359, %f569, 0f3FB8AA3B; + ex2.approx.f32 %f358, %f359; + mul.f32 %f361, %f570, 0f3FB8AA3B; + ex2.approx.f32 %f360, %f361; + mul.f32 %f363, %f571, 0f3FB8AA3B; + ex2.approx.f32 %f362, %f363; + mul.f32 %f365, %f572, 0f3FB8AA3B; + ex2.approx.f32 %f364, %f365; + mul.f32 %f367, %f573, 0f3FB8AA3B; + ex2.approx.f32 %f366, %f367; + mul.f32 %f369, %f574, 0f3FB8AA3B; + ex2.approx.f32 %f368, %f369; + mul.f32 %f371, %f575, 0f3FB8AA3B; + ex2.approx.f32 %f370, %f371; + mul.f32 %f373, %f576, 0f3FB8AA3B; + ex2.approx.f32 %f372, %f373; + mul.f32 %f375, %f577, 0f3FB8AA3B; + ex2.approx.f32 %f374, %f375; + mul.f32 %f377, %f578, 0f3FB8AA3B; + ex2.approx.f32 %f376, %f377; + mul.f32 %f379, %f579, 0f3FB8AA3B; + ex2.approx.f32 %f378, %f379; + mul.f32 %f381, %f580, 0f3FB8AA3B; + ex2.approx.f32 %f380, %f381; + mul.f32 %f383, %f581, 0f3FB8AA3B; + ex2.approx.f32 %f382, %f383; + mul.f32 %f385, %f582, 0f3FB8AA3B; + ex2.approx.f32 %f384, %f385; + mul.f32 %f387, %f583, 0f3FB8AA3B; + ex2.approx.f32 %f386, %f387; + @%p102 ld.global.v4.b32 { %r6121, %r6122, %r6123, %r6124 }, [ %rd91 + 0 ]; + mov.b32 %hh49, %r6121; + mov.b32 %hh50, %r6122; + mov.b32 %hh51, %r6123; + mov.b32 %hh52, %r6124; + @%p102 ld.global.v4.b32 { %r6125, %r6126, %r6127, %r6128 }, [ %rd92 + 0 ]; + mov.b32 %hh53, %r6125; + mov.b32 %hh54, %r6126; + mov.b32 %hh55, %r6127; + mov.b32 %hh56, %r6128; + @%p102 ld.global.v4.b32 { %r6129, %r6130, %r6131, %r6132 }, [ %rd93 + 0 ]; + mov.b32 %hh57, %r6129; + mov.b32 %hh58, %r6130; + mov.b32 %hh59, %r6131; + mov.b32 %hh60, %r6132; + @%p102 ld.global.v4.b32 { %r6133, %r6134, %r6135, %r6136 }, [ %rd94 + 0 ]; + mov.b32 %hh61, %r6133; + mov.b32 %hh62, %r6134; + mov.b32 %hh63, %r6135; + mov.b32 %hh64, %r6136; bar.sync 0; - st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534}; - st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538}; - st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542}; - st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546}; + st.shared.v4.b32 [%r75], {%r6121, %r6122, %r6123, %r6124}; + st.shared.v4.b32 [%r76], {%r6125, %r6126, %r6127, %r6128}; + st.shared.v4.b32 [%r77], {%r6129, %r6130, %r6131, %r6132}; + st.shared.v4.b32 [%r78], {%r6133, %r6134, %r6135, %r6136}; + cvt.rn.f16.f32 %h1, %f262; + cvt.rn.f16.f32 %h2, %f260; + cvt.rn.f16.f32 %h3, %f266; + cvt.rn.f16.f32 %h4, %f264; + cvt.rn.f16.f32 %h5, %f270; + cvt.rn.f16.f32 %h6, %f268; + cvt.rn.f16.f32 %h7, %f274; + cvt.rn.f16.f32 %h8, %f272; + cvt.rn.f16.f32 %h9, %f278; + cvt.rn.f16.f32 %h10, %f276; + cvt.rn.f16.f32 %h11, %f282; + cvt.rn.f16.f32 %h12, %f280; + cvt.rn.f16.f32 %h13, %f286; + cvt.rn.f16.f32 %h14, %f284; + cvt.rn.f16.f32 %h15, %f290; + cvt.rn.f16.f32 %h16, %f288; + cvt.rn.f16.f32 %h17, %f294; + cvt.rn.f16.f32 %h18, %f292; + cvt.rn.f16.f32 %h19, %f298; + cvt.rn.f16.f32 %h20, %f296; + cvt.rn.f16.f32 %h21, %f302; + cvt.rn.f16.f32 %h22, %f300; + cvt.rn.f16.f32 %h23, %f306; + cvt.rn.f16.f32 %h24, %f304; + cvt.rn.f16.f32 %h25, %f310; + cvt.rn.f16.f32 %h26, %f308; + cvt.rn.f16.f32 %h27, %f314; + cvt.rn.f16.f32 %h28, %f312; + cvt.rn.f16.f32 %h29, %f318; + cvt.rn.f16.f32 %h30, %f316; + cvt.rn.f16.f32 %h31, %f322; + cvt.rn.f16.f32 %h32, %f320; + cvt.rn.f16.f32 %h33, %f326; + cvt.rn.f16.f32 %h34, %f324; + cvt.rn.f16.f32 %h35, %f330; + cvt.rn.f16.f32 %h36, %f328; + cvt.rn.f16.f32 %h37, %f334; + cvt.rn.f16.f32 %h38, %f332; + cvt.rn.f16.f32 %h39, %f338; + cvt.rn.f16.f32 %h40, %f336; + cvt.rn.f16.f32 %h41, %f342; + cvt.rn.f16.f32 %h42, %f340; + cvt.rn.f16.f32 %h43, %f346; + cvt.rn.f16.f32 %h44, %f344; + cvt.rn.f16.f32 %h45, %f350; + cvt.rn.f16.f32 %h46, %f348; + cvt.rn.f16.f32 %h47, %f354; + cvt.rn.f16.f32 %h48, %f352; + cvt.rn.f16.f32 %h49, %f358; + cvt.rn.f16.f32 %h50, %f356; + cvt.rn.f16.f32 %h51, %f362; + cvt.rn.f16.f32 %h52, %f360; + cvt.rn.f16.f32 %h53, %f366; + cvt.rn.f16.f32 %h54, %f364; + cvt.rn.f16.f32 %h55, %f370; + cvt.rn.f16.f32 %h56, %f368; + cvt.rn.f16.f32 %h57, %f374; + cvt.rn.f16.f32 %h58, %f372; + cvt.rn.f16.f32 %h59, %f378; + cvt.rn.f16.f32 %h60, %f376; + cvt.rn.f16.f32 %h61, %f382; + cvt.rn.f16.f32 %h62, %f380; + cvt.rn.f16.f32 %h63, %f386; + cvt.rn.f16.f32 %h64, %f384; + st.shared.v2.b16 [%r79], {%h2, %h1}; + st.shared.v2.b16 [%r80], {%h4, %h3}; + st.shared.v2.b16 [%r81], {%h6, %h5}; + st.shared.v2.b16 [%r82], {%h8, %h7}; + st.shared.v2.b16 [%r83], {%h10, %h9}; + st.shared.v2.b16 [%r84], {%h12, %h11}; + st.shared.v2.b16 [%r85], {%h14, %h13}; + st.shared.v2.b16 [%r86], {%h16, %h15}; + st.shared.v2.b16 [%r87], {%h18, %h17}; + st.shared.v2.b16 [%r88], {%h20, %h19}; + st.shared.v2.b16 [%r89], {%h22, %h21}; + st.shared.v2.b16 [%r90], {%h24, %h23}; + st.shared.v2.b16 [%r91], {%h26, %h25}; + st.shared.v2.b16 [%r92], {%h28, %h27}; + st.shared.v2.b16 [%r93], {%h30, %h29}; + st.shared.v2.b16 [%r94], {%h32, %h31}; + st.shared.v2.b16 [%r79+128], {%h34, %h33}; + st.shared.v2.b16 [%r80+128], {%h36, %h35}; + st.shared.v2.b16 [%r97], {%h38, %h37}; + st.shared.v2.b16 [%r98], {%h40, %h39}; + st.shared.v2.b16 [%r99], {%h42, %h41}; + st.shared.v2.b16 [%r100], {%h44, %h43}; + st.shared.v2.b16 [%r101], {%h46, %h45}; + st.shared.v2.b16 [%r102], {%h48, %h47}; + st.shared.v2.b16 [%r103], {%h50, %h49}; + st.shared.v2.b16 [%r104], {%h52, %h51}; + st.shared.v2.b16 [%r105], {%h54, %h53}; + st.shared.v2.b16 [%r106], {%h56, %h55}; + st.shared.v2.b16 [%r107], {%h58, %h57}; + st.shared.v2.b16 [%r108], {%h60, %h59}; + st.shared.v2.b16 [%r109], {%h62, %h61}; + st.shared.v2.b16 [%r110], {%h64, %h63}; bar.sync 0; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2607, %r2608, %r2609, %r2610 }, [ %r599 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2831, %r2832, %r2833, %r2834 }, [ %r604 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3055, %r3056, %r3057, %r3058 }, [ %r609 + 0 ]; - ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3279, %r3280, %r3281, %r3282 }, [ %r614 + 0 ]; - mov.u32 %r2827, %r625; - mov.u32 %r2828, %r625; - mov.u32 %r2829, %r625; - mov.u32 %r2830, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3719, %r3720 }, { %r2827, %r2828, %r2829, %r2830 }; - mov.u32 %r2841, %r625; - mov.u32 %r2842, %r625; - mov.u32 %r2843, %r625; - mov.u32 %r2844, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3721, %r3722 }, { %r2841, %r2842, %r2843, %r2844 }; - mov.u32 %r2855, %r625; - mov.u32 %r2856, %r625; - mov.u32 %r2857, %r625; - mov.u32 %r2858, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3723, %r3724 }, { %r2855, %r2856, %r2857, %r2858 }; - mov.u32 %r2869, %r625; - mov.u32 %r2870, %r625; - mov.u32 %r2871, %r625; - mov.u32 %r2872, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3725, %r3726 }, { %r2869, %r2870, %r2871, %r2872 }; - mov.u32 %r2883, %r625; - mov.u32 %r2884, %r625; - mov.u32 %r2885, %r625; - mov.u32 %r2886, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3727, %r3728 }, { %r2883, %r2884, %r2885, %r2886 }; - mov.u32 %r2897, %r625; - mov.u32 %r2898, %r625; - mov.u32 %r2899, %r625; - mov.u32 %r2900, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3729, %r3730 }, { %r2897, %r2898, %r2899, %r2900 }; - mov.u32 %r2911, %r625; - mov.u32 %r2912, %r625; - mov.u32 %r2913, %r625; - mov.u32 %r2914, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3731, %r3732 }, { %r2911, %r2912, %r2913, %r2914 }; - mov.u32 %r2925, %r625; - mov.u32 %r2926, %r625; - mov.u32 %r2927, %r625; - mov.u32 %r2928, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3733, %r3734 }, { %r2925, %r2926, %r2927, %r2928 }; - mov.u32 %r2939, %r625; - mov.u32 %r2940, %r625; - mov.u32 %r2941, %r625; - mov.u32 %r2942, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3735, %r3736 }, { %r2939, %r2940, %r2941, %r2942 }; - mov.u32 %r2953, %r625; - mov.u32 %r2954, %r625; - mov.u32 %r2955, %r625; - mov.u32 %r2956, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3737, %r3738 }, { %r2953, %r2954, %r2955, %r2956 }; - mov.u32 %r2967, %r625; - mov.u32 %r2968, %r625; - mov.u32 %r2969, %r625; - mov.u32 %r2970, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3739, %r3740 }, { %r2967, %r2968, %r2969, %r2970 }; - mov.u32 %r2981, %r625; - mov.u32 %r2982, %r625; - mov.u32 %r2983, %r625; - mov.u32 %r2984, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3741, %r3742 }, { %r2981, %r2982, %r2983, %r2984 }; - mov.u32 %r2995, %r625; - mov.u32 %r2996, %r625; - mov.u32 %r2997, %r625; - mov.u32 %r2998, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3743, %r3744 }, { %r2995, %r2996, %r2997, %r2998 }; - mov.u32 %r3009, %r625; - mov.u32 %r3010, %r625; - mov.u32 %r3011, %r625; - mov.u32 %r3012, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3745, %r3746 }, { %r3009, %r3010, %r3011, %r3012 }; - mov.u32 %r3023, %r625; - mov.u32 %r3024, %r625; - mov.u32 %r3025, %r625; - mov.u32 %r3026, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3747, %r3748 }, { %r3023, %r3024, %r3025, %r3026 }; - mov.u32 %r3040, %r625; - mov.u32 %r3037, %r625; - mov.u32 %r3038, %r625; - mov.u32 %r3039, %r625; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3749, %r3750 }, { %r3037, %r3038, %r3039, %r3040 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3751, %r3752 }, { %r2827, %r2828, %r2829, %r2830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3753, %r3754 }, { %r2841, %r2842, %r2843, %r2844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3755, %r3756 }, { %r2855, %r2856, %r2857, %r2858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3757, %r3758 }, { %r2869, %r2870, %r2871, %r2872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3759, %r3760 }, { %r2883, %r2884, %r2885, %r2886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3761, %r3762 }, { %r2897, %r2898, %r2899, %r2900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3763, %r3764 }, { %r2911, %r2912, %r2913, %r2914 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3765, %r3766 }, { %r2925, %r2926, %r2927, %r2928 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3767, %r3768 }, { %r2939, %r2940, %r2941, %r2942 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3769, %r3770 }, { %r2953, %r2954, %r2955, %r2956 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3771, %r3772 }, { %r2967, %r2968, %r2969, %r2970 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3773, %r3774 }, { %r2981, %r2982, %r2983, %r2984 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3775, %r3776 }, { %r2995, %r2996, %r2997, %r2998 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3777, %r3778 }, { %r3009, %r3010, %r3011, %r3012 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3779, %r3780 }, { %r3023, %r3024, %r3025, %r3026 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3781, %r3782 }, { %r3037, %r3038, %r3039, %r3040 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3783, %r3784 }, { %r2827, %r2828, %r2829, %r2830 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3785, %r3786 }, { %r2841, %r2842, %r2843, %r2844 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3787, %r3788 }, { %r2855, %r2856, %r2857, %r2858 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3789, %r3790 }, { %r2869, %r2870, %r2871, %r2872 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3791, %r3792 }, { %r2883, %r2884, %r2885, %r2886 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3793, %r3794 }, { %r2897, %r2898, %r2899, %r2900 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3795, %r3796 }, { %r2911, %r2912, %r2913, %r2914 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3797, %r3798 }, { %r2925, %r2926, %r2927, %r2928 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3799, %r3800 }, { %r2939, %r2940, %r2941, %r2942 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3801, %r3802 }, { %r2953, %r2954, %r2955, %r2956 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3803, %r3804 }, { %r2967, %r2968, %r2969, %r2970 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3805, %r3806 }, { %r2981, %r2982, %r2983, %r2984 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3807, %r3808 }, { %r2995, %r2996, %r2997, %r2998 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3809, %r3810 }, { %r3009, %r3010, %r3011, %r3012 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3811, %r3812 }, { %r3023, %r3024, %r3025, %r3026 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3813, %r3814 }, { %r3037, %r3038, %r3039, %r3040 }; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3815, %r3816 }, { %r2827, %r2828, %r2829, %r2830 }; - mov.b32 %f226, %r2830; - mov.b32 %f227, %r2829; - mov.b32 %f228, %r2828; - mov.b32 %f229, %r2827; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3817, %r3818 }, { %r2841, %r2842, %r2843, %r2844 }; - mov.b32 %f230, %r2844; - mov.b32 %f231, %r2843; - mov.b32 %f232, %r2842; - mov.b32 %f233, %r2841; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3819, %r3820 }, { %r2855, %r2856, %r2857, %r2858 }; - mov.b32 %f234, %r2858; - mov.b32 %f235, %r2857; - mov.b32 %f236, %r2856; - mov.b32 %f237, %r2855; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3821, %r3822 }, { %r2869, %r2870, %r2871, %r2872 }; - mov.b32 %f238, %r2872; - mov.b32 %f239, %r2871; - mov.b32 %f240, %r2870; - mov.b32 %f241, %r2869; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3823, %r3824 }, { %r2883, %r2884, %r2885, %r2886 }; - mov.b32 %f242, %r2886; - mov.b32 %f243, %r2885; - mov.b32 %f244, %r2884; - mov.b32 %f245, %r2883; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3825, %r3826 }, { %r2897, %r2898, %r2899, %r2900 }; - mov.b32 %f246, %r2900; - mov.b32 %f247, %r2899; - mov.b32 %f248, %r2898; - mov.b32 %f249, %r2897; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3827, %r3828 }, { %r2911, %r2912, %r2913, %r2914 }; - mov.b32 %f250, %r2914; - mov.b32 %f251, %r2913; - mov.b32 %f252, %r2912; - mov.b32 %f253, %r2911; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3829, %r3830 }, { %r2925, %r2926, %r2927, %r2928 }; - mov.b32 %f254, %r2928; - mov.b32 %f255, %r2927; - mov.b32 %f256, %r2926; - mov.b32 %f257, %r2925; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3831, %r3832 }, { %r2939, %r2940, %r2941, %r2942 }; - mov.b32 %f258, %r2942; - mov.b32 %f259, %r2941; - mov.b32 %f260, %r2940; - mov.b32 %f261, %r2939; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3833, %r3834 }, { %r2953, %r2954, %r2955, %r2956 }; - mov.b32 %f262, %r2956; - mov.b32 %f263, %r2955; - mov.b32 %f264, %r2954; - mov.b32 %f265, %r2953; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3835, %r3836 }, { %r2967, %r2968, %r2969, %r2970 }; - mov.b32 %f266, %r2970; - mov.b32 %f267, %r2969; - mov.b32 %f268, %r2968; - mov.b32 %f269, %r2967; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3837, %r3838 }, { %r2981, %r2982, %r2983, %r2984 }; - mov.b32 %f270, %r2984; - mov.b32 %f271, %r2983; - mov.b32 %f272, %r2982; - mov.b32 %f273, %r2981; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3839, %r3840 }, { %r2995, %r2996, %r2997, %r2998 }; - mov.b32 %f274, %r2998; - mov.b32 %f275, %r2997; - mov.b32 %f276, %r2996; - mov.b32 %f277, %r2995; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3841, %r3842 }, { %r3009, %r3010, %r3011, %r3012 }; - mov.b32 %f278, %r3012; - mov.b32 %f279, %r3011; - mov.b32 %f280, %r3010; - mov.b32 %f281, %r3009; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3843, %r3844 }, { %r3023, %r3024, %r3025, %r3026 }; - mov.b32 %f282, %r3026; - mov.b32 %f283, %r3025; - mov.b32 %f284, %r3024; - mov.b32 %f285, %r3023; - mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3845, %r3846 }, { %r3037, %r3038, %r3039, %r3040 }; - mov.b32 %f286, %r3040; - mov.b32 %f287, %r3039; - mov.b32 %f288, %r3038; - mov.b32 %f289, %r3037; - mul.f32 %f290, %f165, %f229; - mul.f32 %f291, %f164, %f228; - mul.f32 %f292, %f163, %f227; - mul.f32 %f293, %f162, %f226; - mul.f32 %f294, %f169, %f233; - mul.f32 %f295, %f168, %f232; - mul.f32 %f296, %f167, %f231; - mul.f32 %f297, %f166, %f230; - mul.f32 %f298, %f173, %f237; - mul.f32 %f299, %f172, %f236; - mul.f32 %f300, %f171, %f235; - mul.f32 %f301, %f170, %f234; - mul.f32 %f302, %f177, %f241; - mul.f32 %f303, %f176, %f240; - mul.f32 %f304, %f175, %f239; - mul.f32 %f305, %f174, %f238; - mul.f32 %f306, %f181, %f245; - mul.f32 %f307, %f180, %f244; - mul.f32 %f308, %f179, %f243; - mul.f32 %f309, %f178, %f242; - mul.f32 %f310, %f185, %f249; - mul.f32 %f311, %f184, %f248; - mul.f32 %f312, %f183, %f247; - mul.f32 %f313, %f182, %f246; - mul.f32 %f314, %f189, %f253; - mul.f32 %f315, %f188, %f252; - mul.f32 %f316, %f187, %f251; - mul.f32 %f317, %f186, %f250; - mul.f32 %f318, %f193, %f257; - mul.f32 %f319, %f192, %f256; - mul.f32 %f320, %f191, %f255; - mul.f32 %f321, %f190, %f254; - mul.f32 %f322, %f197, %f261; - mul.f32 %f323, %f196, %f260; - mul.f32 %f324, %f195, %f259; - mul.f32 %f325, %f194, %f258; - mul.f32 %f326, %f201, %f265; - mul.f32 %f327, %f200, %f264; - mul.f32 %f328, %f199, %f263; - mul.f32 %f329, %f198, %f262; - mul.f32 %f330, %f205, %f269; - mul.f32 %f331, %f204, %f268; - mul.f32 %f332, %f203, %f267; - mul.f32 %f333, %f202, %f266; - mul.f32 %f334, %f209, %f273; - mul.f32 %f335, %f208, %f272; - mul.f32 %f336, %f207, %f271; - mul.f32 %f337, %f206, %f270; - mul.f32 %f338, %f213, %f277; - mul.f32 %f339, %f212, %f276; - mul.f32 %f340, %f211, %f275; - mul.f32 %f341, %f210, %f274; - mul.f32 %f342, %f217, %f281; - mul.f32 %f343, %f216, %f280; - mul.f32 %f344, %f215, %f279; - mul.f32 %f345, %f214, %f278; - mul.f32 %f346, %f221, %f285; - mul.f32 %f347, %f220, %f284; - mul.f32 %f348, %f219, %f283; - mul.f32 %f349, %f218, %f282; - mul.f32 %f350, %f225, %f289; - mul.f32 %f351, %f224, %f288; - mul.f32 %f352, %f223, %f287; - mul.f32 %f353, %f222, %f286; - mul.f32 %f354, %f290, %f97; - mul.f32 %f355, %f291, %f97; - mul.f32 %f356, %f292, %f97; - mul.f32 %f357, %f293, %f97; - mul.f32 %f358, %f294, %f97; - mul.f32 %f359, %f295, %f97; - mul.f32 %f360, %f296, %f97; - mul.f32 %f361, %f297, %f97; - mul.f32 %f362, %f298, %f97; - mul.f32 %f363, %f299, %f97; - mul.f32 %f364, %f300, %f97; - mul.f32 %f365, %f301, %f97; - mul.f32 %f366, %f302, %f97; - mul.f32 %f367, %f303, %f97; - mul.f32 %f368, %f304, %f97; - mul.f32 %f369, %f305, %f97; - mul.f32 %f370, %f306, %f97; - mul.f32 %f371, %f307, %f97; - mul.f32 %f372, %f308, %f97; - mul.f32 %f373, %f309, %f97; - mul.f32 %f374, %f310, %f97; - mul.f32 %f375, %f311, %f97; - mul.f32 %f376, %f312, %f97; - mul.f32 %f377, %f313, %f97; - mul.f32 %f378, %f314, %f97; - mul.f32 %f379, %f315, %f97; - mul.f32 %f380, %f316, %f97; - mul.f32 %f381, %f317, %f97; - mul.f32 %f382, %f318, %f97; - mul.f32 %f383, %f319, %f97; - mul.f32 %f384, %f320, %f97; - mul.f32 %f385, %f321, %f97; - mul.f32 %f386, %f322, %f97; - mul.f32 %f387, %f323, %f97; - mul.f32 %f388, %f324, %f97; - mul.f32 %f389, %f325, %f97; - mul.f32 %f390, %f326, %f97; - mul.f32 %f391, %f327, %f97; - mul.f32 %f392, %f328, %f97; - mul.f32 %f393, %f329, %f97; - mul.f32 %f394, %f330, %f97; - mul.f32 %f395, %f331, %f97; - mul.f32 %f396, %f332, %f97; - mul.f32 %f397, %f333, %f97; - mul.f32 %f398, %f334, %f97; - mul.f32 %f399, %f335, %f97; - mul.f32 %f400, %f336, %f97; - mul.f32 %f401, %f337, %f97; - mul.f32 %f402, %f338, %f97; - mul.f32 %f403, %f339, %f97; - mul.f32 %f404, %f340, %f97; - mul.f32 %f405, %f341, %f97; - mul.f32 %f406, %f342, %f97; - mul.f32 %f407, %f343, %f97; - mul.f32 %f408, %f344, %f97; - mul.f32 %f409, %f345, %f97; - mul.f32 %f410, %f346, %f97; - mul.f32 %f411, %f347, %f97; - mul.f32 %f412, %f348, %f97; - mul.f32 %f413, %f349, %f97; - mul.f32 %f414, %f350, %f97; - mul.f32 %f415, %f351, %f97; - mul.f32 %f416, %f352, %f97; - mul.f32 %f417, %f353, %f97; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1957, %r1958, %r1959, %r1960 }, [ %r1797 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2069, %r2070, %r2071, %r2072 }, [ %r1802 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2181, %r2182, %r2183, %r2184 }, [ %r1807 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2293, %r2294, %r2295, %r2296 }, [ %r1812 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2405, %r2406, %r2407, %r2408 }, [ %r1817 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2517, %r2518, %r2519, %r2520 }, [ %r1822 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2629, %r2630, %r2631, %r2632 }, [ %r1827 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2741, %r2742, %r2743, %r2744 }, [ %r1832 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2013, %r2014, %r2015, %r2016 }, [ %r1837 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2125, %r2126, %r2127, %r2128 }, [ %r1842 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2237, %r2238, %r2239, %r2240 }, [ %r1847 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2349, %r2350, %r2351, %r2352 }, [ %r1852 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2461, %r2462, %r2463, %r2464 }, [ %r1857 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2573, %r2574, %r2575, %r2576 }, [ %r1862 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2685, %r2686, %r2687, %r2688 }, [ %r1867 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2797, %r2798, %r2799, %r2800 }, [ %r1872 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1961, %r1962, %r1975, %r1976 }, [ %r1877 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2073, %r2074, %r2087, %r2088 }, [ %r1882 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2185, %r2186, %r2199, %r2200 }, [ %r1887 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2297, %r2298, %r2311, %r2312 }, [ %r1892 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2409, %r2410, %r2423, %r2424 }, [ %r1897 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2521, %r2522, %r2535, %r2536 }, [ %r1902 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2633, %r2634, %r2647, %r2648 }, [ %r1907 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2745, %r2746, %r2759, %r2760 }, [ %r1912 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1989, %r1990, %r2003, %r2004 }, [ %r1917 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2101, %r2102, %r2115, %r2116 }, [ %r1922 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2213, %r2214, %r2227, %r2228 }, [ %r1927 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2325, %r2326, %r2339, %r2340 }, [ %r1932 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2437, %r2438, %r2451, %r2452 }, [ %r1937 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2549, %r2550, %r2563, %r2564 }, [ %r1942 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2661, %r2662, %r2675, %r2676 }, [ %r1947 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2773, %r2774, %r2787, %r2788 }, [ %r1952 + 0 ]; + mov.b32 %r2065, %f909; + mov.b32 %r2066, %f910; + mov.b32 %r2067, %f911; + mov.b32 %r2068, %f912; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1961, %r1962 }, { %r2065, %r2066, %r2067, %r2068 }; + mov.b32 %r2079, %f913; + mov.b32 %r2080, %f914; + mov.b32 %r2081, %f915; + mov.b32 %r2082, %f916; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1975, %r1976 }, { %r2079, %r2080, %r2081, %r2082 }; + mov.b32 %r2093, %f917; + mov.b32 %r2094, %f918; + mov.b32 %r2095, %f919; + mov.b32 %r2096, %f920; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1989, %r1990 }, { %r2093, %r2094, %r2095, %r2096 }; + mov.b32 %r2107, %f921; + mov.b32 %r2108, %f922; + mov.b32 %r2109, %f923; + mov.b32 %r2110, %f924; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r1957, %r1958, %r1959, %r1960 }, { %r2003, %r2004 }, { %r2107, %r2108, %r2109, %r2110 }; + mov.b32 %r2121, %f925; + mov.b32 %r2122, %f926; + mov.b32 %r2123, %f927; + mov.b32 %r2124, %f928; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1961, %r1962 }, { %r2121, %r2122, %r2123, %r2124 }; + mov.b32 %r2135, %f929; + mov.b32 %r2136, %f930; + mov.b32 %r2137, %f931; + mov.b32 %r2138, %f932; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1975, %r1976 }, { %r2135, %r2136, %r2137, %r2138 }; + mov.b32 %r2149, %f933; + mov.b32 %r2150, %f934; + mov.b32 %r2151, %f935; + mov.b32 %r2152, %f936; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1989, %r1990 }, { %r2149, %r2150, %r2151, %r2152 }; + mov.b32 %r2163, %f937; + mov.b32 %r2164, %f938; + mov.b32 %r2165, %f939; + mov.b32 %r2166, %f940; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2013, %r2014, %r2015, %r2016 }, { %r2003, %r2004 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2073, %r2074 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2087, %r2088 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2101, %r2102 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2115, %r2116 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2073, %r2074 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2087, %r2088 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2101, %r2102 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2115, %r2116 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2185, %r2186 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2199, %r2200 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2213, %r2214 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2227, %r2228 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2185, %r2186 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2199, %r2200 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2213, %r2214 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2227, %r2228 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2297, %r2298 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2311, %r2312 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2325, %r2326 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2339, %r2340 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2297, %r2298 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2311, %r2312 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2325, %r2326 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2339, %r2340 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2409, %r2410 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2423, %r2424 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2437, %r2438 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2451, %r2452 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2409, %r2410 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2423, %r2424 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2437, %r2438 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2451, %r2452 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2521, %r2522 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2535, %r2536 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2549, %r2550 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2563, %r2564 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2521, %r2522 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2535, %r2536 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2549, %r2550 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2563, %r2564 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2633, %r2634 }, { %r2065, %r2066, %r2067, %r2068 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2647, %r2648 }, { %r2079, %r2080, %r2081, %r2082 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2661, %r2662 }, { %r2093, %r2094, %r2095, %r2096 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2675, %r2676 }, { %r2107, %r2108, %r2109, %r2110 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2633, %r2634 }, { %r2121, %r2122, %r2123, %r2124 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2647, %r2648 }, { %r2135, %r2136, %r2137, %r2138 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2661, %r2662 }, { %r2149, %r2150, %r2151, %r2152 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2675, %r2676 }, { %r2163, %r2164, %r2165, %r2166 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2745, %r2746 }, { %r2065, %r2066, %r2067, %r2068 }; + mov.b32 %f912, %r2068; + mov.b32 %f911, %r2067; + mov.b32 %f910, %r2066; + mov.b32 %f909, %r2065; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2759, %r2760 }, { %r2079, %r2080, %r2081, %r2082 }; + mov.b32 %f916, %r2082; + mov.b32 %f915, %r2081; + mov.b32 %f914, %r2080; + mov.b32 %f913, %r2079; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2773, %r2774 }, { %r2093, %r2094, %r2095, %r2096 }; + mov.b32 %f920, %r2096; + mov.b32 %f919, %r2095; + mov.b32 %f918, %r2094; + mov.b32 %f917, %r2093; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2787, %r2788 }, { %r2107, %r2108, %r2109, %r2110 }; + mov.b32 %f924, %r2110; + mov.b32 %f923, %r2109; + mov.b32 %f922, %r2108; + mov.b32 %f921, %r2107; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2745, %r2746 }, { %r2121, %r2122, %r2123, %r2124 }; + mov.b32 %f928, %r2124; + mov.b32 %f927, %r2123; + mov.b32 %f926, %r2122; + mov.b32 %f925, %r2121; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2759, %r2760 }, { %r2135, %r2136, %r2137, %r2138 }; + mov.b32 %f932, %r2138; + mov.b32 %f931, %r2137; + mov.b32 %f930, %r2136; + mov.b32 %f929, %r2135; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2773, %r2774 }, { %r2149, %r2150, %r2151, %r2152 }; + mov.b32 %f936, %r2152; + mov.b32 %f935, %r2151; + mov.b32 %f934, %r2150; + mov.b32 %f933, %r2149; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2787, %r2788 }, { %r2163, %r2164, %r2165, %r2166 }; + mov.b32 %f940, %r2166; + mov.b32 %f939, %r2165; + mov.b32 %f938, %r2164; + mov.b32 %f937, %r2163; + add.s64 %rd95, %rd7, %rd112; + @%p102 ld.global.b32 { %r2849 }, [ %rd95 + 0 ]; bar.sync 0; - st.shared.b32 [%r28], %f254; - st.shared.b32 [%r28+256], %f255; - st.shared.b32 [%r28+16], %f256; - st.shared.b32 [%r28+16], %f257; - st.shared.b32 [%r28+2048], %f258; - st.shared.b32 [%r28+2304], %f259; - st.shared.b32 [%r28+16], %f260; - st.shared.b32 [%r28+16], %f261; - st.shared.b32 [%r28+4096], %f262; - st.shared.b32 [%r28+4352], %f263; - st.shared.b32 [%r28+16], %f264; - st.shared.b32 [%r28+16], %f265; - st.shared.b32 [%r28+6144], %f266; - st.shared.b32 [%r28+6400], %f267; - st.shared.b32 [%r28+16], %f268; - st.shared.b32 [%r28+16], %f269; - st.shared.b32 [%r28+8192], %f270; - st.shared.b32 [%r28+8448], %f271; - st.shared.b32 [%r28+16], %f272; - st.shared.b32 [%r28+16], %f273; - st.shared.b32 [%r28+10240], %f274; - st.shared.b32 [%r28+10496], %f275; - st.shared.b32 [%r28+16], %f276; - st.shared.b32 [%r28+16], %f277; - st.shared.b32 [%r28+12288], %f278; - st.shared.b32 [%r28+12544], %f279; - st.shared.b32 [%r28+16], %f280; - st.shared.b32 [%r28+16], %f281; - st.shared.b32 [%r28+14336], %f282; - st.shared.b32 [%r28+14592], %f283; - st.shared.b32 [%r28+16], %f284; - st.shared.b32 [%r28+16], %f285; - st.shared.b32 [%r28+16384], %f286; - st.shared.b32 [%r28+16640], %f287; - st.shared.b32 [%r28+16], %f288; - st.shared.b32 [%r28+16], %f289; - st.shared.b32 [%r28+18432], %f290; - st.shared.b32 [%r28+18688], %f291; - st.shared.b32 [%r28+16], %f292; - st.shared.b32 [%r28+16], %f293; - st.shared.b32 [%r28+20480], %f294; - st.shared.b32 [%r28+20736], %f295; - st.shared.b32 [%r28+16], %f296; - st.shared.b32 [%r28+16], %f297; - st.shared.b32 [%r28+22528], %f298; - st.shared.b32 [%r28+22784], %f299; - st.shared.b32 [%r28+16], %f300; - st.shared.b32 [%r28+16], %f301; - st.shared.b32 [%r28+24576], %f302; - st.shared.b32 [%r28+24832], %f303; - st.shared.b32 [%r28+16], %f304; - st.shared.b32 [%r28+16], %f305; - st.shared.b32 [%r28+26624], %f306; - st.shared.b32 [%r28+26880], %f307; - st.shared.b32 [%r28+16], %f308; - st.shared.b32 [%r28+16], %f309; - st.shared.b32 [%r28+28672], %f310; - st.shared.b32 [%r28+28928], %f311; - st.shared.b32 [%r28+16], %f312; - st.shared.b32 [%r28+16], %f313; - st.shared.b32 [%r28+30720], %f314; - st.shared.b32 [%r28+30976], %f315; - st.shared.b32 [%r28+16], %f316; - st.shared.b32 [%r28+16], %f317; - add.s32 %r3847, %r3847, 128; - setp.lt.s32 %p18, %r3847, %r11; - @%p18 bra LBB0_2; -LBB0_3: + st.shared.u32 [%r143], %r2849; bar.sync 0; - and.b32 %r3579, %r6, 48; - or.b32 %r3580, %r3579, %r8; - shr.u32 %r3581, %r1, 4; - and.b32 %r3582, %r3581, 56; - or.b32 %r3583, %r10, %r3582; - mad.lo.s32 %r3584, %r3580, 72, %r3583; - shl.b32 %r3585, %r3584, 2; - add.s32 %r3587, %r478, %r3585; - st.shared.v2.f32 [%r3587], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2304], {%f129, %f129}; - st.shared.v2.f32 [%r3587+64], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2368], {%f129, %f129}; - st.shared.v2.f32 [%r3587+128], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2432], {%f129, %f129}; - st.shared.v2.f32 [%r3587+192], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2496], {%f129, %f129}; + ld.shared.f32 %f584, [%r144]; + ld.shared.f32 %f585, [%r145+32]; + sub.f32 %f587, %f259, %f584; + sub.f32 %f588, %f259, %f585; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3034, %r3035, %r3036, %r3037 }, [ %r2854 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3258, %r3259, %r3260, %r3261 }, [ %r2859 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3482, %r3483, %r3484, %r3485 }, [ %r2864 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3706, %r3707, %r3708, %r3709 }, [ %r2869 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2870, %r2871, %r2872, %r2873 }, [ %r2874 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2875, %r2876, %r2877, %r2878 }, [ %r2879 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2880, %r2881, %r2882, %r2883 }, [ %r2884 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2885, %r2886, %r2887, %r2888 }, [ %r2889 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2890, %r2891, %r2892, %r2893 }, [ %r2894 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2895, %r2896, %r2897, %r2898 }, [ %r2899 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2900, %r2901, %r2902, %r2903 }, [ %r2904 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2905, %r2906, %r2907, %r2908 }, [ %r2909 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2910, %r2911, %r2912, %r2913 }, [ %r2914 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2915, %r2916, %r2917, %r2918 }, [ %r2919 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2920, %r2921, %r2922, %r2923 }, [ %r2924 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2925, %r2926, %r2927, %r2928 }, [ %r2929 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2930, %r2931, %r2932, %r2933 }, [ %r2934 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2935, %r2936, %r2937, %r2938 }, [ %r2939 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2940, %r2941, %r2942, %r2943 }, [ %r2944 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2945, %r2946, %r2947, %r2948 }, [ %r2949 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2950, %r2951, %r2952, %r2953 }, [ %r2954 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2955, %r2956, %r2957, %r2958 }, [ %r2959 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2960, %r2961, %r2962, %r2963 }, [ %r2964 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2965, %r2966, %r2967, %r2968 }, [ %r2969 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2970, %r2971, %r2972, %r2973 }, [ %r2974 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2975, %r2976, %r2977, %r2978 }, [ %r2979 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2980, %r2981, %r2982, %r2983 }, [ %r2984 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2985, %r2986, %r2987, %r2988 }, [ %r2989 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2990, %r2991, %r2992, %r2993 }, [ %r2994 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2995, %r2996, %r2997, %r2998 }, [ %r2999 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3000, %r3001, %r3002, %r3003 }, [ %r3004 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3005, %r3006, %r3007, %r3008 }, [ %r3009 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3010, %r3011, %r3012, %r3013 }, [ %r3014 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3015, %r3016, %r3017, %r3018 }, [ %r3019 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3020, %r3021, %r3022, %r3023 }, [ %r3024 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3025, %r3026, %r3027, %r3028 }, [ %r3029 + 0 ]; + mov.b32 %r3465, %f587; + mov.b32 %r3467, %f588; + mov.u32 %r3254, %r3465; + mov.u32 %r3255, %r3465; + mov.u32 %r3256, %r3467; + mov.u32 %r3257, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2870, %r2871 }, { %r3254, %r3255, %r3256, %r3257 }; + mov.u32 %r3268, %r3465; + mov.u32 %r3269, %r3465; + mov.u32 %r3270, %r3467; + mov.u32 %r3271, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2872, %r2873 }, { %r3268, %r3269, %r3270, %r3271 }; + mov.u32 %r3282, %r3465; + mov.u32 %r3283, %r3465; + mov.u32 %r3284, %r3467; + mov.u32 %r3285, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2890, %r2891 }, { %r3282, %r3283, %r3284, %r3285 }; + mov.u32 %r3296, %r3465; + mov.u32 %r3297, %r3465; + mov.u32 %r3298, %r3467; + mov.u32 %r3299, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2892, %r2893 }, { %r3296, %r3297, %r3298, %r3299 }; + mov.u32 %r3310, %r3465; + mov.u32 %r3311, %r3465; + mov.u32 %r3312, %r3467; + mov.u32 %r3313, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2910, %r2911 }, { %r3310, %r3311, %r3312, %r3313 }; + mov.u32 %r3324, %r3465; + mov.u32 %r3325, %r3465; + mov.u32 %r3326, %r3467; + mov.u32 %r3327, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2912, %r2913 }, { %r3324, %r3325, %r3326, %r3327 }; + mov.u32 %r3338, %r3465; + mov.u32 %r3339, %r3465; + mov.u32 %r3340, %r3467; + mov.u32 %r3341, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2930, %r2931 }, { %r3338, %r3339, %r3340, %r3341 }; + mov.u32 %r3352, %r3465; + mov.u32 %r3353, %r3465; + mov.u32 %r3354, %r3467; + mov.u32 %r3355, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2932, %r2933 }, { %r3352, %r3353, %r3354, %r3355 }; + mov.u32 %r3366, %r3465; + mov.u32 %r3367, %r3465; + mov.u32 %r3368, %r3467; + mov.u32 %r3369, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2950, %r2951 }, { %r3366, %r3367, %r3368, %r3369 }; + mov.u32 %r3380, %r3465; + mov.u32 %r3381, %r3465; + mov.u32 %r3382, %r3467; + mov.u32 %r3383, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2952, %r2953 }, { %r3380, %r3381, %r3382, %r3383 }; + mov.u32 %r3394, %r3465; + mov.u32 %r3395, %r3465; + mov.u32 %r3396, %r3467; + mov.u32 %r3397, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2970, %r2971 }, { %r3394, %r3395, %r3396, %r3397 }; + mov.u32 %r3408, %r3465; + mov.u32 %r3409, %r3465; + mov.u32 %r3410, %r3467; + mov.u32 %r3411, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2972, %r2973 }, { %r3408, %r3409, %r3410, %r3411 }; + mov.u32 %r3422, %r3465; + mov.u32 %r3423, %r3465; + mov.u32 %r3424, %r3467; + mov.u32 %r3425, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2990, %r2991 }, { %r3422, %r3423, %r3424, %r3425 }; + mov.u32 %r3436, %r3465; + mov.u32 %r3437, %r3465; + mov.u32 %r3438, %r3467; + mov.u32 %r3439, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2992, %r2993 }, { %r3436, %r3437, %r3438, %r3439 }; + mov.u32 %r3450, %r3465; + mov.u32 %r3451, %r3465; + mov.u32 %r3452, %r3467; + mov.u32 %r3453, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3010, %r3011 }, { %r3450, %r3451, %r3452, %r3453 }; + mov.u32 %r3464, %r3465; + mov.u32 %r3466, %r3467; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3012, %r3013 }, { %r3464, %r3465, %r3466, %r3467 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2875, %r2876 }, { %r3254, %r3255, %r3256, %r3257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2877, %r2878 }, { %r3268, %r3269, %r3270, %r3271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2895, %r2896 }, { %r3282, %r3283, %r3284, %r3285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2897, %r2898 }, { %r3296, %r3297, %r3298, %r3299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2915, %r2916 }, { %r3310, %r3311, %r3312, %r3313 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2917, %r2918 }, { %r3324, %r3325, %r3326, %r3327 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2935, %r2936 }, { %r3338, %r3339, %r3340, %r3341 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2937, %r2938 }, { %r3352, %r3353, %r3354, %r3355 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2955, %r2956 }, { %r3366, %r3367, %r3368, %r3369 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2957, %r2958 }, { %r3380, %r3381, %r3382, %r3383 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2975, %r2976 }, { %r3394, %r3395, %r3396, %r3397 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2977, %r2978 }, { %r3408, %r3409, %r3410, %r3411 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2995, %r2996 }, { %r3422, %r3423, %r3424, %r3425 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2997, %r2998 }, { %r3436, %r3437, %r3438, %r3439 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3015, %r3016 }, { %r3450, %r3451, %r3452, %r3453 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3017, %r3018 }, { %r3464, %r3465, %r3466, %r3467 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2880, %r2881 }, { %r3254, %r3255, %r3256, %r3257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2882, %r2883 }, { %r3268, %r3269, %r3270, %r3271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2900, %r2901 }, { %r3282, %r3283, %r3284, %r3285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2902, %r2903 }, { %r3296, %r3297, %r3298, %r3299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2920, %r2921 }, { %r3310, %r3311, %r3312, %r3313 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2922, %r2923 }, { %r3324, %r3325, %r3326, %r3327 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2940, %r2941 }, { %r3338, %r3339, %r3340, %r3341 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2942, %r2943 }, { %r3352, %r3353, %r3354, %r3355 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2960, %r2961 }, { %r3366, %r3367, %r3368, %r3369 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2962, %r2963 }, { %r3380, %r3381, %r3382, %r3383 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2980, %r2981 }, { %r3394, %r3395, %r3396, %r3397 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2982, %r2983 }, { %r3408, %r3409, %r3410, %r3411 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3000, %r3001 }, { %r3422, %r3423, %r3424, %r3425 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3002, %r3003 }, { %r3436, %r3437, %r3438, %r3439 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3020, %r3021 }, { %r3450, %r3451, %r3452, %r3453 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3022, %r3023 }, { %r3464, %r3465, %r3466, %r3467 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2885, %r2886 }, { %r3254, %r3255, %r3256, %r3257 }; + mov.b32 %f589, %r3256; + mov.b32 %f590, %r3257; + mov.b32 %f591, %r3254; + mov.b32 %f592, %r3255; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2887, %r2888 }, { %r3268, %r3269, %r3270, %r3271 }; + mov.b32 %f593, %r3270; + mov.b32 %f594, %r3271; + mov.b32 %f595, %r3268; + mov.b32 %f596, %r3269; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2905, %r2906 }, { %r3282, %r3283, %r3284, %r3285 }; + mov.b32 %f597, %r3284; + mov.b32 %f598, %r3285; + mov.b32 %f599, %r3282; + mov.b32 %f600, %r3283; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2907, %r2908 }, { %r3296, %r3297, %r3298, %r3299 }; + mov.b32 %f601, %r3298; + mov.b32 %f602, %r3299; + mov.b32 %f603, %r3296; + mov.b32 %f604, %r3297; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2925, %r2926 }, { %r3310, %r3311, %r3312, %r3313 }; + mov.b32 %f605, %r3312; + mov.b32 %f606, %r3313; + mov.b32 %f607, %r3310; + mov.b32 %f608, %r3311; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2927, %r2928 }, { %r3324, %r3325, %r3326, %r3327 }; + mov.b32 %f609, %r3326; + mov.b32 %f610, %r3327; + mov.b32 %f611, %r3324; + mov.b32 %f612, %r3325; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2945, %r2946 }, { %r3338, %r3339, %r3340, %r3341 }; + mov.b32 %f613, %r3340; + mov.b32 %f614, %r3341; + mov.b32 %f615, %r3338; + mov.b32 %f616, %r3339; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2947, %r2948 }, { %r3352, %r3353, %r3354, %r3355 }; + mov.b32 %f617, %r3354; + mov.b32 %f618, %r3355; + mov.b32 %f619, %r3352; + mov.b32 %f620, %r3353; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2965, %r2966 }, { %r3366, %r3367, %r3368, %r3369 }; + mov.b32 %f621, %r3368; + mov.b32 %f622, %r3369; + mov.b32 %f623, %r3366; + mov.b32 %f624, %r3367; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2967, %r2968 }, { %r3380, %r3381, %r3382, %r3383 }; + mov.b32 %f625, %r3382; + mov.b32 %f626, %r3383; + mov.b32 %f627, %r3380; + mov.b32 %f628, %r3381; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2985, %r2986 }, { %r3394, %r3395, %r3396, %r3397 }; + mov.b32 %f629, %r3396; + mov.b32 %f630, %r3397; + mov.b32 %f631, %r3394; + mov.b32 %f632, %r3395; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2987, %r2988 }, { %r3408, %r3409, %r3410, %r3411 }; + mov.b32 %f633, %r3410; + mov.b32 %f634, %r3411; + mov.b32 %f635, %r3408; + mov.b32 %f636, %r3409; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3005, %r3006 }, { %r3422, %r3423, %r3424, %r3425 }; + mov.b32 %f637, %r3424; + mov.b32 %f638, %r3425; + mov.b32 %f639, %r3422; + mov.b32 %f640, %r3423; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3007, %r3008 }, { %r3436, %r3437, %r3438, %r3439 }; + mov.b32 %f641, %r3438; + mov.b32 %f642, %r3439; + mov.b32 %f643, %r3436; + mov.b32 %f644, %r3437; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3025, %r3026 }, { %r3450, %r3451, %r3452, %r3453 }; + mov.b32 %f645, %r3452; + mov.b32 %f646, %r3453; + mov.b32 %f647, %r3450; + mov.b32 %f648, %r3451; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3027, %r3028 }, { %r3464, %r3465, %r3466, %r3467 }; + mov.b32 %f649, %r3466; + mov.b32 %f650, %r3467; + mov.b32 %f651, %r3464; + mov.b32 %f652, %r3465; + mul.f32 %f653, %f262, %f592; + mul.f32 %f654, %f260, %f591; + mul.f32 %f655, %f266, %f590; + mul.f32 %f656, %f264, %f589; + mul.f32 %f657, %f270, %f596; + mul.f32 %f658, %f268, %f595; + mul.f32 %f659, %f274, %f594; + mul.f32 %f660, %f272, %f593; + mul.f32 %f661, %f278, %f600; + mul.f32 %f662, %f276, %f599; + mul.f32 %f663, %f282, %f598; + mul.f32 %f664, %f280, %f597; + mul.f32 %f665, %f286, %f604; + mul.f32 %f666, %f284, %f603; + mul.f32 %f667, %f290, %f602; + mul.f32 %f668, %f288, %f601; + mul.f32 %f669, %f294, %f608; + mul.f32 %f670, %f292, %f607; + mul.f32 %f671, %f298, %f606; + mul.f32 %f672, %f296, %f605; + mul.f32 %f673, %f302, %f612; + mul.f32 %f674, %f300, %f611; + mul.f32 %f675, %f306, %f610; + mul.f32 %f676, %f304, %f609; + mul.f32 %f677, %f310, %f616; + mul.f32 %f678, %f308, %f615; + mul.f32 %f679, %f314, %f614; + mul.f32 %f680, %f312, %f613; + mul.f32 %f681, %f318, %f620; + mul.f32 %f682, %f316, %f619; + mul.f32 %f683, %f322, %f618; + mul.f32 %f684, %f320, %f617; + mul.f32 %f685, %f326, %f624; + mul.f32 %f686, %f324, %f623; + mul.f32 %f687, %f330, %f622; + mul.f32 %f688, %f328, %f621; + mul.f32 %f689, %f334, %f628; + mul.f32 %f690, %f332, %f627; + mul.f32 %f691, %f338, %f626; + mul.f32 %f692, %f336, %f625; + mul.f32 %f693, %f342, %f632; + mul.f32 %f694, %f340, %f631; + mul.f32 %f695, %f346, %f630; + mul.f32 %f696, %f344, %f629; + mul.f32 %f697, %f350, %f636; + mul.f32 %f698, %f348, %f635; + mul.f32 %f699, %f354, %f634; + mul.f32 %f700, %f352, %f633; + mul.f32 %f701, %f358, %f640; + mul.f32 %f702, %f356, %f639; + mul.f32 %f703, %f362, %f638; + mul.f32 %f704, %f360, %f637; + mul.f32 %f705, %f366, %f644; + mul.f32 %f706, %f364, %f643; + mul.f32 %f707, %f370, %f642; + mul.f32 %f708, %f368, %f641; + mul.f32 %f709, %f374, %f648; + mul.f32 %f710, %f372, %f647; + mul.f32 %f711, %f378, %f646; + mul.f32 %f712, %f376, %f645; + mul.f32 %f713, %f382, %f652; + mul.f32 %f714, %f380, %f651; + mul.f32 %f715, %f386, %f650; + mul.f32 %f716, %f384, %f649; + mul.f32 %f717, %f654, %f195; + mul.f32 %f718, %f653, %f195; + mul.f32 %f719, %f656, %f195; + mul.f32 %f720, %f655, %f195; + mul.f32 %f721, %f658, %f195; + mul.f32 %f722, %f657, %f195; + mul.f32 %f723, %f660, %f195; + mul.f32 %f724, %f659, %f195; + mul.f32 %f725, %f662, %f195; + mul.f32 %f726, %f661, %f195; + mul.f32 %f727, %f664, %f195; + mul.f32 %f728, %f663, %f195; + mul.f32 %f729, %f666, %f195; + mul.f32 %f730, %f665, %f195; + mul.f32 %f731, %f668, %f195; + mul.f32 %f732, %f667, %f195; + mul.f32 %f733, %f670, %f195; + mul.f32 %f734, %f669, %f195; + mul.f32 %f735, %f672, %f195; + mul.f32 %f736, %f671, %f195; + mul.f32 %f737, %f674, %f195; + mul.f32 %f738, %f673, %f195; + mul.f32 %f739, %f676, %f195; + mul.f32 %f740, %f675, %f195; + mul.f32 %f741, %f678, %f195; + mul.f32 %f742, %f677, %f195; + mul.f32 %f743, %f680, %f195; + mul.f32 %f744, %f679, %f195; + mul.f32 %f745, %f682, %f195; + mul.f32 %f746, %f681, %f195; + mul.f32 %f747, %f684, %f195; + mul.f32 %f748, %f683, %f195; + mul.f32 %f749, %f686, %f195; + mul.f32 %f750, %f685, %f195; + mul.f32 %f751, %f688, %f195; + mul.f32 %f752, %f687, %f195; + mul.f32 %f753, %f690, %f195; + mul.f32 %f754, %f689, %f195; + mul.f32 %f755, %f692, %f195; + mul.f32 %f756, %f691, %f195; + mul.f32 %f757, %f694, %f195; + mul.f32 %f758, %f693, %f195; + mul.f32 %f759, %f696, %f195; + mul.f32 %f760, %f695, %f195; + mul.f32 %f761, %f698, %f195; + mul.f32 %f762, %f697, %f195; + mul.f32 %f763, %f700, %f195; + mul.f32 %f764, %f699, %f195; + mul.f32 %f765, %f702, %f195; + mul.f32 %f766, %f701, %f195; + mul.f32 %f767, %f704, %f195; + mul.f32 %f768, %f703, %f195; + mul.f32 %f769, %f706, %f195; + mul.f32 %f770, %f705, %f195; + mul.f32 %f771, %f708, %f195; + mul.f32 %f772, %f707, %f195; + mul.f32 %f773, %f710, %f195; + mul.f32 %f774, %f709, %f195; + mul.f32 %f775, %f712, %f195; + mul.f32 %f776, %f711, %f195; + mul.f32 %f777, %f714, %f195; + mul.f32 %f778, %f713, %f195; + mul.f32 %f779, %f716, %f195; + mul.f32 %f780, %f715, %f195; + cvt.rn.f16.f32 %h65, %f718; + cvt.rn.f16.f32 %h66, %f717; + cvt.rn.f16.f32 %h67, %f720; + cvt.rn.f16.f32 %h68, %f719; + cvt.rn.f16.f32 %h69, %f722; + cvt.rn.f16.f32 %h70, %f721; + cvt.rn.f16.f32 %h71, %f724; + cvt.rn.f16.f32 %h72, %f723; + cvt.rn.f16.f32 %h73, %f726; + cvt.rn.f16.f32 %h74, %f725; + cvt.rn.f16.f32 %h75, %f728; + cvt.rn.f16.f32 %h76, %f727; + cvt.rn.f16.f32 %h77, %f730; + cvt.rn.f16.f32 %h78, %f729; + cvt.rn.f16.f32 %h79, %f732; + cvt.rn.f16.f32 %h80, %f731; + cvt.rn.f16.f32 %h81, %f734; + cvt.rn.f16.f32 %h82, %f733; + cvt.rn.f16.f32 %h83, %f736; + cvt.rn.f16.f32 %h84, %f735; + cvt.rn.f16.f32 %h85, %f738; + cvt.rn.f16.f32 %h86, %f737; + cvt.rn.f16.f32 %h87, %f740; + cvt.rn.f16.f32 %h88, %f739; + cvt.rn.f16.f32 %h89, %f742; + cvt.rn.f16.f32 %h90, %f741; + cvt.rn.f16.f32 %h91, %f744; + cvt.rn.f16.f32 %h92, %f743; + cvt.rn.f16.f32 %h93, %f746; + cvt.rn.f16.f32 %h94, %f745; + cvt.rn.f16.f32 %h95, %f748; + cvt.rn.f16.f32 %h96, %f747; + cvt.rn.f16.f32 %h97, %f750; + cvt.rn.f16.f32 %h98, %f749; + cvt.rn.f16.f32 %h99, %f752; + cvt.rn.f16.f32 %h100, %f751; + cvt.rn.f16.f32 %h101, %f754; + cvt.rn.f16.f32 %h102, %f753; + cvt.rn.f16.f32 %h103, %f756; + cvt.rn.f16.f32 %h104, %f755; + cvt.rn.f16.f32 %h105, %f758; + cvt.rn.f16.f32 %h106, %f757; + cvt.rn.f16.f32 %h107, %f760; + cvt.rn.f16.f32 %h108, %f759; + cvt.rn.f16.f32 %h109, %f762; + cvt.rn.f16.f32 %h110, %f761; + cvt.rn.f16.f32 %h111, %f764; + cvt.rn.f16.f32 %h112, %f763; + cvt.rn.f16.f32 %h113, %f766; + cvt.rn.f16.f32 %h114, %f765; + cvt.rn.f16.f32 %h115, %f768; + cvt.rn.f16.f32 %h116, %f767; + cvt.rn.f16.f32 %h117, %f770; + cvt.rn.f16.f32 %h118, %f769; + cvt.rn.f16.f32 %h119, %f772; + cvt.rn.f16.f32 %h120, %f771; + cvt.rn.f16.f32 %h121, %f774; + cvt.rn.f16.f32 %h122, %f773; + cvt.rn.f16.f32 %h123, %f776; + cvt.rn.f16.f32 %h124, %f775; + cvt.rn.f16.f32 %h125, %f778; + cvt.rn.f16.f32 %h126, %f777; + cvt.rn.f16.f32 %h127, %f780; + cvt.rn.f16.f32 %h128, %f779; bar.sync 0; - mad.lo.s32 %r3588, %r3, 72, %r5; - shl.b32 %r3589, %r3588, 2; - add.s32 %r3590, %r478, %r3589; - ld.shared.v4.f32 {%f419, %f420, %f421, %f422}, [%r3590]; - ld.shared.v4.f32 {%f423, %f424, %f425, %f426}, [%r3590+16]; - ld.shared.v4.f32 {%f427, %f428, %f429, %f430}, [%r3590+9216]; - ld.shared.v4.f32 {%f431, %f432, %f433, %f434}, [%r3590+9232]; + st.shared.v2.b16 [%r182], {%h66, %h65}; + st.shared.v2.b16 [%r183], {%h68, %h67}; + st.shared.v2.b16 [%r184], {%h70, %h69}; + st.shared.v2.b16 [%r185], {%h72, %h71}; + st.shared.v2.b16 [%r186], {%h74, %h73}; + st.shared.v2.b16 [%r187], {%h76, %h75}; + st.shared.v2.b16 [%r188], {%h78, %h77}; + st.shared.v2.b16 [%r189], {%h80, %h79}; + st.shared.v2.b16 [%r190], {%h82, %h81}; + st.shared.v2.b16 [%r191], {%h84, %h83}; + st.shared.v2.b16 [%r192], {%h86, %h85}; + st.shared.v2.b16 [%r193], {%h88, %h87}; + st.shared.v2.b16 [%r194], {%h90, %h89}; + st.shared.v2.b16 [%r195], {%h92, %h91}; + st.shared.v2.b16 [%r196], {%h94, %h93}; + st.shared.v2.b16 [%r197], {%h96, %h95}; + st.shared.v2.b16 [%r182+128], {%h98, %h97}; + st.shared.v2.b16 [%r183+128], {%h100, %h99}; + st.shared.v2.b16 [%r200], {%h102, %h101}; + st.shared.v2.b16 [%r201], {%h104, %h103}; + st.shared.v2.b16 [%r202], {%h106, %h105}; + st.shared.v2.b16 [%r203], {%h108, %h107}; + st.shared.v2.b16 [%r204], {%h110, %h109}; + st.shared.v2.b16 [%r205], {%h112, %h111}; + st.shared.v2.b16 [%r206], {%h114, %h113}; + st.shared.v2.b16 [%r207], {%h116, %h115}; + st.shared.v2.b16 [%r208], {%h118, %h117}; + st.shared.v2.b16 [%r209], {%h120, %h119}; + st.shared.v2.b16 [%r210], {%h122, %h121}; + st.shared.v2.b16 [%r211], {%h124, %h123}; + st.shared.v2.b16 [%r212], {%h126, %h125}; + st.shared.v2.b16 [%r213], {%h128, %h127}; bar.sync 0; - st.shared.v2.f32 [%r3587], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2304], {%f129, %f129}; - st.shared.v2.f32 [%r3587+64], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2368], {%f129, %f129}; - st.shared.v2.f32 [%r3587+128], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2432], {%f129, %f129}; - st.shared.v2.f32 [%r3587+192], {%f129, %f129}; - st.shared.v2.f32 [%r3587+2496], {%f129, %f129}; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4090, %r4091, %r4092, %r4093 }, [ %r3930 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4202, %r4203, %r4204, %r4205 }, [ %r3935 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4314, %r4315, %r4316, %r4317 }, [ %r3940 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4426, %r4427, %r4428, %r4429 }, [ %r3945 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4538, %r4539, %r4540, %r4541 }, [ %r3950 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4650, %r4651, %r4652, %r4653 }, [ %r3955 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4762, %r4763, %r4764, %r4765 }, [ %r3960 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4874, %r4875, %r4876, %r4877 }, [ %r3965 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4146, %r4147, %r4148, %r4149 }, [ %r3970 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4258, %r4259, %r4260, %r4261 }, [ %r3975 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4370, %r4371, %r4372, %r4373 }, [ %r3980 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4482, %r4483, %r4484, %r4485 }, [ %r3985 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4594, %r4595, %r4596, %r4597 }, [ %r3990 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4706, %r4707, %r4708, %r4709 }, [ %r3995 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4818, %r4819, %r4820, %r4821 }, [ %r4000 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4930, %r4931, %r4932, %r4933 }, [ %r4005 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4094, %r4095, %r4108, %r4109 }, [ %r4010 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4206, %r4207, %r4220, %r4221 }, [ %r4015 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4318, %r4319, %r4332, %r4333 }, [ %r4020 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4430, %r4431, %r4444, %r4445 }, [ %r4025 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4542, %r4543, %r4556, %r4557 }, [ %r4030 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4654, %r4655, %r4668, %r4669 }, [ %r4035 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4766, %r4767, %r4780, %r4781 }, [ %r4040 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4878, %r4879, %r4892, %r4893 }, [ %r4045 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4122, %r4123, %r4136, %r4137 }, [ %r4050 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4234, %r4235, %r4248, %r4249 }, [ %r4055 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4346, %r4347, %r4360, %r4361 }, [ %r4060 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4458, %r4459, %r4472, %r4473 }, [ %r4065 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4570, %r4571, %r4584, %r4585 }, [ %r4070 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4682, %r4683, %r4696, %r4697 }, [ %r4075 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4794, %r4795, %r4808, %r4809 }, [ %r4080 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4906, %r4907, %r4920, %r4921 }, [ %r4085 + 0 ]; + mov.b32 %r4198, %f941; + mov.b32 %r4199, %f942; + mov.b32 %r4200, %f943; + mov.b32 %r4201, %f944; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4094, %r4095 }, { %r4198, %r4199, %r4200, %r4201 }; + mov.b32 %r4212, %f945; + mov.b32 %r4213, %f946; + mov.b32 %r4214, %f947; + mov.b32 %r4215, %f948; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4108, %r4109 }, { %r4212, %r4213, %r4214, %r4215 }; + mov.b32 %r4226, %f949; + mov.b32 %r4227, %f950; + mov.b32 %r4228, %f951; + mov.b32 %r4229, %f952; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4122, %r4123 }, { %r4226, %r4227, %r4228, %r4229 }; + mov.b32 %r4240, %f953; + mov.b32 %r4241, %f954; + mov.b32 %r4242, %f955; + mov.b32 %r4243, %f956; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4136, %r4137 }, { %r4240, %r4241, %r4242, %r4243 }; + mov.b32 %r4254, %f957; + mov.b32 %r4255, %f958; + mov.b32 %r4256, %f959; + mov.b32 %r4257, %f960; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4094, %r4095 }, { %r4254, %r4255, %r4256, %r4257 }; + mov.b32 %r4268, %f961; + mov.b32 %r4269, %f962; + mov.b32 %r4270, %f963; + mov.b32 %r4271, %f964; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4108, %r4109 }, { %r4268, %r4269, %r4270, %r4271 }; + mov.b32 %r4282, %f965; + mov.b32 %r4283, %f966; + mov.b32 %r4284, %f967; + mov.b32 %r4285, %f968; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4122, %r4123 }, { %r4282, %r4283, %r4284, %r4285 }; + mov.b32 %r4296, %f969; + mov.b32 %r4297, %f970; + mov.b32 %r4298, %f971; + mov.b32 %r4299, %f972; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4136, %r4137 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4206, %r4207 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4220, %r4221 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4234, %r4235 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4248, %r4249 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4206, %r4207 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4220, %r4221 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4234, %r4235 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4248, %r4249 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4318, %r4319 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4332, %r4333 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4346, %r4347 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4360, %r4361 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4318, %r4319 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4332, %r4333 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4346, %r4347 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4360, %r4361 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4430, %r4431 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4444, %r4445 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4458, %r4459 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4472, %r4473 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4430, %r4431 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4444, %r4445 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4458, %r4459 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4472, %r4473 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4542, %r4543 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4556, %r4557 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4570, %r4571 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4584, %r4585 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4542, %r4543 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4556, %r4557 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4570, %r4571 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4584, %r4585 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4654, %r4655 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4668, %r4669 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4682, %r4683 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4696, %r4697 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4654, %r4655 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4668, %r4669 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4682, %r4683 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4696, %r4697 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4766, %r4767 }, { %r4198, %r4199, %r4200, %r4201 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4780, %r4781 }, { %r4212, %r4213, %r4214, %r4215 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4794, %r4795 }, { %r4226, %r4227, %r4228, %r4229 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4808, %r4809 }, { %r4240, %r4241, %r4242, %r4243 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4766, %r4767 }, { %r4254, %r4255, %r4256, %r4257 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4780, %r4781 }, { %r4268, %r4269, %r4270, %r4271 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4794, %r4795 }, { %r4282, %r4283, %r4284, %r4285 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4808, %r4809 }, { %r4296, %r4297, %r4298, %r4299 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4878, %r4879 }, { %r4198, %r4199, %r4200, %r4201 }; + mov.b32 %f944, %r4201; + mov.b32 %f943, %r4200; + mov.b32 %f942, %r4199; + mov.b32 %f941, %r4198; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4892, %r4893 }, { %r4212, %r4213, %r4214, %r4215 }; + mov.b32 %f948, %r4215; + mov.b32 %f947, %r4214; + mov.b32 %f946, %r4213; + mov.b32 %f945, %r4212; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4906, %r4907 }, { %r4226, %r4227, %r4228, %r4229 }; + mov.b32 %f952, %r4229; + mov.b32 %f951, %r4228; + mov.b32 %f950, %r4227; + mov.b32 %f949, %r4226; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4920, %r4921 }, { %r4240, %r4241, %r4242, %r4243 }; + mov.b32 %f956, %r4243; + mov.b32 %f955, %r4242; + mov.b32 %f954, %r4241; + mov.b32 %f953, %r4240; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4878, %r4879 }, { %r4254, %r4255, %r4256, %r4257 }; + mov.b32 %f960, %r4257; + mov.b32 %f959, %r4256; + mov.b32 %f958, %r4255; + mov.b32 %f957, %r4254; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4892, %r4893 }, { %r4268, %r4269, %r4270, %r4271 }; + mov.b32 %f964, %r4271; + mov.b32 %f963, %r4270; + mov.b32 %f962, %r4269; + mov.b32 %f961, %r4268; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4906, %r4907 }, { %r4282, %r4283, %r4284, %r4285 }; + mov.b32 %f968, %r4285; + mov.b32 %f967, %r4284; + mov.b32 %f966, %r4283; + mov.b32 %f965, %r4282; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4920, %r4921 }, { %r4296, %r4297, %r4298, %r4299 }; + mov.b32 %f972, %r4299; + mov.b32 %f971, %r4298; + mov.b32 %f970, %r4297; + mov.b32 %f969, %r4296; + @%p102 ld.global.v4.b32 { %r4982, %r4983, %r4984, %r4985 }, [ %rd96 + 0 ]; + @%p102 ld.global.v4.b32 { %r4986, %r4987, %r4988, %r4989 }, [ %rd97 + 0 ]; + @%p102 ld.global.v4.b32 { %r4990, %r4991, %r4992, %r4993 }, [ %rd98 + 0 ]; + @%p102 ld.global.v4.b32 { %r4994, %r4995, %r4996, %r4997 }, [ %rd99 + 0 ]; + @%p102 ld.global.v4.b32 { %r4998, %r4999, %r5000, %r5001 }, [ %rd100 + 0 ]; + @%p102 ld.global.v4.b32 { %r5002, %r5003, %r5004, %r5005 }, [ %rd101 + 0 ]; + @%p102 ld.global.v4.b32 { %r5006, %r5007, %r5008, %r5009 }, [ %rd102 + 0 ]; + @%p102 ld.global.v4.b32 { %r5010, %r5011, %r5012, %r5013 }, [ %rd103 + 0 ]; bar.sync 0; - ld.shared.v4.f32 {%f435, %f436, %f437, %f438}, [%r3590]; - ld.shared.v4.f32 {%f439, %f440, %f441, %f442}, [%r3590+16]; - ld.shared.v4.f32 {%f443, %f444, %f445, %f446}, [%r3590+9216]; - ld.shared.v4.f32 {%f447, %f448, %f449, %f450}, [%r3590+9232]; + st.shared.v4.u32 [%r246], {%r4982, %r4983, %r4984, %r4985}; + st.shared.v4.u32 [%r246+4352], {%r4986, %r4987, %r4988, %r4989}; + st.shared.v4.u32 [%r246+8704], {%r4990, %r4991, %r4992, %r4993}; + st.shared.v4.u32 [%r246+13056], {%r4994, %r4995, %r4996, %r4997}; bar.sync 0; - st.shared.v2.f32 [%r3587], {%f515, %f516}; - st.shared.v2.f32 [%r3587+2304], {%f517, %f518}; - st.shared.v2.f32 [%r3587+64], {%f519, %f520}; - st.shared.v2.f32 [%r3587+2368], {%f521, %f522}; - st.shared.v2.f32 [%r3587+128], {%f523, %f524}; - st.shared.v2.f32 [%r3587+2432], {%f525, %f526}; - st.shared.v2.f32 [%r3587+192], {%f527, %f528}; - st.shared.v2.f32 [%r3587+2496], {%f529, %f530}; + ld.shared.v2.f32 {%f781, %f782}, [%r247]; + ld.shared.v2.f32 {%f783, %f784}, [%r248]; + ld.shared.v2.f32 {%f785, %f786}, [%r247+64]; + ld.shared.v2.f32 {%f787, %f788}, [%r248+64]; + ld.shared.v2.f32 {%f789, %f790}, [%r247+128]; + ld.shared.v2.f32 {%f791, %f792}, [%r248+128]; + ld.shared.v2.f32 {%f793, %f794}, [%r247+192]; + ld.shared.v2.f32 {%f795, %f796}, [%r248+192]; bar.sync 0; - ld.shared.v4.f32 {%f451, %f452, %f453, %f454}, [%r3590]; - ld.shared.v4.f32 {%f455, %f456, %f457, %f458}, [%r3590+16]; - ld.shared.v4.f32 {%f459, %f460, %f461, %f462}, [%r3590+9216]; - ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r3590+9232]; + st.shared.v4.u32 [%r246], {%r4998, %r4999, %r5000, %r5001}; + st.shared.v4.u32 [%r246+4352], {%r5002, %r5003, %r5004, %r5005}; + st.shared.v4.u32 [%r246+8704], {%r5006, %r5007, %r5008, %r5009}; + st.shared.v4.u32 [%r246+13056], {%r5010, %r5011, %r5012, %r5013}; bar.sync 0; - st.shared.v2.f32 [%r3587], {%f531, %f532}; - st.shared.v2.f32 [%r3587+2304], {%f533, %f534}; - st.shared.v2.f32 [%r3587+64], {%f535, %f536}; - st.shared.v2.f32 [%r3587+2368], {%f537, %f538}; - st.shared.v2.f32 [%r3587+128], {%f539, %f540}; - st.shared.v2.f32 [%r3587+2432], {%f541, %f542}; - st.shared.v2.f32 [%r3587+192], {%f543, %f544}; - st.shared.v2.f32 [%r3587+2496], {%f545, %f546}; + ld.shared.v2.f32 {%f797, %f798}, [%r247]; + ld.shared.v2.f32 {%f799, %f800}, [%r248]; + ld.shared.v2.f32 {%f801, %f802}, [%r247+64]; + ld.shared.v2.f32 {%f803, %f804}, [%r248+64]; + ld.shared.v2.f32 {%f805, %f806}, [%r247+128]; + ld.shared.v2.f32 {%f807, %f808}, [%r248+128]; + ld.shared.v2.f32 {%f809, %f810}, [%r247+192]; + ld.shared.v2.f32 {%f811, %f812}, [%r248+192]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5178, %r5179, %r5180, %r5181 }, [ %r5018 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5290, %r5291, %r5292, %r5293 }, [ %r5023 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5402, %r5403, %r5404, %r5405 }, [ %r5028 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5514, %r5515, %r5516, %r5517 }, [ %r5033 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5626, %r5627, %r5628, %r5629 }, [ %r5038 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5738, %r5739, %r5740, %r5741 }, [ %r5043 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5850, %r5851, %r5852, %r5853 }, [ %r5048 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5962, %r5963, %r5964, %r5965 }, [ %r5053 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5234, %r5235, %r5236, %r5237 }, [ %r5058 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5346, %r5347, %r5348, %r5349 }, [ %r5063 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5458, %r5459, %r5460, %r5461 }, [ %r5068 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5570, %r5571, %r5572, %r5573 }, [ %r5073 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5682, %r5683, %r5684, %r5685 }, [ %r5078 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5794, %r5795, %r5796, %r5797 }, [ %r5083 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5906, %r5907, %r5908, %r5909 }, [ %r5088 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r6018, %r6019, %r6020, %r6021 }, [ %r5093 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5182, %r5183, %r5196, %r5197 }, [ %r5098 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5294, %r5295, %r5308, %r5309 }, [ %r5103 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5406, %r5407, %r5420, %r5421 }, [ %r5108 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5518, %r5519, %r5532, %r5533 }, [ %r5113 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5630, %r5631, %r5644, %r5645 }, [ %r5118 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5742, %r5743, %r5756, %r5757 }, [ %r5123 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5854, %r5855, %r5868, %r5869 }, [ %r5128 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5966, %r5967, %r5980, %r5981 }, [ %r5133 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5210, %r5211, %r5224, %r5225 }, [ %r5138 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5322, %r5323, %r5336, %r5337 }, [ %r5143 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5434, %r5435, %r5448, %r5449 }, [ %r5148 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5546, %r5547, %r5560, %r5561 }, [ %r5153 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5658, %r5659, %r5672, %r5673 }, [ %r5158 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5770, %r5771, %r5784, %r5785 }, [ %r5163 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5882, %r5883, %r5896, %r5897 }, [ %r5168 + 0 ]; + ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5994, %r5995, %r6008, %r6009 }, [ %r5173 + 0 ]; + mov.b32 %r5286, %f781; + mov.b32 %r5287, %f782; + mov.b32 %r5288, %f783; + mov.b32 %r5289, %f784; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5182, %r5183 }, { %r5286, %r5287, %r5288, %r5289 }; + mov.b32 %r5300, %f785; + mov.b32 %r5301, %f786; + mov.b32 %r5302, %f787; + mov.b32 %r5303, %f788; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5196, %r5197 }, { %r5300, %r5301, %r5302, %r5303 }; + mov.b32 %r5314, %f789; + mov.b32 %r5315, %f790; + mov.b32 %r5316, %f791; + mov.b32 %r5317, %f792; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5210, %r5211 }, { %r5314, %r5315, %r5316, %r5317 }; + mov.b32 %r5328, %f793; + mov.b32 %r5329, %f794; + mov.b32 %r5330, %f795; + mov.b32 %r5331, %f796; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5224, %r5225 }, { %r5328, %r5329, %r5330, %r5331 }; + mov.b32 %r5342, %f797; + mov.b32 %r5343, %f798; + mov.b32 %r5344, %f799; + mov.b32 %r5345, %f800; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5182, %r5183 }, { %r5342, %r5343, %r5344, %r5345 }; + mov.b32 %r5356, %f801; + mov.b32 %r5357, %f802; + mov.b32 %r5358, %f803; + mov.b32 %r5359, %f804; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5196, %r5197 }, { %r5356, %r5357, %r5358, %r5359 }; + mov.b32 %r5370, %f805; + mov.b32 %r5371, %f806; + mov.b32 %r5372, %f807; + mov.b32 %r5373, %f808; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5210, %r5211 }, { %r5370, %r5371, %r5372, %r5373 }; + mov.b32 %r5384, %f809; + mov.b32 %r5385, %f810; + mov.b32 %r5386, %f811; + mov.b32 %r5387, %f812; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5224, %r5225 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5294, %r5295 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5308, %r5309 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5322, %r5323 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5336, %r5337 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5294, %r5295 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5308, %r5309 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5322, %r5323 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5336, %r5337 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5406, %r5407 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5420, %r5421 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5434, %r5435 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5448, %r5449 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5406, %r5407 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5420, %r5421 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5434, %r5435 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5448, %r5449 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5518, %r5519 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5532, %r5533 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5546, %r5547 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5560, %r5561 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5518, %r5519 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5532, %r5533 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5546, %r5547 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5560, %r5561 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5630, %r5631 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5644, %r5645 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5658, %r5659 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5672, %r5673 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5630, %r5631 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5644, %r5645 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5658, %r5659 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5672, %r5673 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5742, %r5743 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5756, %r5757 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5770, %r5771 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5784, %r5785 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5742, %r5743 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5756, %r5757 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5770, %r5771 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5784, %r5785 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5854, %r5855 }, { %r5286, %r5287, %r5288, %r5289 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5868, %r5869 }, { %r5300, %r5301, %r5302, %r5303 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5882, %r5883 }, { %r5314, %r5315, %r5316, %r5317 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5896, %r5897 }, { %r5328, %r5329, %r5330, %r5331 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5854, %r5855 }, { %r5342, %r5343, %r5344, %r5345 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5868, %r5869 }, { %r5356, %r5357, %r5358, %r5359 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5882, %r5883 }, { %r5370, %r5371, %r5372, %r5373 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5896, %r5897 }, { %r5384, %r5385, %r5386, %r5387 }; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5966, %r5967 }, { %r5286, %r5287, %r5288, %r5289 }; + mov.b32 %f813, %r5289; + mov.b32 %f814, %r5288; + mov.b32 %f815, %r5287; + mov.b32 %f816, %r5286; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5980, %r5981 }, { %r5300, %r5301, %r5302, %r5303 }; + mov.b32 %f817, %r5303; + mov.b32 %f818, %r5302; + mov.b32 %f819, %r5301; + mov.b32 %f820, %r5300; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5994, %r5995 }, { %r5314, %r5315, %r5316, %r5317 }; + mov.b32 %f821, %r5317; + mov.b32 %f822, %r5316; + mov.b32 %f823, %r5315; + mov.b32 %f824, %r5314; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5962, %r5963, %r5964, %r5965 }, { %r6008, %r6009 }, { %r5328, %r5329, %r5330, %r5331 }; + mov.b32 %f825, %r5331; + mov.b32 %f826, %r5330; + mov.b32 %f827, %r5329; + mov.b32 %f828, %r5328; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5966, %r5967 }, { %r5342, %r5343, %r5344, %r5345 }; + mov.b32 %f829, %r5345; + mov.b32 %f830, %r5344; + mov.b32 %f831, %r5343; + mov.b32 %f832, %r5342; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5980, %r5981 }, { %r5356, %r5357, %r5358, %r5359 }; + mov.b32 %f833, %r5359; + mov.b32 %f834, %r5358; + mov.b32 %f835, %r5357; + mov.b32 %f836, %r5356; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5994, %r5995 }, { %r5370, %r5371, %r5372, %r5373 }; + mov.b32 %f837, %r5373; + mov.b32 %f838, %r5372; + mov.b32 %f839, %r5371; + mov.b32 %f840, %r5370; + mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r6018, %r6019, %r6020, %r6021 }, { %r6008, %r6009 }, { %r5384, %r5385, %r5386, %r5387 }; + mov.b32 %f841, %r5387; + mov.b32 %f842, %r5386; + mov.b32 %f843, %r5385; + mov.b32 %f844, %r5384; bar.sync 0; - ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r3590]; - ld.shared.v4.f32 {%f471, %f472, %f473, %f474}, [%r3590+16]; - ld.shared.v4.f32 {%f475, %f476, %f477, %f478}, [%r3590+9216]; - ld.shared.v4.f32 {%f479, %f480, %f481, %f482}, [%r3590+9232]; - shl.b64 %rd62, %rd7, 1; - add.s64 %rd54, %rd2, %rd62; - shl.b64 %rd63, %rd8, 1; - add.s64 %rd55, %rd2, %rd63; - shl.b64 %rd64, %rd9, 1; - add.s64 %rd56, %rd2, %rd64; - shl.b64 %rd65, %rd10, 1; - add.s64 %rd57, %rd2, %rd65; - cvt.rn.f16.f32 %h193, %f452; - cvt.rn.f16.f32 %h194, %f451; - mov.b32 %hh289, {%h194, %h193}; - cvt.rn.f16.f32 %h195, %f454; - cvt.rn.f16.f32 %h196, %f453; - mov.b32 %hh290, {%h196, %h195}; - cvt.rn.f16.f32 %h197, %f456; - cvt.rn.f16.f32 %h198, %f455; - mov.b32 %hh291, {%h198, %h197}; - cvt.rn.f16.f32 %h199, %f458; - cvt.rn.f16.f32 %h200, %f457; - mov.b32 %hh292, {%h200, %h199}; - cvt.rn.f16.f32 %h201, %f460; - cvt.rn.f16.f32 %h202, %f459; - mov.b32 %hh293, {%h202, %h201}; - cvt.rn.f16.f32 %h203, %f462; - cvt.rn.f16.f32 %h204, %f461; - mov.b32 %hh294, {%h204, %h203}; - cvt.rn.f16.f32 %h205, %f464; - cvt.rn.f16.f32 %h206, %f463; - mov.b32 %hh295, {%h206, %h205}; - cvt.rn.f16.f32 %h207, %f466; - cvt.rn.f16.f32 %h208, %f465; - mov.b32 %hh296, {%h208, %h207}; - cvt.rn.f16.f32 %h209, %f468; - cvt.rn.f16.f32 %h210, %f467; - mov.b32 %hh297, {%h210, %h209}; - cvt.rn.f16.f32 %h211, %f470; - cvt.rn.f16.f32 %h212, %f469; - mov.b32 %hh298, {%h212, %h211}; - cvt.rn.f16.f32 %h213, %f472; - cvt.rn.f16.f32 %h214, %f471; - mov.b32 %hh299, {%h214, %h213}; - cvt.rn.f16.f32 %h215, %f474; - cvt.rn.f16.f32 %h216, %f473; - mov.b32 %hh300, {%h216, %h215}; - cvt.rn.f16.f32 %h217, %f476; - cvt.rn.f16.f32 %h218, %f475; - mov.b32 %hh301, {%h218, %h217}; - cvt.rn.f16.f32 %h219, %f478; - cvt.rn.f16.f32 %h220, %f477; - mov.b32 %hh302, {%h220, %h219}; - cvt.rn.f16.f32 %h221, %f480; - cvt.rn.f16.f32 %h222, %f479; - mov.b32 %hh303, {%h222, %h221}; - cvt.rn.f16.f32 %h223, %f482; - cvt.rn.f16.f32 %h224, %f481; - mov.b32 %hh304, {%h224, %h223}; - mov.b32 %r3547, %hh289; - mov.b32 %r3548, %hh290; - mov.b32 %r3549, %hh291; - mov.b32 %r3550, %hh292; - @%p19 st.global.v4.b32 [ %rd54 + 0 ], { %r3547, %r3548, %r3549, %r3550 }; - mov.b32 %r3551, %hh293; - mov.b32 %r3552, %hh294; - mov.b32 %r3553, %hh295; - mov.b32 %r3554, %hh296; - @%p19 st.global.v4.b32 [ %rd55 + 0 ], { %r3551, %r3552, %r3553, %r3554 }; - mov.b32 %r3555, %hh297; - mov.b32 %r3556, %hh298; - mov.b32 %r3557, %hh299; - mov.b32 %r3558, %hh300; - @%p19 st.global.v4.b32 [ %rd56 + 0 ], { %r3555, %r3556, %r3557, %r3558 }; - mov.b32 %r3559, %hh301; - mov.b32 %r3560, %hh302; - mov.b32 %r3561, %hh303; - mov.b32 %r3562, %hh304; - @%p19 st.global.v4.b32 [ %rd57 + 0 ], { %r3559, %r3560, %r3561, %r3562 }; - shl.b64 %rd66, %rd3, 1; - add.s64 %rd58, %rd1, %rd66; - shl.b64 %rd67, %rd4, 1; - add.s64 %rd59, %rd1, %rd67; - shl.b64 %rd68, %rd5, 1; - add.s64 %rd60, %rd1, %rd68; - shl.b64 %rd69, %rd6, 1; - add.s64 %rd61, %rd1, %rd69; - cvt.rn.f16.f32 %h225, %f420; - cvt.rn.f16.f32 %h226, %f419; - mov.b32 %hh305, {%h226, %h225}; - cvt.rn.f16.f32 %h227, %f422; - cvt.rn.f16.f32 %h228, %f421; - mov.b32 %hh306, {%h228, %h227}; - cvt.rn.f16.f32 %h229, %f424; - cvt.rn.f16.f32 %h230, %f423; - mov.b32 %hh307, {%h230, %h229}; - cvt.rn.f16.f32 %h231, %f426; - cvt.rn.f16.f32 %h232, %f425; - mov.b32 %hh308, {%h232, %h231}; - cvt.rn.f16.f32 %h233, %f428; - cvt.rn.f16.f32 %h234, %f427; - mov.b32 %hh309, {%h234, %h233}; - cvt.rn.f16.f32 %h235, %f430; - cvt.rn.f16.f32 %h236, %f429; - mov.b32 %hh310, {%h236, %h235}; - cvt.rn.f16.f32 %h237, %f432; - cvt.rn.f16.f32 %h238, %f431; - mov.b32 %hh311, {%h238, %h237}; - cvt.rn.f16.f32 %h239, %f434; - cvt.rn.f16.f32 %h240, %f433; - mov.b32 %hh312, {%h240, %h239}; - cvt.rn.f16.f32 %h241, %f436; - cvt.rn.f16.f32 %h242, %f435; - mov.b32 %hh313, {%h242, %h241}; - cvt.rn.f16.f32 %h243, %f438; - cvt.rn.f16.f32 %h244, %f437; - mov.b32 %hh314, {%h244, %h243}; - cvt.rn.f16.f32 %h245, %f440; - cvt.rn.f16.f32 %h246, %f439; - mov.b32 %hh315, {%h246, %h245}; - cvt.rn.f16.f32 %h247, %f442; - cvt.rn.f16.f32 %h248, %f441; - mov.b32 %hh316, {%h248, %h247}; - cvt.rn.f16.f32 %h249, %f444; - cvt.rn.f16.f32 %h250, %f443; - mov.b32 %hh317, {%h250, %h249}; - cvt.rn.f16.f32 %h251, %f446; - cvt.rn.f16.f32 %h252, %f445; - mov.b32 %hh318, {%h252, %h251}; - cvt.rn.f16.f32 %h253, %f448; - cvt.rn.f16.f32 %h254, %f447; - mov.b32 %hh319, {%h254, %h253}; - cvt.rn.f16.f32 %h255, %f450; - cvt.rn.f16.f32 %h256, %f449; - mov.b32 %hh320, {%h256, %h255}; - mov.b32 %r3563, %hh305; - mov.b32 %r3564, %hh306; - mov.b32 %r3565, %hh307; - mov.b32 %r3566, %hh308; - @%p19 st.global.v4.b32 [ %rd58 + 0 ], { %r3563, %r3564, %r3565, %r3566 }; - mov.b32 %r3567, %hh309; - mov.b32 %r3568, %hh310; - mov.b32 %r3569, %hh311; - mov.b32 %r3570, %hh312; - @%p19 st.global.v4.b32 [ %rd59 + 0 ], { %r3567, %r3568, %r3569, %r3570 }; - mov.b32 %r3571, %hh313; - mov.b32 %r3572, %hh314; - mov.b32 %r3573, %hh315; - mov.b32 %r3574, %hh316; - @%p19 st.global.v4.b32 [ %rd60 + 0 ], { %r3571, %r3572, %r3573, %r3574 }; - mov.b32 %r3575, %hh317; - mov.b32 %r3576, %hh318; - mov.b32 %r3577, %hh319; - mov.b32 %r3578, %hh320; - @%p19 st.global.v4.b32 [ %rd61 + 0 ], { %r3575, %r3576, %r3577, %r3578 }; + st.shared.v2.f32 [%r247], {%f816, %f815}; + st.shared.v2.f32 [%r248], {%f814, %f813}; + st.shared.v2.f32 [%r247+64], {%f820, %f819}; + st.shared.v2.f32 [%r248+64], {%f818, %f817}; + st.shared.v2.f32 [%r247+128], {%f824, %f823}; + st.shared.v2.f32 [%r248+128], {%f822, %f821}; + st.shared.v2.f32 [%r247+192], {%f828, %f827}; + st.shared.v2.f32 [%r248+192], {%f826, %f825}; + bar.sync 0; + ld.shared.v4.u32 {%r6070, %r6071, %r6072, %r6073}, [%r246]; + ld.shared.v4.u32 {%r6074, %r6075, %r6076, %r6077}, [%r246+4352]; + ld.shared.v4.u32 {%r6078, %r6079, %r6080, %r6081}, [%r246+8704]; + ld.shared.v4.u32 {%r6082, %r6083, %r6084, %r6085}, [%r246+13056]; + bar.sync 0; + st.shared.v2.f32 [%r247], {%f832, %f831}; + st.shared.v2.f32 [%r248], {%f830, %f829}; + st.shared.v2.f32 [%r247+64], {%f836, %f835}; + st.shared.v2.f32 [%r248+64], {%f834, %f833}; + st.shared.v2.f32 [%r247+128], {%f840, %f839}; + st.shared.v2.f32 [%r248+128], {%f838, %f837}; + st.shared.v2.f32 [%r247+192], {%f844, %f843}; + st.shared.v2.f32 [%r248+192], {%f842, %f841}; + bar.sync 0; + ld.shared.v4.u32 {%r6086, %r6087, %r6088, %r6089}, [%r246]; + ld.shared.v4.u32 {%r6090, %r6091, %r6092, %r6093}, [%r246+4352]; + ld.shared.v4.u32 {%r6094, %r6095, %r6096, %r6097}, [%r246+8704]; + ld.shared.v4.u32 {%r6098, %r6099, %r6100, %r6101}, [%r246+13056]; + @%p102 st.global.v4.b32 [ %rd96 + 0 ], { %r6070, %r6071, %r6072, %r6073 }; + @%p102 st.global.v4.b32 [ %rd97 + 0 ], { %r6074, %r6075, %r6076, %r6077 }; + @%p102 st.global.v4.b32 [ %rd98 + 0 ], { %r6078, %r6079, %r6080, %r6081 }; + @%p102 st.global.v4.b32 [ %rd99 + 0 ], { %r6082, %r6083, %r6084, %r6085 }; + @%p102 st.global.v4.b32 [ %rd100 + 0 ], { %r6086, %r6087, %r6088, %r6089 }; + @%p102 st.global.v4.b32 [ %rd101 + 0 ], { %r6090, %r6091, %r6092, %r6093 }; + @%p102 st.global.v4.b32 [ %rd102 + 0 ], { %r6094, %r6095, %r6096, %r6097 }; + @%p102 st.global.v4.b32 [ %rd103 + 0 ], { %r6098, %r6099, %r6100, %r6101 }; + add.s32 %r6176, %r6176, 128; + add.s64 %rd138, %rd138, %rd10; + add.s64 %rd137, %rd137, %rd10; + add.s64 %rd136, %rd136, %rd10; + add.s64 %rd135, %rd135, %rd10; + add.s64 %rd134, %rd134, %rd10; + add.s64 %rd133, %rd133, %rd10; + add.s64 %rd132, %rd132, %rd10; + add.s64 %rd131, %rd131, %rd10; + add.s64 %rd130, %rd130, %rd11; + setp.lt.s32 %p101, %r6176, %r22; + @%p101 bra LBB0_4; + bra.uni LBB0_5; +LBB0_6: ret; }