// File: triton/python/bwd.ptx
// Timestamp: 2022-12-30 15:21:00 -08:00
// Size: 2033 lines, 96 KiB, plain text

//
// Generated by LLVM NVPTX Back-End
//
// Module-level directives: they fix the PTX dialect, the GPU target and the
// pointer width for everything that follows in this file.
// PTX ISA version this module is written against.
.version 7.4
// Target architecture the code is generated for.
.target sm_86
// Addresses are 64 bits wide; the kernel below accordingly does its global
// address arithmetic in 64-bit %rd registers before each ld.global.
.address_size 64
// .globl _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27
// Byte array in the shared state space, declared extern with no size given
// here. The kernel below takes its address as the base for all of its
// st.shared and ldmatrix accesses.
// NOTE(review): declared with .align 1, while the kernel issues 16-byte-wide
// accesses relative to this symbol (st.shared.v4.b32, ldmatrix). Presumably
// the runtime places this buffer at a sufficiently aligned address — confirm.
.extern .shared .align 1 .b8 global_smem[];
.visible .entry _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2,
.param .f32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_4,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_9,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10,
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_15,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_16,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_18,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_19,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_20,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_21,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23,
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24
)
.maxntid 256, 1, 1
{
.reg .pred %p<27>;
.reg .b16 %h<257>;
.reg .b32 %r<3848>;
.reg .b32 %hh<321>;
.reg .f32 %f<547>;
.reg .b64 %rd<70>;
mov.u32 %r1, %tid.x;
ld.param.u64 %rd28, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1];
and.b32 %r447, %r1, 31;
ld.param.u64 %rd29, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2];
shr.u32 %r2, %r1, 5;
bfe.u32 %r448, %r1, 3, 2;
shr.u32 %r449, %r1, 3;
and.b32 %r450, %r449, 124;
or.b32 %r3, %r450, %r448;
ld.param.u64 %rd31, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7];
ld.param.u64 %rd32, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8];
and.b32 %r4, %r1, 7;
shl.b32 %r5, %r4, 3;
ld.param.u32 %r451, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12];
shl.b32 %r6, %r2, 4;
ld.param.u32 %r452, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13];
ld.param.u32 %r453, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14];
bfe.u32 %r8, %r1, 2, 3;
shl.b32 %r454, %r1, 1;
ld.param.u32 %r455, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17];
and.b32 %r10, %r454, 6;
mov.u32 %r456, %ctaid.x;
ld.param.u32 %r458, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22];
div.s32 %r460, %r456, %r458;
mul.lo.s32 %r461, %r460, %r458;
sub.s32 %r462, %r456, %r461;
ld.param.u32 %r463, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24];
mul.lo.s32 %r464, %r460, %r451;
mad.lo.s32 %r465, %r462, %r452, %r464;
mul.wide.s32 %rd33, %r465, 2;
add.s64 %rd35, %rd28, %rd33;
add.s64 %rd36, %rd29, %rd33;
add.s64 %rd1, %rd31, %rd33;
add.s64 %rd2, %rd32, %rd33;
shl.b32 %r466, %r455, 5;
mad.lo.s32 %r467, %r3, %r455, %r5;
add.s32 %r468, %r467, %r466;
add.s32 %r469, %r468, %r466;
add.s32 %r470, %r469, %r466;
cvt.s64.s32 %rd3, %r467;
mul.wide.s32 %rd38, %r467, 2;
add.s64 %rd19, %rd35, %rd38;
cvt.s64.s32 %rd4, %r468;
mul.wide.s32 %rd39, %r468, 2;
add.s64 %rd20, %rd35, %rd39;
cvt.s64.s32 %rd5, %r469;
mul.wide.s32 %rd40, %r469, 2;
add.s64 %rd21, %rd35, %rd40;
cvt.s64.s32 %rd6, %r470;
mul.wide.s32 %rd41, %r470, 2;
add.s64 %rd22, %rd35, %rd41;
mov.pred %p19, -1;
@%p19 ld.global.v4.b32 { %r95, %r96, %r97, %r98 }, [ %rd19 + 0 ];
@%p19 ld.global.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd20 + 0 ];
@%p19 ld.global.v4.b32 { %r103, %r104, %r105, %r106 }, [ %rd21 + 0 ];
@%p19 ld.global.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd22 + 0 ];
shl.b32 %r471, %r453, 5;
mad.lo.s32 %r472, %r3, %r453, %r5;
add.s32 %r473, %r472, %r471;
add.s32 %r474, %r473, %r471;
add.s32 %r475, %r474, %r471;
cvt.s64.s32 %rd7, %r472;
mul.wide.s32 %rd42, %r472, 2;
add.s64 %rd23, %rd36, %rd42;
cvt.s64.s32 %rd8, %r473;
mul.wide.s32 %rd43, %r473, 2;
add.s64 %rd24, %rd36, %rd43;
cvt.s64.s32 %rd9, %r474;
mul.wide.s32 %rd44, %r474, 2;
add.s64 %rd25, %rd36, %rd44;
cvt.s64.s32 %rd10, %r475;
mul.wide.s32 %rd45, %r475, 2;
add.s64 %rd26, %rd36, %rd45;
@%p19 ld.global.v4.b32 { %r111, %r112, %r113, %r114 }, [ %rd23 + 0 ];
@%p19 ld.global.v4.b32 { %r115, %r116, %r117, %r118 }, [ %rd24 + 0 ];
@%p19 ld.global.v4.b32 { %r119, %r120, %r121, %r122 }, [ %rd25 + 0 ];
@%p19 ld.global.v4.b32 { %r123, %r124, %r125, %r126 }, [ %rd26 + 0 ];
shl.b32 %r11, %r463, 7;
mov.b32 {%h1, %h2}, %r95;
shl.b32 %r12, %r3, 6;
or.b32 %r476, %r12, %r5;
shl.b32 %r477, %r476, 1;
mov.u32 %r478, global_smem;
add.s32 %r479, %r478, %r477;
st.shared.b16 [%r479], %h1;
st.shared.b16 [%r479+2], %h2;
mov.b32 {%h3, %h4}, %r96;
st.shared.b16 [%r479+4], %h3;
st.shared.b16 [%r479+6], %h4;
mov.b32 {%h5, %h6}, %r97;
st.shared.b16 [%r479+8], %h5;
st.shared.b16 [%r479+10], %h6;
mov.b32 {%h7, %h8}, %r98;
st.shared.b16 [%r479+12], %h7;
st.shared.b16 [%r479+14], %h8;
mov.b32 {%h9, %h10}, %r99;
add.s32 %r13, %r12, 2048;
or.b32 %r480, %r13, %r5;
shl.b32 %r481, %r480, 1;
add.s32 %r482, %r478, %r481;
st.shared.b16 [%r482], %h9;
st.shared.b16 [%r482+2], %h10;
mov.b32 {%h11, %h12}, %r100;
st.shared.b16 [%r482+4], %h11;
st.shared.b16 [%r482+6], %h12;
mov.b32 {%h13, %h14}, %r101;
st.shared.b16 [%r482+8], %h13;
st.shared.b16 [%r482+10], %h14;
mov.b32 {%h15, %h16}, %r102;
st.shared.b16 [%r482+12], %h15;
st.shared.b16 [%r482+14], %h16;
mov.b32 {%h17, %h18}, %r103;
add.s32 %r14, %r12, 4096;
or.b32 %r483, %r14, %r5;
shl.b32 %r484, %r483, 1;
add.s32 %r485, %r478, %r484;
st.shared.b16 [%r485], %h17;
st.shared.b16 [%r485+2], %h18;
mov.b32 {%h19, %h20}, %r104;
st.shared.b16 [%r485+4], %h19;
st.shared.b16 [%r485+6], %h20;
mov.b32 {%h21, %h22}, %r105;
st.shared.b16 [%r485+8], %h21;
st.shared.b16 [%r485+10], %h22;
mov.b32 {%h23, %h24}, %r106;
st.shared.b16 [%r485+12], %h23;
st.shared.b16 [%r485+14], %h24;
mov.b32 {%h25, %h26}, %r107;
add.s32 %r15, %r12, 6144;
or.b32 %r486, %r15, %r5;
shl.b32 %r487, %r486, 1;
add.s32 %r488, %r478, %r487;
st.shared.b16 [%r488], %h25;
st.shared.b16 [%r488+2], %h26;
mov.b32 {%h27, %h28}, %r108;
st.shared.b16 [%r488+4], %h27;
st.shared.b16 [%r488+6], %h28;
mov.b32 {%h29, %h30}, %r109;
st.shared.b16 [%r488+8], %h29;
st.shared.b16 [%r488+10], %h30;
mov.b32 {%h31, %h32}, %r110;
st.shared.b16 [%r488+12], %h31;
st.shared.b16 [%r488+14], %h32;
bar.sync 0;
mov.b32 {%h33, %h34}, %r111;
add.s32 %r489, %r478, 16384;
add.s32 %r490, %r489, %r477;
st.shared.b16 [%r490], %h33;
st.shared.b16 [%r490+2], %h34;
mov.b32 {%h35, %h36}, %r112;
st.shared.b16 [%r490+4], %h35;
st.shared.b16 [%r490+6], %h36;
mov.b32 {%h37, %h38}, %r113;
st.shared.b16 [%r490+8], %h37;
st.shared.b16 [%r490+10], %h38;
mov.b32 {%h39, %h40}, %r114;
st.shared.b16 [%r490+12], %h39;
st.shared.b16 [%r490+14], %h40;
mov.b32 {%h41, %h42}, %r115;
add.s32 %r491, %r489, %r481;
st.shared.b16 [%r491], %h41;
st.shared.b16 [%r491+2], %h42;
mov.b32 {%h43, %h44}, %r116;
st.shared.b16 [%r491+4], %h43;
st.shared.b16 [%r491+6], %h44;
mov.b32 {%h45, %h46}, %r117;
st.shared.b16 [%r491+8], %h45;
st.shared.b16 [%r491+10], %h46;
mov.b32 {%h47, %h48}, %r118;
st.shared.b16 [%r491+12], %h47;
st.shared.b16 [%r491+14], %h48;
mov.b32 {%h49, %h50}, %r119;
add.s32 %r492, %r489, %r484;
st.shared.b16 [%r492], %h49;
st.shared.b16 [%r492+2], %h50;
mov.b32 {%h51, %h52}, %r120;
st.shared.b16 [%r492+4], %h51;
st.shared.b16 [%r492+6], %h52;
mov.b32 {%h53, %h54}, %r121;
st.shared.b16 [%r492+8], %h53;
st.shared.b16 [%r492+10], %h54;
mov.b32 {%h55, %h56}, %r122;
st.shared.b16 [%r492+12], %h55;
st.shared.b16 [%r492+14], %h56;
mov.b32 {%h57, %h58}, %r123;
add.s32 %r493, %r489, %r487;
st.shared.b16 [%r493], %h57;
st.shared.b16 [%r493+2], %h58;
mov.b32 {%h59, %h60}, %r124;
st.shared.b16 [%r493+4], %h59;
st.shared.b16 [%r493+6], %h60;
mov.b32 {%h61, %h62}, %r125;
st.shared.b16 [%r493+8], %h61;
st.shared.b16 [%r493+10], %h62;
mov.b32 {%h63, %h64}, %r126;
st.shared.b16 [%r493+12], %h63;
st.shared.b16 [%r493+14], %h64;
bar.sync 0;
bfe.u32 %r16, %r447, 3, 1;
bfe.u32 %r17, %r1, 4, 1;
shl.b32 %r18, %r17, 3;
or.b32 %r494, %r18, %r4;
shl.b32 %r495, %r494, 6;
shl.b32 %r19, %r16, 3;
or.b32 %r496, %r495, %r19;
shl.b32 %r497, %r496, 1;
add.s32 %r131, %r478, %r497;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3591, %r3592, %r3593, %r3594 }, [ %r131 + 0 ];
and.b32 %r498, %r1, 8;
or.b32 %r499, %r495, %r498;
shl.b32 %r500, %r499, 1;
add.s32 %r501, %r478, %r500;
add.s32 %r136, %r501, 32;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3623, %r3624, %r3625, %r3626 }, [ %r136 + 0 ];
add.s32 %r141, %r131, 64;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3655, %r3656, %r3657, %r3658 }, [ %r141 + 0 ];
add.s32 %r146, %r501, 96;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3687, %r3688, %r3689, %r3690 }, [ %r146 + 0 ];
add.s32 %r151, %r131, 2048;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3595, %r3596, %r3597, %r3598 }, [ %r151 + 0 ];
add.s32 %r156, %r501, 2080;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3627, %r3628, %r3629, %r3630 }, [ %r156 + 0 ];
add.s32 %r161, %r131, 2112;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3659, %r3660, %r3661, %r3662 }, [ %r161 + 0 ];
add.s32 %r166, %r501, 2144;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3691, %r3692, %r3693, %r3694 }, [ %r166 + 0 ];
add.s32 %r171, %r131, 4096;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3599, %r3600, %r3601, %r3602 }, [ %r171 + 0 ];
add.s32 %r176, %r501, 4128;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3631, %r3632, %r3633, %r3634 }, [ %r176 + 0 ];
add.s32 %r181, %r131, 4160;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3663, %r3664, %r3665, %r3666 }, [ %r181 + 0 ];
add.s32 %r186, %r501, 4192;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3695, %r3696, %r3697, %r3698 }, [ %r186 + 0 ];
add.s32 %r191, %r131, 6144;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3603, %r3604, %r3605, %r3606 }, [ %r191 + 0 ];
add.s32 %r196, %r501, 6176;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3635, %r3636, %r3637, %r3638 }, [ %r196 + 0 ];
add.s32 %r201, %r131, 6208;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3667, %r3668, %r3669, %r3670 }, [ %r201 + 0 ];
add.s32 %r206, %r501, 6240;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3699, %r3700, %r3701, %r3702 }, [ %r206 + 0 ];
add.s32 %r211, %r131, 8192;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3607, %r3608, %r3609, %r3610 }, [ %r211 + 0 ];
add.s32 %r216, %r501, 8224;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3639, %r3640, %r3641, %r3642 }, [ %r216 + 0 ];
add.s32 %r221, %r131, 8256;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3671, %r3672, %r3673, %r3674 }, [ %r221 + 0 ];
add.s32 %r226, %r501, 8288;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3703, %r3704, %r3705, %r3706 }, [ %r226 + 0 ];
add.s32 %r231, %r131, 10240;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3611, %r3612, %r3613, %r3614 }, [ %r231 + 0 ];
add.s32 %r236, %r501, 10272;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3643, %r3644, %r3645, %r3646 }, [ %r236 + 0 ];
add.s32 %r241, %r131, 10304;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3675, %r3676, %r3677, %r3678 }, [ %r241 + 0 ];
add.s32 %r246, %r501, 10336;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3707, %r3708, %r3709, %r3710 }, [ %r246 + 0 ];
add.s32 %r251, %r131, 12288;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3615, %r3616, %r3617, %r3618 }, [ %r251 + 0 ];
add.s32 %r256, %r501, 12320;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3647, %r3648, %r3649, %r3650 }, [ %r256 + 0 ];
add.s32 %r261, %r131, 12352;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3679, %r3680, %r3681, %r3682 }, [ %r261 + 0 ];
add.s32 %r266, %r501, 12384;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3711, %r3712, %r3713, %r3714 }, [ %r266 + 0 ];
add.s32 %r271, %r131, 14336;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3619, %r3620, %r3621, %r3622 }, [ %r271 + 0 ];
add.s32 %r276, %r501, 14368;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3651, %r3652, %r3653, %r3654 }, [ %r276 + 0 ];
add.s32 %r281, %r131, 14400;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3683, %r3684, %r3685, %r3686 }, [ %r281 + 0 ];
add.s32 %r286, %r501, 14432;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3715, %r3716, %r3717, %r3718 }, [ %r286 + 0 ];
add.s32 %r291, %r489, %r497;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3719, %r3720, %r3721, %r3722 }, [ %r291 + 0 ];
add.s32 %r502, %r489, %r500;
add.s32 %r296, %r502, 32;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3751, %r3752, %r3753, %r3754 }, [ %r296 + 0 ];
add.s32 %r301, %r291, 64;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3783, %r3784, %r3785, %r3786 }, [ %r301 + 0 ];
add.s32 %r306, %r502, 96;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3815, %r3816, %r3817, %r3818 }, [ %r306 + 0 ];
add.s32 %r311, %r291, 2048;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3723, %r3724, %r3725, %r3726 }, [ %r311 + 0 ];
add.s32 %r316, %r502, 2080;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3755, %r3756, %r3757, %r3758 }, [ %r316 + 0 ];
add.s32 %r321, %r291, 2112;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3787, %r3788, %r3789, %r3790 }, [ %r321 + 0 ];
add.s32 %r326, %r502, 2144;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3819, %r3820, %r3821, %r3822 }, [ %r326 + 0 ];
add.s32 %r331, %r291, 4096;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3727, %r3728, %r3729, %r3730 }, [ %r331 + 0 ];
add.s32 %r336, %r502, 4128;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3759, %r3760, %r3761, %r3762 }, [ %r336 + 0 ];
add.s32 %r341, %r291, 4160;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3791, %r3792, %r3793, %r3794 }, [ %r341 + 0 ];
add.s32 %r346, %r502, 4192;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3823, %r3824, %r3825, %r3826 }, [ %r346 + 0 ];
add.s32 %r351, %r291, 6144;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3731, %r3732, %r3733, %r3734 }, [ %r351 + 0 ];
add.s32 %r356, %r502, 6176;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3763, %r3764, %r3765, %r3766 }, [ %r356 + 0 ];
add.s32 %r361, %r291, 6208;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3795, %r3796, %r3797, %r3798 }, [ %r361 + 0 ];
add.s32 %r366, %r502, 6240;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3827, %r3828, %r3829, %r3830 }, [ %r366 + 0 ];
add.s32 %r371, %r291, 8192;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3735, %r3736, %r3737, %r3738 }, [ %r371 + 0 ];
add.s32 %r376, %r502, 8224;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3767, %r3768, %r3769, %r3770 }, [ %r376 + 0 ];
add.s32 %r381, %r291, 8256;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3799, %r3800, %r3801, %r3802 }, [ %r381 + 0 ];
add.s32 %r386, %r502, 8288;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3831, %r3832, %r3833, %r3834 }, [ %r386 + 0 ];
add.s32 %r391, %r291, 10240;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3739, %r3740, %r3741, %r3742 }, [ %r391 + 0 ];
add.s32 %r396, %r502, 10272;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3771, %r3772, %r3773, %r3774 }, [ %r396 + 0 ];
add.s32 %r401, %r291, 10304;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3803, %r3804, %r3805, %r3806 }, [ %r401 + 0 ];
add.s32 %r406, %r502, 10336;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3835, %r3836, %r3837, %r3838 }, [ %r406 + 0 ];
add.s32 %r411, %r291, 12288;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3743, %r3744, %r3745, %r3746 }, [ %r411 + 0 ];
add.s32 %r416, %r502, 12320;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3775, %r3776, %r3777, %r3778 }, [ %r416 + 0 ];
add.s32 %r421, %r291, 12352;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3807, %r3808, %r3809, %r3810 }, [ %r421 + 0 ];
add.s32 %r426, %r502, 12384;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3839, %r3840, %r3841, %r3842 }, [ %r426 + 0 ];
add.s32 %r431, %r291, 14336;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3747, %r3748, %r3749, %r3750 }, [ %r431 + 0 ];
add.s32 %r436, %r502, 14368;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3779, %r3780, %r3781, %r3782 }, [ %r436 + 0 ];
add.s32 %r441, %r291, 14400;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3811, %r3812, %r3813, %r3814 }, [ %r441 + 0 ];
add.s32 %r446, %r502, 14432;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3843, %r3844, %r3845, %r3846 }, [ %r446 + 0 ];
setp.lt.s32 %p9, %r11, 1;
mov.f32 %f129, 0f00000000;
mov.f32 %f515, %f129;
mov.f32 %f516, %f129;
mov.f32 %f517, %f129;
mov.f32 %f518, %f129;
mov.f32 %f519, %f129;
mov.f32 %f520, %f129;
mov.f32 %f521, %f129;
mov.f32 %f522, %f129;
mov.f32 %f523, %f129;
mov.f32 %f524, %f129;
mov.f32 %f525, %f129;
mov.f32 %f526, %f129;
mov.f32 %f527, %f129;
mov.f32 %f528, %f129;
mov.f32 %f529, %f129;
mov.f32 %f530, %f129;
mov.f32 %f531, %f129;
mov.f32 %f532, %f129;
mov.f32 %f533, %f129;
mov.f32 %f534, %f129;
mov.f32 %f535, %f129;
mov.f32 %f536, %f129;
mov.f32 %f537, %f129;
mov.f32 %f538, %f129;
mov.f32 %f539, %f129;
mov.f32 %f540, %f129;
mov.f32 %f541, %f129;
mov.f32 %f542, %f129;
mov.f32 %f543, %f129;
mov.f32 %f544, %f129;
mov.f32 %f545, %f129;
mov.f32 %f546, %f129;
@%p9 bra LBB0_3;
ld.param.f32 %f97, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3];
ld.param.u64 %rd27, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0];
ld.param.u64 %rd30, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5];
and.b32 %r7, %r6, 112;
or.b32 %r9, %r7, %r8;
add.s64 %rd34, %rd27, %rd33;
add.s64 %rd37, %rd30, %rd33;
add.s64 %rd46, %rd34, %rd42;
add.s64 %rd47, %rd34, %rd43;
add.s64 %rd48, %rd34, %rd44;
add.s64 %rd49, %rd34, %rd45;
add.s64 %rd50, %rd37, %rd42;
add.s64 %rd51, %rd37, %rd43;
add.s64 %rd52, %rd37, %rd44;
add.s64 %rd53, %rd37, %rd45;
mov.b32 %hh1, %r3591;
mov.b32 %hh2, %r3592;
mov.b32 %hh3, %r3593;
mov.b32 %hh4, %r3594;
mov.b32 %hh5, %r3623;
mov.b32 %hh6, %r3624;
mov.b32 %hh7, %r3625;
mov.b32 %hh8, %r3626;
mov.b32 %hh9, %r3655;
mov.b32 %hh10, %r3656;
mov.b32 %hh11, %r3657;
mov.b32 %hh12, %r3658;
mov.b32 %hh13, %r3687;
mov.b32 %hh14, %r3688;
mov.b32 %hh15, %r3689;
mov.b32 %hh16, %r3690;
mov.b32 %hh17, %r3595;
mov.b32 %hh18, %r3596;
mov.b32 %hh19, %r3597;
mov.b32 %hh20, %r3598;
mov.b32 %hh21, %r3627;
mov.b32 %hh22, %r3628;
mov.b32 %hh23, %r3629;
mov.b32 %hh24, %r3630;
mov.b32 %hh25, %r3659;
mov.b32 %hh26, %r3660;
mov.b32 %hh27, %r3661;
mov.b32 %hh28, %r3662;
mov.b32 %hh29, %r3691;
mov.b32 %hh30, %r3692;
mov.b32 %hh31, %r3693;
mov.b32 %hh32, %r3694;
mov.b32 %hh33, %r3599;
mov.b32 %hh34, %r3600;
mov.b32 %hh35, %r3601;
mov.b32 %hh36, %r3602;
mov.b32 %hh37, %r3631;
mov.b32 %hh38, %r3632;
mov.b32 %hh39, %r3633;
mov.b32 %hh40, %r3634;
mov.b32 %hh41, %r3663;
mov.b32 %hh42, %r3664;
mov.b32 %hh43, %r3665;
mov.b32 %hh44, %r3666;
mov.b32 %hh45, %r3695;
mov.b32 %hh46, %r3696;
mov.b32 %hh47, %r3697;
mov.b32 %hh48, %r3698;
mov.b32 %hh49, %r3603;
mov.b32 %hh50, %r3604;
mov.b32 %hh51, %r3605;
mov.b32 %hh52, %r3606;
mov.b32 %hh53, %r3635;
mov.b32 %hh54, %r3636;
mov.b32 %hh55, %r3637;
mov.b32 %hh56, %r3638;
mov.b32 %hh57, %r3667;
mov.b32 %hh58, %r3668;
mov.b32 %hh59, %r3669;
mov.b32 %hh60, %r3670;
mov.b32 %hh61, %r3699;
mov.b32 %hh62, %r3700;
mov.b32 %hh63, %r3701;
mov.b32 %hh64, %r3702;
mov.b32 %hh65, %r3607;
mov.b32 %hh66, %r3608;
mov.b32 %hh67, %r3609;
mov.b32 %hh68, %r3610;
mov.b32 %hh69, %r3639;
mov.b32 %hh70, %r3640;
mov.b32 %hh71, %r3641;
mov.b32 %hh72, %r3642;
mov.b32 %hh73, %r3671;
mov.b32 %hh74, %r3672;
mov.b32 %hh75, %r3673;
mov.b32 %hh76, %r3674;
mov.b32 %hh77, %r3703;
mov.b32 %hh78, %r3704;
mov.b32 %hh79, %r3705;
mov.b32 %hh80, %r3706;
mov.b32 %hh81, %r3611;
mov.b32 %hh82, %r3612;
mov.b32 %hh83, %r3613;
mov.b32 %hh84, %r3614;
mov.b32 %hh85, %r3643;
mov.b32 %hh86, %r3644;
mov.b32 %hh87, %r3645;
mov.b32 %hh88, %r3646;
mov.b32 %hh89, %r3675;
mov.b32 %hh90, %r3676;
mov.b32 %hh91, %r3677;
mov.b32 %hh92, %r3678;
mov.b32 %hh93, %r3707;
mov.b32 %hh94, %r3708;
mov.b32 %hh95, %r3709;
mov.b32 %hh96, %r3710;
mov.b32 %hh97, %r3615;
mov.b32 %hh98, %r3616;
mov.b32 %hh99, %r3617;
mov.b32 %hh100, %r3618;
mov.b32 %hh101, %r3647;
mov.b32 %hh102, %r3648;
mov.b32 %hh103, %r3649;
mov.b32 %hh104, %r3650;
mov.b32 %hh105, %r3679;
mov.b32 %hh106, %r3680;
mov.b32 %hh107, %r3681;
mov.b32 %hh108, %r3682;
mov.b32 %hh109, %r3711;
mov.b32 %hh110, %r3712;
mov.b32 %hh111, %r3713;
mov.b32 %hh112, %r3714;
mov.b32 %hh113, %r3619;
mov.b32 %hh114, %r3620;
mov.b32 %hh115, %r3621;
mov.b32 %hh116, %r3622;
mov.b32 %hh117, %r3651;
mov.b32 %hh118, %r3652;
mov.b32 %hh119, %r3653;
mov.b32 %hh120, %r3654;
mov.b32 %hh121, %r3683;
mov.b32 %hh122, %r3684;
mov.b32 %hh123, %r3685;
mov.b32 %hh124, %r3686;
mov.b32 %hh125, %r3715;
mov.b32 %hh126, %r3716;
mov.b32 %hh127, %r3717;
mov.b32 %hh128, %r3718;
mov.b32 %hh129, %r3719;
mov.b32 %hh130, %r3720;
mov.b32 %hh131, %r3721;
mov.b32 %hh132, %r3722;
mov.b32 %hh133, %r3751;
mov.b32 %hh134, %r3752;
mov.b32 %hh135, %r3753;
mov.b32 %hh136, %r3754;
mov.b32 %hh137, %r3783;
mov.b32 %hh138, %r3784;
mov.b32 %hh139, %r3785;
mov.b32 %hh140, %r3786;
mov.b32 %hh141, %r3815;
mov.b32 %hh142, %r3816;
mov.b32 %hh143, %r3817;
mov.b32 %hh144, %r3818;
mov.b32 %hh145, %r3723;
mov.b32 %hh146, %r3724;
mov.b32 %hh147, %r3725;
mov.b32 %hh148, %r3726;
mov.b32 %hh149, %r3755;
mov.b32 %hh150, %r3756;
mov.b32 %hh151, %r3757;
mov.b32 %hh152, %r3758;
mov.b32 %hh153, %r3787;
mov.b32 %hh154, %r3788;
mov.b32 %hh155, %r3789;
mov.b32 %hh156, %r3790;
mov.b32 %hh157, %r3819;
mov.b32 %hh158, %r3820;
mov.b32 %hh159, %r3821;
mov.b32 %hh160, %r3822;
mov.b32 %hh161, %r3727;
mov.b32 %hh162, %r3728;
mov.b32 %hh163, %r3729;
mov.b32 %hh164, %r3730;
mov.b32 %hh165, %r3759;
mov.b32 %hh166, %r3760;
mov.b32 %hh167, %r3761;
mov.b32 %hh168, %r3762;
mov.b32 %hh169, %r3791;
mov.b32 %hh170, %r3792;
mov.b32 %hh171, %r3793;
mov.b32 %hh172, %r3794;
mov.b32 %hh173, %r3823;
mov.b32 %hh174, %r3824;
mov.b32 %hh175, %r3825;
mov.b32 %hh176, %r3826;
mov.b32 %hh177, %r3731;
mov.b32 %hh178, %r3732;
mov.b32 %hh179, %r3733;
mov.b32 %hh180, %r3734;
mov.b32 %hh181, %r3763;
mov.b32 %hh182, %r3764;
mov.b32 %hh183, %r3765;
mov.b32 %hh184, %r3766;
mov.b32 %hh185, %r3795;
mov.b32 %hh186, %r3796;
mov.b32 %hh187, %r3797;
mov.b32 %hh188, %r3798;
mov.b32 %hh189, %r3827;
mov.b32 %hh190, %r3828;
mov.b32 %hh191, %r3829;
mov.b32 %hh192, %r3830;
mov.b32 %hh193, %r3735;
mov.b32 %hh194, %r3736;
mov.b32 %hh195, %r3737;
mov.b32 %hh196, %r3738;
mov.b32 %hh197, %r3767;
mov.b32 %hh198, %r3768;
mov.b32 %hh199, %r3769;
mov.b32 %hh200, %r3770;
mov.b32 %hh201, %r3799;
mov.b32 %hh202, %r3800;
mov.b32 %hh203, %r3801;
mov.b32 %hh204, %r3802;
mov.b32 %hh205, %r3831;
mov.b32 %hh206, %r3832;
mov.b32 %hh207, %r3833;
mov.b32 %hh208, %r3834;
mov.b32 %hh209, %r3739;
mov.b32 %hh210, %r3740;
mov.b32 %hh211, %r3741;
mov.b32 %hh212, %r3742;
mov.b32 %hh213, %r3771;
mov.b32 %hh214, %r3772;
mov.b32 %hh215, %r3773;
mov.b32 %hh216, %r3774;
mov.b32 %hh217, %r3803;
mov.b32 %hh218, %r3804;
mov.b32 %hh219, %r3805;
mov.b32 %hh220, %r3806;
mov.b32 %hh221, %r3835;
mov.b32 %hh222, %r3836;
mov.b32 %hh223, %r3837;
mov.b32 %hh224, %r3838;
mov.b32 %hh225, %r3743;
mov.b32 %hh226, %r3744;
mov.b32 %hh227, %r3745;
mov.b32 %hh228, %r3746;
mov.b32 %hh229, %r3775;
mov.b32 %hh230, %r3776;
mov.b32 %hh231, %r3777;
mov.b32 %hh232, %r3778;
mov.b32 %hh233, %r3807;
mov.b32 %hh234, %r3808;
mov.b32 %hh235, %r3809;
mov.b32 %hh236, %r3810;
mov.b32 %hh237, %r3839;
mov.b32 %hh238, %r3840;
mov.b32 %hh239, %r3841;
mov.b32 %hh240, %r3842;
mov.b32 %hh241, %r3747;
mov.b32 %hh242, %r3748;
mov.b32 %hh243, %r3749;
mov.b32 %hh244, %r3750;
mov.b32 %hh245, %r3779;
mov.b32 %hh246, %r3780;
mov.b32 %hh247, %r3781;
mov.b32 %hh248, %r3782;
mov.b32 %hh249, %r3811;
mov.b32 %hh250, %r3812;
mov.b32 %hh251, %r3813;
mov.b32 %hh252, %r3814;
mov.b32 %hh253, %r3843;
mov.b32 %hh254, %r3844;
mov.b32 %hh255, %r3845;
mov.b32 %hh256, %r3846;
and.b32 %r504, %r3, 7;
xor.b32 %r505, %r504, %r4;
shl.b32 %r506, %r505, 3;
or.b32 %r507, %r506, %r12;
shl.b32 %r508, %r507, 1;
add.s32 %r20, %r478, %r508;
or.b32 %r510, %r506, %r13;
shl.b32 %r511, %r510, 1;
add.s32 %r21, %r478, %r511;
or.b32 %r512, %r506, %r14;
shl.b32 %r513, %r512, 1;
add.s32 %r22, %r478, %r513;
or.b32 %r514, %r506, %r15;
shl.b32 %r515, %r514, 1;
add.s32 %r23, %r478, %r515;
or.b32 %r516, %r19, %r4;
or.b32 %r517, %r516, %r7;
xor.b32 %r518, %r17, %r4;
shl.b32 %r519, %r518, 4;
shl.b32 %r520, %r517, 7;
or.b32 %r521, %r520, %r519;
add.s32 %r599, %r478, %r521;
or.b32 %r522, %r17, 2;
xor.b32 %r523, %r522, %r4;
shl.b32 %r524, %r523, 4;
or.b32 %r525, %r524, %r520;
add.s32 %r604, %r478, %r525;
or.b32 %r526, %r17, 4;
xor.b32 %r527, %r526, %r4;
shl.b32 %r528, %r527, 4;
or.b32 %r529, %r528, %r520;
add.s32 %r609, %r478, %r529;
or.b32 %r530, %r17, 6;
xor.b32 %r531, %r530, %r4;
shl.b32 %r532, %r531, 4;
or.b32 %r533, %r532, %r520;
add.s32 %r614, %r478, %r533;
shl.b32 %r534, %r9, 1;
shl.b32 %r535, %r10, 8;
or.b32 %r536, %r535, %r534;
add.s32 %r28, %r478, %r536;
or.b32 %r537, %r536, 256;
add.s32 %r30, %r478, %r537;
add.s32 %r31, %r28, 2048;
or.b32 %r538, %r536, 2304;
add.s32 %r32, %r478, %r538;
add.s32 %r33, %r28, 4096;
or.b32 %r539, %r536, 4352;
add.s32 %r34, %r478, %r539;
add.s32 %r35, %r28, 6144;
or.b32 %r540, %r536, 6400;
add.s32 %r36, %r478, %r540;
add.s32 %r37, %r28, 8192;
or.b32 %r541, %r536, 8448;
add.s32 %r38, %r478, %r541;
add.s32 %r39, %r28, 10240;
or.b32 %r542, %r536, 10496;
add.s32 %r40, %r478, %r542;
add.s32 %r41, %r28, 12288;
or.b32 %r543, %r536, 12544;
add.s32 %r42, %r478, %r543;
add.s32 %r43, %r28, 14336;
or.b32 %r544, %r536, 14592;
add.s32 %r44, %r478, %r544;
add.s32 %r45, %r28, 16384;
or.b32 %r545, %r536, 16640;
add.s32 %r46, %r478, %r545;
add.s32 %r47, %r28, 18432;
or.b32 %r546, %r536, 18688;
add.s32 %r48, %r478, %r546;
add.s32 %r49, %r28, 20480;
or.b32 %r547, %r536, 20736;
add.s32 %r50, %r478, %r547;
add.s32 %r51, %r28, 22528;
or.b32 %r548, %r536, 22784;
add.s32 %r52, %r478, %r548;
add.s32 %r53, %r28, 24576;
or.b32 %r549, %r536, 24832;
add.s32 %r54, %r478, %r549;
add.s32 %r55, %r28, 26624;
or.b32 %r550, %r536, 26880;
add.s32 %r56, %r478, %r550;
add.s32 %r57, %r28, 28672;
or.b32 %r551, %r536, 28928;
add.s32 %r58, %r478, %r551;
add.s32 %r59, %r28, 30720;
or.b32 %r552, %r536, 30976;
add.s32 %r60, %r478, %r552;
shl.b32 %r553, %r2, 1;
and.b32 %r554, %r553, 6;
or.b32 %r555, %r554, %r16;
shl.b32 %r556, %r555, 10;
shl.b32 %r557, %r4, 7;
or.b32 %r558, %r556, %r557;
or.b32 %r559, %r558, %r18;
shl.b32 %r560, %r559, 1;
add.s32 %r1531, %r478, %r560;
shl.b32 %r561, %r522, 4;
shl.b32 %r562, %r558, 1;
or.b32 %r563, %r561, %r562;
add.s32 %r1536, %r478, %r563;
shl.b32 %r564, %r526, 4;
or.b32 %r565, %r564, %r562;
add.s32 %r1541, %r478, %r565;
shl.b32 %r566, %r530, 4;
or.b32 %r567, %r566, %r562;
add.s32 %r1546, %r478, %r567;
add.s32 %r1551, %r1531, 128;
add.s32 %r1556, %r1531, 160;
add.s32 %r1561, %r1531, 192;
add.s32 %r1566, %r1531, 224;
add.s32 %r1571, %r1531, 16384;
add.s32 %r1576, %r1536, 16384;
add.s32 %r1581, %r1541, 16384;
add.s32 %r1586, %r1546, 16384;
add.s32 %r1591, %r1531, 16512;
add.s32 %r1596, %r1531, 16544;
add.s32 %r1601, %r1531, 16576;
add.s32 %r1606, %r1531, 16608;
bfe.u32 %r568, %r1, 7, 1;
shl.b32 %r569, %r17, 1;
or.b32 %r570, %r569, %r568;
xor.b32 %r571, %r570, %r4;
shl.b32 %r572, %r571, 4;
shl.b32 %r573, %r516, 7;
or.b32 %r574, %r572, %r573;
add.s32 %r1611, %r478, %r574;
add.s32 %r1616, %r1611, 2048;
add.s32 %r1621, %r1611, 4096;
add.s32 %r1626, %r1611, 6144;
add.s32 %r1631, %r1611, 8192;
add.s32 %r1636, %r1611, 10240;
add.s32 %r1641, %r1611, 12288;
add.s32 %r1646, %r1611, 14336;
or.b32 %r575, %r570, 4;
xor.b32 %r576, %r575, %r4;
shl.b32 %r577, %r576, 4;
or.b32 %r578, %r577, %r573;
add.s32 %r1651, %r478, %r578;
add.s32 %r1656, %r1651, 2048;
add.s32 %r1661, %r1651, 4096;
add.s32 %r1666, %r1651, 6144;
add.s32 %r1671, %r1651, 8192;
add.s32 %r1676, %r1651, 10240;
add.s32 %r1681, %r1651, 12288;
add.s32 %r1686, %r1651, 14336;
mov.f32 %f515, 0f00000000;
mov.u32 %r625, 0;
mov.f32 %f516, %f515;
mov.f32 %f517, %f515;
mov.f32 %f518, %f515;
mov.f32 %f519, %f515;
mov.f32 %f520, %f515;
mov.f32 %f521, %f515;
mov.f32 %f522, %f515;
mov.f32 %f523, %f515;
mov.f32 %f524, %f515;
mov.f32 %f525, %f515;
mov.f32 %f526, %f515;
mov.f32 %f527, %f515;
mov.f32 %f528, %f515;
mov.f32 %f529, %f515;
mov.f32 %f530, %f515;
mov.f32 %f531, %f515;
mov.f32 %f532, %f515;
mov.f32 %f533, %f515;
mov.f32 %f534, %f515;
mov.f32 %f535, %f515;
mov.f32 %f536, %f515;
mov.f32 %f537, %f515;
mov.f32 %f538, %f515;
mov.f32 %f539, %f515;
mov.f32 %f540, %f515;
mov.f32 %f541, %f515;
mov.f32 %f542, %f515;
mov.f32 %f543, %f515;
mov.f32 %f544, %f515;
mov.f32 %f545, %f515;
mov.f32 %f546, %f515;
mov.u32 %r3847, %r625;
// LBB0_2: fetch one tile from global memory and stage it in shared memory.
// NOTE(review): this label is presumably the head of the main loop (the
// accumulators initialised just before it are updated further down) - confirm
// against the branch that targets it.
LBB0_2:
// Four 128-bit loads (4 x b32 each) from %rd46..%rd49, all guarded by %p19.
// When %p19 is false the destination registers are left unwritten.
// NOTE(review): confirm %p19 is always true here or that %r3499..%r3514 are
// given a defined value elsewhere.
// Each loaded word is also bit-copied into an %hh register.
// NOTE(review): no reader of the %hh copies appears nearby - possibly dead
// moves left by the back end; confirm.
@%p19 ld.global.v4.b32 { %r3499, %r3500, %r3501, %r3502 }, [ %rd46 + 0 ];
mov.b32 %hh257, %r3499;
mov.b32 %hh258, %r3500;
mov.b32 %hh259, %r3501;
mov.b32 %hh260, %r3502;
@%p19 ld.global.v4.b32 { %r3503, %r3504, %r3505, %r3506 }, [ %rd47 + 0 ];
mov.b32 %hh261, %r3503;
mov.b32 %hh262, %r3504;
mov.b32 %hh263, %r3505;
mov.b32 %hh264, %r3506;
@%p19 ld.global.v4.b32 { %r3507, %r3508, %r3509, %r3510 }, [ %rd48 + 0 ];
mov.b32 %hh265, %r3507;
mov.b32 %hh266, %r3508;
mov.b32 %hh267, %r3509;
mov.b32 %hh268, %r3510;
@%p19 ld.global.v4.b32 { %r3511, %r3512, %r3513, %r3514 }, [ %rd49 + 0 ];
mov.b32 %hh269, %r3511;
mov.b32 %hh270, %r3512;
mov.b32 %hh271, %r3513;
mov.b32 %hh272, %r3514;
// Barrier before the staging slots at %r20..%r23 are overwritten; the same
// four slots are written again later in this block with the second tile.
bar.sync 0;
st.shared.v4.b32 [%r20], {%r3499, %r3500, %r3501, %r3502};
st.shared.v4.b32 [%r21], {%r3503, %r3504, %r3505, %r3506};
st.shared.v4.b32 [%r22], {%r3507, %r3508, %r3509, %r3510};
st.shared.v4.b32 [%r23], {%r3511, %r3512, %r3513, %r3514};
// Barrier after the stores, before the ldmatrix loads that follow.
bar.sync 0;
// First MMA phase.
// Load four A fragments (one per k16 step) from shared memory with
// non-transposed ldmatrix.x4 loads at %r599, %r604, %r609 and %r614.
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r619, %r620, %r621, %r622 }, [ %r599 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r843, %r844, %r845, %r846 }, [ %r604 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1067, %r1068, %r1069, %r1070 }, [ %r609 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1291, %r1292, %r1293, %r1294 }, [ %r614 + 0 ];
// k-step 0 (A = %r619..%r622): each of the 16 accumulator quads is cleared by
// copying %r625 (set to 0 before LBB0_2) and then receives its first
// m16n8k16 f16 x f16 -> f32 MMA.
// The B fragments %r3591..%r3718 used throughout this phase are not loaded
// here. NOTE(review): presumably loop-invariant fragments loaded before
// LBB0_2 - confirm.
mov.u32 %r839, %r625;
mov.u32 %r840, %r625;
mov.u32 %r841, %r625;
mov.u32 %r842, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r619, %r620, %r621, %r622 }, { %r3591, %r3592 }, { %r839, %r840, %r841, %r842 };
mov.u32 %r853, %r625;
mov.u32 %r854, %r625;
mov.u32 %r855, %r625;
mov.u32 %r856, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r619, %r620, %r621, %r622 }, { %r3593, %r3594 }, { %r853, %r854, %r855, %r856 };
mov.u32 %r867, %r625;
mov.u32 %r868, %r625;
mov.u32 %r869, %r625;
mov.u32 %r870, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r619, %r620, %r621, %r622 }, { %r3595, %r3596 }, { %r867, %r868, %r869, %r870 };
mov.u32 %r881, %r625;
mov.u32 %r882, %r625;
mov.u32 %r883, %r625;
mov.u32 %r884, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r619, %r620, %r621, %r622 }, { %r3597, %r3598 }, { %r881, %r882, %r883, %r884 };
mov.u32 %r895, %r625;
mov.u32 %r896, %r625;
mov.u32 %r897, %r625;
mov.u32 %r898, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r619, %r620, %r621, %r622 }, { %r3599, %r3600 }, { %r895, %r896, %r897, %r898 };
mov.u32 %r909, %r625;
mov.u32 %r910, %r625;
mov.u32 %r911, %r625;
mov.u32 %r912, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r619, %r620, %r621, %r622 }, { %r3601, %r3602 }, { %r909, %r910, %r911, %r912 };
mov.u32 %r923, %r625;
mov.u32 %r924, %r625;
mov.u32 %r925, %r625;
mov.u32 %r926, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r619, %r620, %r621, %r622 }, { %r3603, %r3604 }, { %r923, %r924, %r925, %r926 };
mov.u32 %r937, %r625;
mov.u32 %r938, %r625;
mov.u32 %r939, %r625;
mov.u32 %r940, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r619, %r620, %r621, %r622 }, { %r3605, %r3606 }, { %r937, %r938, %r939, %r940 };
mov.u32 %r951, %r625;
mov.u32 %r952, %r625;
mov.u32 %r953, %r625;
mov.u32 %r954, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r619, %r620, %r621, %r622 }, { %r3607, %r3608 }, { %r951, %r952, %r953, %r954 };
mov.u32 %r965, %r625;
mov.u32 %r966, %r625;
mov.u32 %r967, %r625;
mov.u32 %r968, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r619, %r620, %r621, %r622 }, { %r3609, %r3610 }, { %r965, %r966, %r967, %r968 };
mov.u32 %r979, %r625;
mov.u32 %r980, %r625;
mov.u32 %r981, %r625;
mov.u32 %r982, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r619, %r620, %r621, %r622 }, { %r3611, %r3612 }, { %r979, %r980, %r981, %r982 };
mov.u32 %r993, %r625;
mov.u32 %r994, %r625;
mov.u32 %r995, %r625;
mov.u32 %r996, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r619, %r620, %r621, %r622 }, { %r3613, %r3614 }, { %r993, %r994, %r995, %r996 };
mov.u32 %r1007, %r625;
mov.u32 %r1008, %r625;
mov.u32 %r1009, %r625;
mov.u32 %r1010, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r619, %r620, %r621, %r622 }, { %r3615, %r3616 }, { %r1007, %r1008, %r1009, %r1010 };
mov.u32 %r1021, %r625;
mov.u32 %r1022, %r625;
mov.u32 %r1023, %r625;
mov.u32 %r1024, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r619, %r620, %r621, %r622 }, { %r3617, %r3618 }, { %r1021, %r1022, %r1023, %r1024 };
mov.u32 %r1035, %r625;
mov.u32 %r1036, %r625;
mov.u32 %r1037, %r625;
mov.u32 %r1038, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r619, %r620, %r621, %r622 }, { %r3619, %r3620 }, { %r1035, %r1036, %r1037, %r1038 };
mov.u32 %r1049, %r625;
mov.u32 %r1050, %r625;
mov.u32 %r1051, %r625;
mov.u32 %r1052, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r619, %r620, %r621, %r622 }, { %r3621, %r3622 }, { %r1049, %r1050, %r1051, %r1052 };
// k-step 1 (A = %r843..%r846): accumulate into the same 16 quads.
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r843, %r844, %r845, %r846 }, { %r3623, %r3624 }, { %r839, %r840, %r841, %r842 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r843, %r844, %r845, %r846 }, { %r3625, %r3626 }, { %r853, %r854, %r855, %r856 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r843, %r844, %r845, %r846 }, { %r3627, %r3628 }, { %r867, %r868, %r869, %r870 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r843, %r844, %r845, %r846 }, { %r3629, %r3630 }, { %r881, %r882, %r883, %r884 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r843, %r844, %r845, %r846 }, { %r3631, %r3632 }, { %r895, %r896, %r897, %r898 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r843, %r844, %r845, %r846 }, { %r3633, %r3634 }, { %r909, %r910, %r911, %r912 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r843, %r844, %r845, %r846 }, { %r3635, %r3636 }, { %r923, %r924, %r925, %r926 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r843, %r844, %r845, %r846 }, { %r3637, %r3638 }, { %r937, %r938, %r939, %r940 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r843, %r844, %r845, %r846 }, { %r3639, %r3640 }, { %r951, %r952, %r953, %r954 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r843, %r844, %r845, %r846 }, { %r3641, %r3642 }, { %r965, %r966, %r967, %r968 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r843, %r844, %r845, %r846 }, { %r3643, %r3644 }, { %r979, %r980, %r981, %r982 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r843, %r844, %r845, %r846 }, { %r3645, %r3646 }, { %r993, %r994, %r995, %r996 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r843, %r844, %r845, %r846 }, { %r3647, %r3648 }, { %r1007, %r1008, %r1009, %r1010 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r843, %r844, %r845, %r846 }, { %r3649, %r3650 }, { %r1021, %r1022, %r1023, %r1024 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r843, %r844, %r845, %r846 }, { %r3651, %r3652 }, { %r1035, %r1036, %r1037, %r1038 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r843, %r844, %r845, %r846 }, { %r3653, %r3654 }, { %r1049, %r1050, %r1051, %r1052 };
// k-step 2 (A = %r1067..%r1070).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3655, %r3656 }, { %r839, %r840, %r841, %r842 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3657, %r3658 }, { %r853, %r854, %r855, %r856 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3659, %r3660 }, { %r867, %r868, %r869, %r870 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3661, %r3662 }, { %r881, %r882, %r883, %r884 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3663, %r3664 }, { %r895, %r896, %r897, %r898 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3665, %r3666 }, { %r909, %r910, %r911, %r912 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3667, %r3668 }, { %r923, %r924, %r925, %r926 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3669, %r3670 }, { %r937, %r938, %r939, %r940 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3671, %r3672 }, { %r951, %r952, %r953, %r954 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3673, %r3674 }, { %r965, %r966, %r967, %r968 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3675, %r3676 }, { %r979, %r980, %r981, %r982 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3677, %r3678 }, { %r993, %r994, %r995, %r996 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3679, %r3680 }, { %r1007, %r1008, %r1009, %r1010 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3681, %r3682 }, { %r1021, %r1022, %r1023, %r1024 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3683, %r3684 }, { %r1035, %r1036, %r1037, %r1038 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3685, %r3686 }, { %r1049, %r1050, %r1051, %r1052 };
// k-step 3 (A = %r1291..%r1294): final accumulation. After each MMA the
// finished quad is bit-copied into four f32 registers in reverse register
// order (d3 -> lowest %f number, d0 -> highest), giving %f162..%f225.
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3687, %r3688 }, { %r839, %r840, %r841, %r842 };
mov.b32 %f162, %r842;
mov.b32 %f163, %r841;
mov.b32 %f164, %r840;
mov.b32 %f165, %r839;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3689, %r3690 }, { %r853, %r854, %r855, %r856 };
mov.b32 %f166, %r856;
mov.b32 %f167, %r855;
mov.b32 %f168, %r854;
mov.b32 %f169, %r853;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3691, %r3692 }, { %r867, %r868, %r869, %r870 };
mov.b32 %f170, %r870;
mov.b32 %f171, %r869;
mov.b32 %f172, %r868;
mov.b32 %f173, %r867;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3693, %r3694 }, { %r881, %r882, %r883, %r884 };
mov.b32 %f174, %r884;
mov.b32 %f175, %r883;
mov.b32 %f176, %r882;
mov.b32 %f177, %r881;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3695, %r3696 }, { %r895, %r896, %r897, %r898 };
mov.b32 %f178, %r898;
mov.b32 %f179, %r897;
mov.b32 %f180, %r896;
mov.b32 %f181, %r895;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3697, %r3698 }, { %r909, %r910, %r911, %r912 };
mov.b32 %f182, %r912;
mov.b32 %f183, %r911;
mov.b32 %f184, %r910;
mov.b32 %f185, %r909;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3699, %r3700 }, { %r923, %r924, %r925, %r926 };
mov.b32 %f186, %r926;
mov.b32 %f187, %r925;
mov.b32 %f188, %r924;
mov.b32 %f189, %r923;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3701, %r3702 }, { %r937, %r938, %r939, %r940 };
mov.b32 %f190, %r940;
mov.b32 %f191, %r939;
mov.b32 %f192, %r938;
mov.b32 %f193, %r937;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3703, %r3704 }, { %r951, %r952, %r953, %r954 };
mov.b32 %f194, %r954;
mov.b32 %f195, %r953;
mov.b32 %f196, %r952;
mov.b32 %f197, %r951;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3705, %r3706 }, { %r965, %r966, %r967, %r968 };
mov.b32 %f198, %r968;
mov.b32 %f199, %r967;
mov.b32 %f200, %r966;
mov.b32 %f201, %r965;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3707, %r3708 }, { %r979, %r980, %r981, %r982 };
mov.b32 %f202, %r982;
mov.b32 %f203, %r981;
mov.b32 %f204, %r980;
mov.b32 %f205, %r979;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3709, %r3710 }, { %r993, %r994, %r995, %r996 };
mov.b32 %f206, %r996;
mov.b32 %f207, %r995;
mov.b32 %f208, %r994;
mov.b32 %f209, %r993;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3711, %r3712 }, { %r1007, %r1008, %r1009, %r1010 };
mov.b32 %f210, %r1010;
mov.b32 %f211, %r1009;
mov.b32 %f212, %r1008;
mov.b32 %f213, %r1007;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3713, %r3714 }, { %r1021, %r1022, %r1023, %r1024 };
mov.b32 %f214, %r1024;
mov.b32 %f215, %r1023;
mov.b32 %f216, %r1022;
mov.b32 %f217, %r1021;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3715, %r3716 }, { %r1035, %r1036, %r1037, %r1038 };
mov.b32 %f218, %r1038;
mov.b32 %f219, %r1037;
mov.b32 %f220, %r1036;
mov.b32 %f221, %r1035;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3717, %r3718 }, { %r1049, %r1050, %r1051, %r1052 };
mov.b32 %f222, %r1052;
mov.b32 %f223, %r1051;
mov.b32 %f224, %r1050;
mov.b32 %f225, %r1049;
// Second batch of four 128-bit global loads (%rd50..%rd53 -> %r3531..%r3546),
// guarded by %p19 like the first batch. The values are written to the shared
// staging slots %r20..%r23 only later, after the f16 tile has been read back.
// NOTE(review): issuing the loads this early presumably hides global-memory
// latency behind the conversions and shared stores that follow - confirm.
// When %p19 is false the destination registers are left unwritten.
// Each loaded word is also bit-copied into an %hh register.
// NOTE(review): no reader of the %hh copies appears nearby - possibly dead
// moves; confirm.
@%p19 ld.global.v4.b32 { %r3531, %r3532, %r3533, %r3534 }, [ %rd50 + 0 ];
mov.b32 %hh273, %r3531;
mov.b32 %hh274, %r3532;
mov.b32 %hh275, %r3533;
mov.b32 %hh276, %r3534;
@%p19 ld.global.v4.b32 { %r3535, %r3536, %r3537, %r3538 }, [ %rd51 + 0 ];
mov.b32 %hh277, %r3535;
mov.b32 %hh278, %r3536;
mov.b32 %hh279, %r3537;
mov.b32 %hh280, %r3538;
@%p19 ld.global.v4.b32 { %r3539, %r3540, %r3541, %r3542 }, [ %rd52 + 0 ];
mov.b32 %hh281, %r3539;
mov.b32 %hh282, %r3540;
mov.b32 %hh283, %r3541;
mov.b32 %hh284, %r3542;
@%p19 ld.global.v4.b32 { %r3543, %r3544, %r3545, %r3546 }, [ %rd53 + 0 ];
mov.b32 %hh285, %r3543;
mov.b32 %hh286, %r3544;
mov.b32 %hh287, %r3545;
mov.b32 %hh288, %r3546;
// Narrow the 64 f32 results of the first MMA phase (%f162..%f225) to f16
// (%h65..%h128) with round-to-nearest-even (cvt.rn).
// Within each group of four the sources are read in descending %f order,
// which undoes the reversed copy-out of the MMA results: for accumulator
// tile t, %h(65+4t) .. %h(68+4t) hold elements d0, d1, d2, d3 in that order.
cvt.rn.f16.f32 %h65, %f165;
cvt.rn.f16.f32 %h66, %f164;
cvt.rn.f16.f32 %h67, %f163;
cvt.rn.f16.f32 %h68, %f162;
cvt.rn.f16.f32 %h69, %f169;
cvt.rn.f16.f32 %h70, %f168;
cvt.rn.f16.f32 %h71, %f167;
cvt.rn.f16.f32 %h72, %f166;
cvt.rn.f16.f32 %h73, %f173;
cvt.rn.f16.f32 %h74, %f172;
cvt.rn.f16.f32 %h75, %f171;
cvt.rn.f16.f32 %h76, %f170;
cvt.rn.f16.f32 %h77, %f177;
cvt.rn.f16.f32 %h78, %f176;
cvt.rn.f16.f32 %h79, %f175;
cvt.rn.f16.f32 %h80, %f174;
cvt.rn.f16.f32 %h81, %f181;
cvt.rn.f16.f32 %h82, %f180;
cvt.rn.f16.f32 %h83, %f179;
cvt.rn.f16.f32 %h84, %f178;
cvt.rn.f16.f32 %h85, %f185;
cvt.rn.f16.f32 %h86, %f184;
cvt.rn.f16.f32 %h87, %f183;
cvt.rn.f16.f32 %h88, %f182;
cvt.rn.f16.f32 %h89, %f189;
cvt.rn.f16.f32 %h90, %f188;
cvt.rn.f16.f32 %h91, %f187;
cvt.rn.f16.f32 %h92, %f186;
cvt.rn.f16.f32 %h93, %f193;
cvt.rn.f16.f32 %h94, %f192;
cvt.rn.f16.f32 %h95, %f191;
cvt.rn.f16.f32 %h96, %f190;
cvt.rn.f16.f32 %h97, %f197;
cvt.rn.f16.f32 %h98, %f196;
cvt.rn.f16.f32 %h99, %f195;
cvt.rn.f16.f32 %h100, %f194;
cvt.rn.f16.f32 %h101, %f201;
cvt.rn.f16.f32 %h102, %f200;
cvt.rn.f16.f32 %h103, %f199;
cvt.rn.f16.f32 %h104, %f198;
cvt.rn.f16.f32 %h105, %f205;
cvt.rn.f16.f32 %h106, %f204;
cvt.rn.f16.f32 %h107, %f203;
cvt.rn.f16.f32 %h108, %f202;
cvt.rn.f16.f32 %h109, %f209;
cvt.rn.f16.f32 %h110, %f208;
cvt.rn.f16.f32 %h111, %f207;
cvt.rn.f16.f32 %h112, %f206;
cvt.rn.f16.f32 %h113, %f213;
cvt.rn.f16.f32 %h114, %f212;
cvt.rn.f16.f32 %h115, %f211;
cvt.rn.f16.f32 %h116, %f210;
cvt.rn.f16.f32 %h117, %f217;
cvt.rn.f16.f32 %h118, %f216;
cvt.rn.f16.f32 %h119, %f215;
cvt.rn.f16.f32 %h120, %f214;
cvt.rn.f16.f32 %h121, %f221;
cvt.rn.f16.f32 %h122, %f220;
cvt.rn.f16.f32 %h123, %f219;
cvt.rn.f16.f32 %h124, %f218;
cvt.rn.f16.f32 %h125, %f225;
cvt.rn.f16.f32 %h126, %f224;
cvt.rn.f16.f32 %h127, %f223;
cvt.rn.f16.f32 %h128, %f222;
// Write the 64 f16 values %h65..%h128 to shared memory, one 16-bit store each,
// bracketed by block-wide barriers.
// Layout per accumulator tile t (0..15), with d0..d3 its four elements:
//   d0 -> [%r28 + 2048*t]
//   d1 -> [%r28 + 2048*t + 256]
//   d2 -> [Rt + 16]   and   d3 -> [Rt' + 16]
// where Rt / Rt' are precomputed address registers: %r28 / %r30 for t = 0,
// then %r31 / %r32, %r33 / %r34, ... up to %r59 / %r60 for t = 15.
// NOTE(review): %r30..%r60 are presumably %r28 plus per-tile offsets computed
// earlier in the kernel; the +256 step between d0 and d1 versus the +16 step
// to d2 suggests the tile is stored transposed - confirm against the
// m16n8 accumulator fragment layout.
bar.sync 0;
st.shared.b16 [%r28], %h65;
st.shared.b16 [%r28+256], %h66;
st.shared.b16 [%r28+16], %h67;
st.shared.b16 [%r30+16], %h68;
st.shared.b16 [%r28+2048], %h69;
st.shared.b16 [%r28+2304], %h70;
st.shared.b16 [%r31+16], %h71;
st.shared.b16 [%r32+16], %h72;
st.shared.b16 [%r28+4096], %h73;
st.shared.b16 [%r28+4352], %h74;
st.shared.b16 [%r33+16], %h75;
st.shared.b16 [%r34+16], %h76;
st.shared.b16 [%r28+6144], %h77;
st.shared.b16 [%r28+6400], %h78;
st.shared.b16 [%r35+16], %h79;
st.shared.b16 [%r36+16], %h80;
st.shared.b16 [%r28+8192], %h81;
st.shared.b16 [%r28+8448], %h82;
st.shared.b16 [%r37+16], %h83;
st.shared.b16 [%r38+16], %h84;
st.shared.b16 [%r28+10240], %h85;
st.shared.b16 [%r28+10496], %h86;
st.shared.b16 [%r39+16], %h87;
st.shared.b16 [%r40+16], %h88;
st.shared.b16 [%r28+12288], %h89;
st.shared.b16 [%r28+12544], %h90;
st.shared.b16 [%r41+16], %h91;
st.shared.b16 [%r42+16], %h92;
st.shared.b16 [%r28+14336], %h93;
st.shared.b16 [%r28+14592], %h94;
st.shared.b16 [%r43+16], %h95;
st.shared.b16 [%r44+16], %h96;
st.shared.b16 [%r28+16384], %h97;
st.shared.b16 [%r28+16640], %h98;
st.shared.b16 [%r45+16], %h99;
st.shared.b16 [%r46+16], %h100;
st.shared.b16 [%r28+18432], %h101;
st.shared.b16 [%r28+18688], %h102;
st.shared.b16 [%r47+16], %h103;
st.shared.b16 [%r48+16], %h104;
st.shared.b16 [%r28+20480], %h105;
st.shared.b16 [%r28+20736], %h106;
st.shared.b16 [%r49+16], %h107;
st.shared.b16 [%r50+16], %h108;
st.shared.b16 [%r28+22528], %h109;
st.shared.b16 [%r28+22784], %h110;
st.shared.b16 [%r51+16], %h111;
st.shared.b16 [%r52+16], %h112;
st.shared.b16 [%r28+24576], %h113;
st.shared.b16 [%r28+24832], %h114;
st.shared.b16 [%r53+16], %h115;
st.shared.b16 [%r54+16], %h116;
st.shared.b16 [%r28+26624], %h117;
st.shared.b16 [%r28+26880], %h118;
st.shared.b16 [%r55+16], %h119;
st.shared.b16 [%r56+16], %h120;
st.shared.b16 [%r28+28672], %h121;
st.shared.b16 [%r28+28928], %h122;
st.shared.b16 [%r57+16], %h123;
st.shared.b16 [%r58+16], %h124;
st.shared.b16 [%r28+30720], %h125;
st.shared.b16 [%r28+30976], %h126;
st.shared.b16 [%r59+16], %h127;
st.shared.b16 [%r60+16], %h128;
// Barrier after the stores, before the ldmatrix loads that follow.
bar.sync 0;
// Operand loads for the second MMA phase.
//
// A fragments: 16 non-transposed ldmatrix.x4 loads from %r1531..%r1606.
// The first eight (%r1691.., %r1803.., ... %r2475..) are the A operands of
// the MMAs that accumulate into %r1799..%r1844, one per k16 step; the second
// eight (%r1747.., %r1859.., ... %r2531..) play the same role for the MMAs
// that accumulate into %r1855..%r1900.
// NOTE(review): presumably these read back the f16 tile written just above -
// confirm that %r1531..%r1606 address the region stored through %r28.
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1691, %r1692, %r1693, %r1694 }, [ %r1531 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1803, %r1804, %r1805, %r1806 }, [ %r1536 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1915, %r1916, %r1917, %r1918 }, [ %r1541 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2027, %r2028, %r2029, %r2030 }, [ %r1546 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2139, %r2140, %r2141, %r2142 }, [ %r1551 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2251, %r2252, %r2253, %r2254 }, [ %r1556 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2363, %r2364, %r2365, %r2366 }, [ %r1561 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2475, %r2476, %r2477, %r2478 }, [ %r1566 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1747, %r1748, %r1749, %r1750 }, [ %r1571 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1859, %r1860, %r1861, %r1862 }, [ %r1576 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1971, %r1972, %r1973, %r1974 }, [ %r1581 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2083, %r2084, %r2085, %r2086 }, [ %r1586 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2195, %r2196, %r2197, %r2198 }, [ %r1591 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2307, %r2308, %r2309, %r2310 }, [ %r1596 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2419, %r2420, %r2421, %r2422 }, [ %r1601 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2531, %r2532, %r2533, %r2534 }, [ %r1606 + 0 ];
// Barrier, then refill the staging slots %r20..%r23 with the second tile
// loaded from global memory earlier (%r3531..%r3546), then barrier again
// before that tile is read.
// NOTE(review): the first barrier presumably keeps these stores from
// overwriting shared data other warps are still reading with the ldmatrix
// loads above - confirm the two regions overlap.
bar.sync 0;
st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534};
st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538};
st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542};
st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546};
bar.sync 0;
// B fragments: 16 transposed ldmatrix.x4 loads from the precomputed
// addresses %r1611..%r1646 and %r1651..%r1686. Each load yields four
// registers that the MMAs below consume as two 2-register B operands, e.g.
// {%r1695, %r1696} and {%r1709, %r1710}.
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1695, %r1696, %r1709, %r1710 }, [ %r1611 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1807, %r1808, %r1821, %r1822 }, [ %r1616 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1919, %r1920, %r1933, %r1934 }, [ %r1621 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2031, %r2032, %r2045, %r2046 }, [ %r1626 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2143, %r2144, %r2157, %r2158 }, [ %r1631 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2255, %r2256, %r2269, %r2270 }, [ %r1636 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2367, %r2368, %r2381, %r2382 }, [ %r1641 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2479, %r2480, %r2493, %r2494 }, [ %r1646 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1723, %r1724, %r1737, %r1738 }, [ %r1651 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1835, %r1836, %r1849, %r1850 }, [ %r1656 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1947, %r1948, %r1961, %r1962 }, [ %r1661 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2059, %r2060, %r2073, %r2074 }, [ %r1666 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2171, %r2172, %r2185, %r2186 }, [ %r1671 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2283, %r2284, %r2297, %r2298 }, [ %r1676 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2395, %r2396, %r2409, %r2410 }, [ %r1681 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2507, %r2508, %r2521, %r2522 }, [ %r1686 + 0 ];
// Second MMA phase: m16n8k16 f16 x f16 -> f32 MMAs that accumulate on top of
// the values already held in %f515..%f546 (zeroed before LBB0_2).
// Eight accumulator quads are used: %r1799.., %r1813.., %r1827.., %r1841..
// take the first set of A fragments, and %r1855.., %r1869.., %r1883..,
// %r1897.. take the second set. Within each set the four quads pair with the
// four B fragments of the current k-step.
//
// k-step 0 (A = %r1691.. / %r1747..): each quad is first seeded by bit-copying
// its four f32 accumulators out of the %f registers.
mov.b32 %r1799, %f515;
mov.b32 %r1800, %f516;
mov.b32 %r1801, %f517;
mov.b32 %r1802, %f518;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1695, %r1696 }, { %r1799, %r1800, %r1801, %r1802 };
mov.b32 %r1813, %f519;
mov.b32 %r1814, %f520;
mov.b32 %r1815, %f521;
mov.b32 %r1816, %f522;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1709, %r1710 }, { %r1813, %r1814, %r1815, %r1816 };
mov.b32 %r1827, %f523;
mov.b32 %r1828, %f524;
mov.b32 %r1829, %f525;
mov.b32 %r1830, %f526;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1723, %r1724 }, { %r1827, %r1828, %r1829, %r1830 };
mov.b32 %r1841, %f527;
mov.b32 %r1842, %f528;
mov.b32 %r1843, %f529;
mov.b32 %r1844, %f530;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1737, %r1738 }, { %r1841, %r1842, %r1843, %r1844 };
mov.b32 %r1855, %f531;
mov.b32 %r1856, %f532;
mov.b32 %r1857, %f533;
mov.b32 %r1858, %f534;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1695, %r1696 }, { %r1855, %r1856, %r1857, %r1858 };
mov.b32 %r1869, %f535;
mov.b32 %r1870, %f536;
mov.b32 %r1871, %f537;
mov.b32 %r1872, %f538;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1709, %r1710 }, { %r1869, %r1870, %r1871, %r1872 };
mov.b32 %r1883, %f539;
mov.b32 %r1884, %f540;
mov.b32 %r1885, %f541;
mov.b32 %r1886, %f542;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1723, %r1724 }, { %r1883, %r1884, %r1885, %r1886 };
mov.b32 %r1897, %f543;
mov.b32 %r1898, %f544;
mov.b32 %r1899, %f545;
mov.b32 %r1900, %f546;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1737, %r1738 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 1 (A = %r1803.. / %r1859..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1807, %r1808 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1821, %r1822 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1835, %r1836 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1849, %r1850 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1807, %r1808 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1821, %r1822 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1835, %r1836 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1849, %r1850 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 2 (A = %r1915.. / %r1971..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1919, %r1920 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1933, %r1934 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1947, %r1948 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1961, %r1962 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1919, %r1920 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1933, %r1934 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1947, %r1948 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1961, %r1962 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 3 (A = %r2027.. / %r2083..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2031, %r2032 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2045, %r2046 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2059, %r2060 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2073, %r2074 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2031, %r2032 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2045, %r2046 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2059, %r2060 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2073, %r2074 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 4 (A = %r2139.. / %r2195..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2143, %r2144 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2157, %r2158 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2171, %r2172 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2185, %r2186 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2143, %r2144 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2157, %r2158 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2171, %r2172 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2185, %r2186 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 5 (A = %r2251.. / %r2307..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2255, %r2256 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2269, %r2270 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2283, %r2284 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2297, %r2298 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2255, %r2256 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2269, %r2270 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2283, %r2284 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2297, %r2298 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 6 (A = %r2363.. / %r2419..).
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2367, %r2368 }, { %r1799, %r1800, %r1801, %r1802 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2381, %r2382 }, { %r1813, %r1814, %r1815, %r1816 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2395, %r2396 }, { %r1827, %r1828, %r1829, %r1830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2409, %r2410 }, { %r1841, %r1842, %r1843, %r1844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2367, %r2368 }, { %r1855, %r1856, %r1857, %r1858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2381, %r2382 }, { %r1869, %r1870, %r1871, %r1872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2395, %r2396 }, { %r1883, %r1884, %r1885, %r1886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2409, %r2410 }, { %r1897, %r1898, %r1899, %r1900 };
// k-step 7 (A = %r2475..): after its MMA in this step, a quad's four values
// are bit-copied back into the same %f registers it was seeded from at the
// top of this phase.
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2479, %r2480 }, { %r1799, %r1800, %r1801, %r1802 };
mov.b32 %f518, %r1802;
mov.b32 %f517, %r1801;
mov.b32 %f516, %r1800;
mov.b32 %f515, %r1799;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2493, %r2494 }, { %r1813, %r1814, %r1815, %r1816 };
mov.b32 %f522, %r1816;
mov.b32 %f521, %r1815;
mov.b32 %f520, %r1814;
mov.b32 %f519, %r1813;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2507, %r2508 }, { %r1827, %r1828, %r1829, %r1830 };
mov.b32 %f526, %r1830;
mov.b32 %f525, %r1829;
mov.b32 %f524, %r1828;
mov.b32 %f523, %r1827;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2521, %r2522 }, { %r1841, %r1842, %r1843, %r1844 };
mov.b32 %f530, %r1844;
mov.b32 %f529, %r1843;
mov.b32 %f528, %r1842;
mov.b32 %f527, %r1841;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2479, %r2480 }, { %r1855, %r1856, %r1857, %r1858 };
mov.b32 %f534, %r1858;
mov.b32 %f533, %r1857;
mov.b32 %f532, %r1856;
mov.b32 %f531, %r1855;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2493, %r2494 }, { %r1869, %r1870, %r1871, %r1872 };
mov.b32 %f538, %r1872;
mov.b32 %f537, %r1871;
mov.b32 %f536, %r1870;
mov.b32 %f535, %r1869;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2507, %r2508 }, { %r1883, %r1884, %r1885, %r1886 };
mov.b32 %f542, %r1886;
mov.b32 %f541, %r1885;
mov.b32 %f540, %r1884;
mov.b32 %f539, %r1883;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2521, %r2522 }, { %r1897, %r1898, %r1899, %r1900 };
mov.b32 %f546, %r1900;
mov.b32 %f545, %r1899;
mov.b32 %f544, %r1898;
mov.b32 %f543, %r1897;
bar.sync 0;
st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534};
st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538};
st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542};
st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546};
bar.sync 0;
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2607, %r2608, %r2609, %r2610 }, [ %r599 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2831, %r2832, %r2833, %r2834 }, [ %r604 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3055, %r3056, %r3057, %r3058 }, [ %r609 + 0 ];
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3279, %r3280, %r3281, %r3282 }, [ %r614 + 0 ];
mov.u32 %r2827, %r625;
mov.u32 %r2828, %r625;
mov.u32 %r2829, %r625;
mov.u32 %r2830, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3719, %r3720 }, { %r2827, %r2828, %r2829, %r2830 };
mov.u32 %r2841, %r625;
mov.u32 %r2842, %r625;
mov.u32 %r2843, %r625;
mov.u32 %r2844, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3721, %r3722 }, { %r2841, %r2842, %r2843, %r2844 };
mov.u32 %r2855, %r625;
mov.u32 %r2856, %r625;
mov.u32 %r2857, %r625;
mov.u32 %r2858, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3723, %r3724 }, { %r2855, %r2856, %r2857, %r2858 };
mov.u32 %r2869, %r625;
mov.u32 %r2870, %r625;
mov.u32 %r2871, %r625;
mov.u32 %r2872, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3725, %r3726 }, { %r2869, %r2870, %r2871, %r2872 };
mov.u32 %r2883, %r625;
mov.u32 %r2884, %r625;
mov.u32 %r2885, %r625;
mov.u32 %r2886, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3727, %r3728 }, { %r2883, %r2884, %r2885, %r2886 };
mov.u32 %r2897, %r625;
mov.u32 %r2898, %r625;
mov.u32 %r2899, %r625;
mov.u32 %r2900, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3729, %r3730 }, { %r2897, %r2898, %r2899, %r2900 };
mov.u32 %r2911, %r625;
mov.u32 %r2912, %r625;
mov.u32 %r2913, %r625;
mov.u32 %r2914, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3731, %r3732 }, { %r2911, %r2912, %r2913, %r2914 };
mov.u32 %r2925, %r625;
mov.u32 %r2926, %r625;
mov.u32 %r2927, %r625;
mov.u32 %r2928, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3733, %r3734 }, { %r2925, %r2926, %r2927, %r2928 };
mov.u32 %r2939, %r625;
mov.u32 %r2940, %r625;
mov.u32 %r2941, %r625;
mov.u32 %r2942, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3735, %r3736 }, { %r2939, %r2940, %r2941, %r2942 };
mov.u32 %r2953, %r625;
mov.u32 %r2954, %r625;
mov.u32 %r2955, %r625;
mov.u32 %r2956, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3737, %r3738 }, { %r2953, %r2954, %r2955, %r2956 };
mov.u32 %r2967, %r625;
mov.u32 %r2968, %r625;
mov.u32 %r2969, %r625;
mov.u32 %r2970, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3739, %r3740 }, { %r2967, %r2968, %r2969, %r2970 };
mov.u32 %r2981, %r625;
mov.u32 %r2982, %r625;
mov.u32 %r2983, %r625;
mov.u32 %r2984, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3741, %r3742 }, { %r2981, %r2982, %r2983, %r2984 };
mov.u32 %r2995, %r625;
mov.u32 %r2996, %r625;
mov.u32 %r2997, %r625;
mov.u32 %r2998, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3743, %r3744 }, { %r2995, %r2996, %r2997, %r2998 };
mov.u32 %r3009, %r625;
mov.u32 %r3010, %r625;
mov.u32 %r3011, %r625;
mov.u32 %r3012, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3745, %r3746 }, { %r3009, %r3010, %r3011, %r3012 };
mov.u32 %r3023, %r625;
mov.u32 %r3024, %r625;
mov.u32 %r3025, %r625;
mov.u32 %r3026, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3747, %r3748 }, { %r3023, %r3024, %r3025, %r3026 };
mov.u32 %r3040, %r625;
mov.u32 %r3037, %r625;
mov.u32 %r3038, %r625;
mov.u32 %r3039, %r625;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3749, %r3750 }, { %r3037, %r3038, %r3039, %r3040 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3751, %r3752 }, { %r2827, %r2828, %r2829, %r2830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3753, %r3754 }, { %r2841, %r2842, %r2843, %r2844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3755, %r3756 }, { %r2855, %r2856, %r2857, %r2858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3757, %r3758 }, { %r2869, %r2870, %r2871, %r2872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3759, %r3760 }, { %r2883, %r2884, %r2885, %r2886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3761, %r3762 }, { %r2897, %r2898, %r2899, %r2900 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3763, %r3764 }, { %r2911, %r2912, %r2913, %r2914 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3765, %r3766 }, { %r2925, %r2926, %r2927, %r2928 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3767, %r3768 }, { %r2939, %r2940, %r2941, %r2942 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3769, %r3770 }, { %r2953, %r2954, %r2955, %r2956 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3771, %r3772 }, { %r2967, %r2968, %r2969, %r2970 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3773, %r3774 }, { %r2981, %r2982, %r2983, %r2984 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3775, %r3776 }, { %r2995, %r2996, %r2997, %r2998 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3777, %r3778 }, { %r3009, %r3010, %r3011, %r3012 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3779, %r3780 }, { %r3023, %r3024, %r3025, %r3026 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3781, %r3782 }, { %r3037, %r3038, %r3039, %r3040 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3783, %r3784 }, { %r2827, %r2828, %r2829, %r2830 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3785, %r3786 }, { %r2841, %r2842, %r2843, %r2844 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3787, %r3788 }, { %r2855, %r2856, %r2857, %r2858 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3789, %r3790 }, { %r2869, %r2870, %r2871, %r2872 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3791, %r3792 }, { %r2883, %r2884, %r2885, %r2886 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3793, %r3794 }, { %r2897, %r2898, %r2899, %r2900 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3795, %r3796 }, { %r2911, %r2912, %r2913, %r2914 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3797, %r3798 }, { %r2925, %r2926, %r2927, %r2928 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3799, %r3800 }, { %r2939, %r2940, %r2941, %r2942 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3801, %r3802 }, { %r2953, %r2954, %r2955, %r2956 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3803, %r3804 }, { %r2967, %r2968, %r2969, %r2970 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3805, %r3806 }, { %r2981, %r2982, %r2983, %r2984 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3807, %r3808 }, { %r2995, %r2996, %r2997, %r2998 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3809, %r3810 }, { %r3009, %r3010, %r3011, %r3012 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3811, %r3812 }, { %r3023, %r3024, %r3025, %r3026 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3813, %r3814 }, { %r3037, %r3038, %r3039, %r3040 };
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3815, %r3816 }, { %r2827, %r2828, %r2829, %r2830 };
mov.b32 %f226, %r2830;
mov.b32 %f227, %r2829;
mov.b32 %f228, %r2828;
mov.b32 %f229, %r2827;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3817, %r3818 }, { %r2841, %r2842, %r2843, %r2844 };
mov.b32 %f230, %r2844;
mov.b32 %f231, %r2843;
mov.b32 %f232, %r2842;
mov.b32 %f233, %r2841;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3819, %r3820 }, { %r2855, %r2856, %r2857, %r2858 };
mov.b32 %f234, %r2858;
mov.b32 %f235, %r2857;
mov.b32 %f236, %r2856;
mov.b32 %f237, %r2855;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3821, %r3822 }, { %r2869, %r2870, %r2871, %r2872 };
mov.b32 %f238, %r2872;
mov.b32 %f239, %r2871;
mov.b32 %f240, %r2870;
mov.b32 %f241, %r2869;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3823, %r3824 }, { %r2883, %r2884, %r2885, %r2886 };
mov.b32 %f242, %r2886;
mov.b32 %f243, %r2885;
mov.b32 %f244, %r2884;
mov.b32 %f245, %r2883;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3825, %r3826 }, { %r2897, %r2898, %r2899, %r2900 };
mov.b32 %f246, %r2900;
mov.b32 %f247, %r2899;
mov.b32 %f248, %r2898;
mov.b32 %f249, %r2897;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3827, %r3828 }, { %r2911, %r2912, %r2913, %r2914 };
mov.b32 %f250, %r2914;
mov.b32 %f251, %r2913;
mov.b32 %f252, %r2912;
mov.b32 %f253, %r2911;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3829, %r3830 }, { %r2925, %r2926, %r2927, %r2928 };
mov.b32 %f254, %r2928;
mov.b32 %f255, %r2927;
mov.b32 %f256, %r2926;
mov.b32 %f257, %r2925;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3831, %r3832 }, { %r2939, %r2940, %r2941, %r2942 };
mov.b32 %f258, %r2942;
mov.b32 %f259, %r2941;
mov.b32 %f260, %r2940;
mov.b32 %f261, %r2939;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3833, %r3834 }, { %r2953, %r2954, %r2955, %r2956 };
mov.b32 %f262, %r2956;
mov.b32 %f263, %r2955;
mov.b32 %f264, %r2954;
mov.b32 %f265, %r2953;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3835, %r3836 }, { %r2967, %r2968, %r2969, %r2970 };
mov.b32 %f266, %r2970;
mov.b32 %f267, %r2969;
mov.b32 %f268, %r2968;
mov.b32 %f269, %r2967;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3837, %r3838 }, { %r2981, %r2982, %r2983, %r2984 };
mov.b32 %f270, %r2984;
mov.b32 %f271, %r2983;
mov.b32 %f272, %r2982;
mov.b32 %f273, %r2981;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3839, %r3840 }, { %r2995, %r2996, %r2997, %r2998 };
mov.b32 %f274, %r2998;
mov.b32 %f275, %r2997;
mov.b32 %f276, %r2996;
mov.b32 %f277, %r2995;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3841, %r3842 }, { %r3009, %r3010, %r3011, %r3012 };
mov.b32 %f278, %r3012;
mov.b32 %f279, %r3011;
mov.b32 %f280, %r3010;
mov.b32 %f281, %r3009;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3843, %r3844 }, { %r3023, %r3024, %r3025, %r3026 };
mov.b32 %f282, %r3026;
mov.b32 %f283, %r3025;
mov.b32 %f284, %r3024;
mov.b32 %f285, %r3023;
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3845, %r3846 }, { %r3037, %r3038, %r3039, %r3040 };
mov.b32 %f286, %r3040;
mov.b32 %f287, %r3039;
mov.b32 %f288, %r3038;
mov.b32 %f289, %r3037;
mul.f32 %f290, %f165, %f229;
mul.f32 %f291, %f164, %f228;
mul.f32 %f292, %f163, %f227;
mul.f32 %f293, %f162, %f226;
mul.f32 %f294, %f169, %f233;
mul.f32 %f295, %f168, %f232;
mul.f32 %f296, %f167, %f231;
mul.f32 %f297, %f166, %f230;
mul.f32 %f298, %f173, %f237;
mul.f32 %f299, %f172, %f236;
mul.f32 %f300, %f171, %f235;
mul.f32 %f301, %f170, %f234;
mul.f32 %f302, %f177, %f241;
mul.f32 %f303, %f176, %f240;
mul.f32 %f304, %f175, %f239;
mul.f32 %f305, %f174, %f238;
mul.f32 %f306, %f181, %f245;
mul.f32 %f307, %f180, %f244;
mul.f32 %f308, %f179, %f243;
mul.f32 %f309, %f178, %f242;
mul.f32 %f310, %f185, %f249;
mul.f32 %f311, %f184, %f248;
mul.f32 %f312, %f183, %f247;
mul.f32 %f313, %f182, %f246;
mul.f32 %f314, %f189, %f253;
mul.f32 %f315, %f188, %f252;
mul.f32 %f316, %f187, %f251;
mul.f32 %f317, %f186, %f250;
mul.f32 %f318, %f193, %f257;
mul.f32 %f319, %f192, %f256;
mul.f32 %f320, %f191, %f255;
mul.f32 %f321, %f190, %f254;
mul.f32 %f322, %f197, %f261;
mul.f32 %f323, %f196, %f260;
mul.f32 %f324, %f195, %f259;
mul.f32 %f325, %f194, %f258;
mul.f32 %f326, %f201, %f265;
mul.f32 %f327, %f200, %f264;
mul.f32 %f328, %f199, %f263;
mul.f32 %f329, %f198, %f262;
mul.f32 %f330, %f205, %f269;
mul.f32 %f331, %f204, %f268;
mul.f32 %f332, %f203, %f267;
mul.f32 %f333, %f202, %f266;
mul.f32 %f334, %f209, %f273;
mul.f32 %f335, %f208, %f272;
mul.f32 %f336, %f207, %f271;
mul.f32 %f337, %f206, %f270;
mul.f32 %f338, %f213, %f277;
mul.f32 %f339, %f212, %f276;
mul.f32 %f340, %f211, %f275;
mul.f32 %f341, %f210, %f274;
mul.f32 %f342, %f217, %f281;
mul.f32 %f343, %f216, %f280;
mul.f32 %f344, %f215, %f279;
mul.f32 %f345, %f214, %f278;
mul.f32 %f346, %f221, %f285;
mul.f32 %f347, %f220, %f284;
mul.f32 %f348, %f219, %f283;
mul.f32 %f349, %f218, %f282;
mul.f32 %f350, %f225, %f289;
mul.f32 %f351, %f224, %f288;
mul.f32 %f352, %f223, %f287;
mul.f32 %f353, %f222, %f286;
mul.f32 %f354, %f290, %f97;
mul.f32 %f355, %f291, %f97;
mul.f32 %f356, %f292, %f97;
mul.f32 %f357, %f293, %f97;
mul.f32 %f358, %f294, %f97;
mul.f32 %f359, %f295, %f97;
mul.f32 %f360, %f296, %f97;
mul.f32 %f361, %f297, %f97;
mul.f32 %f362, %f298, %f97;
mul.f32 %f363, %f299, %f97;
mul.f32 %f364, %f300, %f97;
mul.f32 %f365, %f301, %f97;
mul.f32 %f366, %f302, %f97;
mul.f32 %f367, %f303, %f97;
mul.f32 %f368, %f304, %f97;
mul.f32 %f369, %f305, %f97;
mul.f32 %f370, %f306, %f97;
mul.f32 %f371, %f307, %f97;
mul.f32 %f372, %f308, %f97;
mul.f32 %f373, %f309, %f97;
mul.f32 %f374, %f310, %f97;
mul.f32 %f375, %f311, %f97;
mul.f32 %f376, %f312, %f97;
mul.f32 %f377, %f313, %f97;
mul.f32 %f378, %f314, %f97;
mul.f32 %f379, %f315, %f97;
mul.f32 %f380, %f316, %f97;
mul.f32 %f381, %f317, %f97;
mul.f32 %f382, %f318, %f97;
mul.f32 %f383, %f319, %f97;
mul.f32 %f384, %f320, %f97;
mul.f32 %f385, %f321, %f97;
mul.f32 %f386, %f322, %f97;
mul.f32 %f387, %f323, %f97;
mul.f32 %f388, %f324, %f97;
mul.f32 %f389, %f325, %f97;
mul.f32 %f390, %f326, %f97;
mul.f32 %f391, %f327, %f97;
mul.f32 %f392, %f328, %f97;
mul.f32 %f393, %f329, %f97;
mul.f32 %f394, %f330, %f97;
mul.f32 %f395, %f331, %f97;
mul.f32 %f396, %f332, %f97;
mul.f32 %f397, %f333, %f97;
mul.f32 %f398, %f334, %f97;
mul.f32 %f399, %f335, %f97;
mul.f32 %f400, %f336, %f97;
mul.f32 %f401, %f337, %f97;
mul.f32 %f402, %f338, %f97;
mul.f32 %f403, %f339, %f97;
mul.f32 %f404, %f340, %f97;
mul.f32 %f405, %f341, %f97;
mul.f32 %f406, %f342, %f97;
mul.f32 %f407, %f343, %f97;
mul.f32 %f408, %f344, %f97;
mul.f32 %f409, %f345, %f97;
mul.f32 %f410, %f346, %f97;
mul.f32 %f411, %f347, %f97;
mul.f32 %f412, %f348, %f97;
mul.f32 %f413, %f349, %f97;
mul.f32 %f414, %f350, %f97;
mul.f32 %f415, %f351, %f97;
mul.f32 %f416, %f352, %f97;
mul.f32 %f417, %f353, %f97;
bar.sync 0;
st.shared.b32 [%r28], %f254;
st.shared.b32 [%r28+256], %f255;
st.shared.b32 [%r28+16], %f256;
st.shared.b32 [%r28+16], %f257;
st.shared.b32 [%r28+2048], %f258;
st.shared.b32 [%r28+2304], %f259;
st.shared.b32 [%r28+16], %f260;
st.shared.b32 [%r28+16], %f261;
st.shared.b32 [%r28+4096], %f262;
st.shared.b32 [%r28+4352], %f263;
st.shared.b32 [%r28+16], %f264;
st.shared.b32 [%r28+16], %f265;
st.shared.b32 [%r28+6144], %f266;
st.shared.b32 [%r28+6400], %f267;
st.shared.b32 [%r28+16], %f268;
st.shared.b32 [%r28+16], %f269;
st.shared.b32 [%r28+8192], %f270;
st.shared.b32 [%r28+8448], %f271;
st.shared.b32 [%r28+16], %f272;
st.shared.b32 [%r28+16], %f273;
st.shared.b32 [%r28+10240], %f274;
st.shared.b32 [%r28+10496], %f275;
st.shared.b32 [%r28+16], %f276;
st.shared.b32 [%r28+16], %f277;
st.shared.b32 [%r28+12288], %f278;
st.shared.b32 [%r28+12544], %f279;
st.shared.b32 [%r28+16], %f280;
st.shared.b32 [%r28+16], %f281;
st.shared.b32 [%r28+14336], %f282;
st.shared.b32 [%r28+14592], %f283;
st.shared.b32 [%r28+16], %f284;
st.shared.b32 [%r28+16], %f285;
st.shared.b32 [%r28+16384], %f286;
st.shared.b32 [%r28+16640], %f287;
st.shared.b32 [%r28+16], %f288;
st.shared.b32 [%r28+16], %f289;
st.shared.b32 [%r28+18432], %f290;
st.shared.b32 [%r28+18688], %f291;
st.shared.b32 [%r28+16], %f292;
st.shared.b32 [%r28+16], %f293;
st.shared.b32 [%r28+20480], %f294;
st.shared.b32 [%r28+20736], %f295;
st.shared.b32 [%r28+16], %f296;
st.shared.b32 [%r28+16], %f297;
st.shared.b32 [%r28+22528], %f298;
st.shared.b32 [%r28+22784], %f299;
st.shared.b32 [%r28+16], %f300;
st.shared.b32 [%r28+16], %f301;
st.shared.b32 [%r28+24576], %f302;
st.shared.b32 [%r28+24832], %f303;
st.shared.b32 [%r28+16], %f304;
st.shared.b32 [%r28+16], %f305;
st.shared.b32 [%r28+26624], %f306;
st.shared.b32 [%r28+26880], %f307;
st.shared.b32 [%r28+16], %f308;
st.shared.b32 [%r28+16], %f309;
st.shared.b32 [%r28+28672], %f310;
st.shared.b32 [%r28+28928], %f311;
st.shared.b32 [%r28+16], %f312;
st.shared.b32 [%r28+16], %f313;
st.shared.b32 [%r28+30720], %f314;
st.shared.b32 [%r28+30976], %f315;
st.shared.b32 [%r28+16], %f316;
st.shared.b32 [%r28+16], %f317;
add.s32 %r3847, %r3847, 128;
setp.lt.s32 %p18, %r3847, %r11;
@%p18 bra LBB0_2;
// ---------------------------------------------------------------------------
// LBB0_3: straight-line tail of the kernel, ending in `ret`.
// Reached by fall-through when the `@%p18 bra LBB0_2` back-edge directly above
// is not taken; any other predecessors are outside this excerpt.
//
// Shape of the code below:
//   1. Compute two shared-memory addresses: %r3587 (store side) and
//      %r3590 (load side).
//   2. Four store -> bar.sync -> load round trips through shared memory, each
//      writing 8 x v2.f32 relative to %r3587 and reading 4 x v4.f32 relative
//      to %r3590.
//      NOTE(review): the store and load addresses are built from different
//      index registers, so this looks like a register-layout exchange between
//      threads -- confirm against the IR this PTX was generated from.
//   3. Convert the 64 loaded f32 values to f16, pack them in pairs, and write
//      them out with eight predicated 16-byte global stores.
//
// Registers read here but defined before this excerpt: %r1, %r3, %r5, %r6,
// %r8, %r10, %r478, %f129, %rd1..%rd10, %p19.
// ---------------------------------------------------------------------------
LBB0_3:
bar.sync 0;
// Store-side address:
//   %r3587 = %r478 + 4 * (((%r6 & 48) | %r8) * 72 + (%r10 | ((%r1 >> 4) & 56)))
// NOTE(review): %r478 is presumably the shared-memory base and 72 a padded row
// pitch in 4-byte elements -- neither is visible here, TODO confirm.
and.b32 %r3579, %r6, 48;
or.b32 %r3580, %r3579, %r8;
shr.u32 %r3581, %r1, 4;
and.b32 %r3582, %r3581, 56;
or.b32 %r3583, %r10, %r3582;
mad.lo.s32 %r3584, %r3580, 72, %r3583;
shl.b32 %r3585, %r3584, 2;
add.s32 %r3587, %r478, %r3585;
// Round 1: every slot is written with the pair {%f129, %f129}.
// NOTE(review): %f129 is set outside this excerpt (presumably a constant such
// as 0.0 -- confirm); no value computed in the loop above is stored here.
st.shared.v2.f32 [%r3587], {%f129, %f129};
st.shared.v2.f32 [%r3587+2304], {%f129, %f129};
st.shared.v2.f32 [%r3587+64], {%f129, %f129};
st.shared.v2.f32 [%r3587+2368], {%f129, %f129};
st.shared.v2.f32 [%r3587+128], {%f129, %f129};
st.shared.v2.f32 [%r3587+2432], {%f129, %f129};
st.shared.v2.f32 [%r3587+192], {%f129, %f129};
st.shared.v2.f32 [%r3587+2496], {%f129, %f129};
bar.sync 0;
// Load-side address: %r3590 = %r478 + 4 * (%r3 * 72 + %r5).
mad.lo.s32 %r3588, %r3, 72, %r5;
shl.b32 %r3589, %r3588, 2;
add.s32 %r3590, %r478, %r3589;
// Read back round 1 into %f419..%f434 (16 floats from two 32-byte spans that
// are 9216 bytes apart).
ld.shared.v4.f32 {%f419, %f420, %f421, %f422}, [%r3590];
ld.shared.v4.f32 {%f423, %f424, %f425, %f426}, [%r3590+16];
ld.shared.v4.f32 {%f427, %f428, %f429, %f430}, [%r3590+9216];
ld.shared.v4.f32 {%f431, %f432, %f433, %f434}, [%r3590+9232];
// Barrier before the same shared locations are overwritten by the next round.
bar.sync 0;
// Round 2: identical {%f129, %f129} stores; read back into %f435..%f450.
st.shared.v2.f32 [%r3587], {%f129, %f129};
st.shared.v2.f32 [%r3587+2304], {%f129, %f129};
st.shared.v2.f32 [%r3587+64], {%f129, %f129};
st.shared.v2.f32 [%r3587+2368], {%f129, %f129};
st.shared.v2.f32 [%r3587+128], {%f129, %f129};
st.shared.v2.f32 [%r3587+2432], {%f129, %f129};
st.shared.v2.f32 [%r3587+192], {%f129, %f129};
st.shared.v2.f32 [%r3587+2496], {%f129, %f129};
bar.sync 0;
ld.shared.v4.f32 {%f435, %f436, %f437, %f438}, [%r3590];
ld.shared.v4.f32 {%f439, %f440, %f441, %f442}, [%r3590+16];
ld.shared.v4.f32 {%f443, %f444, %f445, %f446}, [%r3590+9216];
ld.shared.v4.f32 {%f447, %f448, %f449, %f450}, [%r3590+9232];
bar.sync 0;
// Round 3: stores %f515..%f530. On the loop path these are the mov.b32 copies
// of the mma accumulators {%r1799..%r1802}, {%r1813..%r1816},
// {%r1827..%r1830} and {%r1841..%r1844} made at the end of the loop body.
// Read back into %f451..%f466.
st.shared.v2.f32 [%r3587], {%f515, %f516};
st.shared.v2.f32 [%r3587+2304], {%f517, %f518};
st.shared.v2.f32 [%r3587+64], {%f519, %f520};
st.shared.v2.f32 [%r3587+2368], {%f521, %f522};
st.shared.v2.f32 [%r3587+128], {%f523, %f524};
st.shared.v2.f32 [%r3587+2432], {%f525, %f526};
st.shared.v2.f32 [%r3587+192], {%f527, %f528};
st.shared.v2.f32 [%r3587+2496], {%f529, %f530};
bar.sync 0;
ld.shared.v4.f32 {%f451, %f452, %f453, %f454}, [%r3590];
ld.shared.v4.f32 {%f455, %f456, %f457, %f458}, [%r3590+16];
ld.shared.v4.f32 {%f459, %f460, %f461, %f462}, [%r3590+9216];
ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r3590+9232];
bar.sync 0;
// Round 4: stores %f531..%f546 (on the loop path, copies of the accumulators
// {%r1855..%r1858}, {%r1869..%r1872}, {%r1883..%r1886}, {%r1897..%r1900}).
// Read back into %f467..%f482.
st.shared.v2.f32 [%r3587], {%f531, %f532};
st.shared.v2.f32 [%r3587+2304], {%f533, %f534};
st.shared.v2.f32 [%r3587+64], {%f535, %f536};
st.shared.v2.f32 [%r3587+2368], {%f537, %f538};
st.shared.v2.f32 [%r3587+128], {%f539, %f540};
st.shared.v2.f32 [%r3587+2432], {%f541, %f542};
st.shared.v2.f32 [%r3587+192], {%f543, %f544};
st.shared.v2.f32 [%r3587+2496], {%f545, %f546};
bar.sync 0;
ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r3590];
ld.shared.v4.f32 {%f471, %f472, %f473, %f474}, [%r3590+16];
ld.shared.v4.f32 {%f475, %f476, %f477, %f478}, [%r3590+9216];
ld.shared.v4.f32 {%f479, %f480, %f481, %f482}, [%r3590+9232];
// First output: global addresses %rd54..%rd57 = %rd2 + (%rd7..%rd10 << 1).
// The shift by 1 doubles the offset, which matches the 2-byte f16 values
// stored below (NOTE(review): %rd7..%rd10 are presumably element offsets --
// confirm).
shl.b64 %rd62, %rd7, 1;
add.s64 %rd54, %rd2, %rd62;
shl.b64 %rd63, %rd8, 1;
add.s64 %rd55, %rd2, %rd63;
shl.b64 %rd64, %rd9, 1;
add.s64 %rd56, %rd2, %rd64;
shl.b64 %rd65, %rd10, 1;
add.s64 %rd57, %rd2, %rd65;
// Narrow %f451..%f482 (rounds 3 and 4) to f16 with round-to-nearest-even and
// pack adjacent pairs into 32-bit registers %hh289..%hh304. In
// `mov.b32 d, {a, b}` the first listed element occupies the low 16 bits, so
// each %hh holds the lower-numbered %f in its low half.
cvt.rn.f16.f32 %h193, %f452;
cvt.rn.f16.f32 %h194, %f451;
mov.b32 %hh289, {%h194, %h193};
cvt.rn.f16.f32 %h195, %f454;
cvt.rn.f16.f32 %h196, %f453;
mov.b32 %hh290, {%h196, %h195};
cvt.rn.f16.f32 %h197, %f456;
cvt.rn.f16.f32 %h198, %f455;
mov.b32 %hh291, {%h198, %h197};
cvt.rn.f16.f32 %h199, %f458;
cvt.rn.f16.f32 %h200, %f457;
mov.b32 %hh292, {%h200, %h199};
cvt.rn.f16.f32 %h201, %f460;
cvt.rn.f16.f32 %h202, %f459;
mov.b32 %hh293, {%h202, %h201};
cvt.rn.f16.f32 %h203, %f462;
cvt.rn.f16.f32 %h204, %f461;
mov.b32 %hh294, {%h204, %h203};
cvt.rn.f16.f32 %h205, %f464;
cvt.rn.f16.f32 %h206, %f463;
mov.b32 %hh295, {%h206, %h205};
cvt.rn.f16.f32 %h207, %f466;
cvt.rn.f16.f32 %h208, %f465;
mov.b32 %hh296, {%h208, %h207};
cvt.rn.f16.f32 %h209, %f468;
cvt.rn.f16.f32 %h210, %f467;
mov.b32 %hh297, {%h210, %h209};
cvt.rn.f16.f32 %h211, %f470;
cvt.rn.f16.f32 %h212, %f469;
mov.b32 %hh298, {%h212, %h211};
cvt.rn.f16.f32 %h213, %f472;
cvt.rn.f16.f32 %h214, %f471;
mov.b32 %hh299, {%h214, %h213};
cvt.rn.f16.f32 %h215, %f474;
cvt.rn.f16.f32 %h216, %f473;
mov.b32 %hh300, {%h216, %h215};
cvt.rn.f16.f32 %h217, %f476;
cvt.rn.f16.f32 %h218, %f475;
mov.b32 %hh301, {%h218, %h217};
cvt.rn.f16.f32 %h219, %f478;
cvt.rn.f16.f32 %h220, %f477;
mov.b32 %hh302, {%h220, %h219};
cvt.rn.f16.f32 %h221, %f480;
cvt.rn.f16.f32 %h222, %f479;
mov.b32 %hh303, {%h222, %h221};
cvt.rn.f16.f32 %h223, %f482;
cvt.rn.f16.f32 %h224, %f481;
mov.b32 %hh304, {%h224, %h223};
// Four 16-byte global stores (4 x b32 = 8 packed f16 each), all guarded by
// %p19, which is computed outside this excerpt.
mov.b32 %r3547, %hh289;
mov.b32 %r3548, %hh290;
mov.b32 %r3549, %hh291;
mov.b32 %r3550, %hh292;
@%p19 st.global.v4.b32 [ %rd54 + 0 ], { %r3547, %r3548, %r3549, %r3550 };
mov.b32 %r3551, %hh293;
mov.b32 %r3552, %hh294;
mov.b32 %r3553, %hh295;
mov.b32 %r3554, %hh296;
@%p19 st.global.v4.b32 [ %rd55 + 0 ], { %r3551, %r3552, %r3553, %r3554 };
mov.b32 %r3555, %hh297;
mov.b32 %r3556, %hh298;
mov.b32 %r3557, %hh299;
mov.b32 %r3558, %hh300;
@%p19 st.global.v4.b32 [ %rd56 + 0 ], { %r3555, %r3556, %r3557, %r3558 };
mov.b32 %r3559, %hh301;
mov.b32 %r3560, %hh302;
mov.b32 %r3561, %hh303;
mov.b32 %r3562, %hh304;
@%p19 st.global.v4.b32 [ %rd57 + 0 ], { %r3559, %r3560, %r3561, %r3562 };
// Second output: global addresses %rd58..%rd61 = %rd1 + (%rd3..%rd6 << 1).
shl.b64 %rd66, %rd3, 1;
add.s64 %rd58, %rd1, %rd66;
shl.b64 %rd67, %rd4, 1;
add.s64 %rd59, %rd1, %rd67;
shl.b64 %rd68, %rd5, 1;
add.s64 %rd60, %rd1, %rd68;
shl.b64 %rd69, %rd6, 1;
add.s64 %rd61, %rd1, %rd69;
// Same narrow-and-pack sequence for %f419..%f450, i.e. the values read back
// in rounds 1 and 2, where only {%f129, %f129} pairs were stored.
cvt.rn.f16.f32 %h225, %f420;
cvt.rn.f16.f32 %h226, %f419;
mov.b32 %hh305, {%h226, %h225};
cvt.rn.f16.f32 %h227, %f422;
cvt.rn.f16.f32 %h228, %f421;
mov.b32 %hh306, {%h228, %h227};
cvt.rn.f16.f32 %h229, %f424;
cvt.rn.f16.f32 %h230, %f423;
mov.b32 %hh307, {%h230, %h229};
cvt.rn.f16.f32 %h231, %f426;
cvt.rn.f16.f32 %h232, %f425;
mov.b32 %hh308, {%h232, %h231};
cvt.rn.f16.f32 %h233, %f428;
cvt.rn.f16.f32 %h234, %f427;
mov.b32 %hh309, {%h234, %h233};
cvt.rn.f16.f32 %h235, %f430;
cvt.rn.f16.f32 %h236, %f429;
mov.b32 %hh310, {%h236, %h235};
cvt.rn.f16.f32 %h237, %f432;
cvt.rn.f16.f32 %h238, %f431;
mov.b32 %hh311, {%h238, %h237};
cvt.rn.f16.f32 %h239, %f434;
cvt.rn.f16.f32 %h240, %f433;
mov.b32 %hh312, {%h240, %h239};
cvt.rn.f16.f32 %h241, %f436;
cvt.rn.f16.f32 %h242, %f435;
mov.b32 %hh313, {%h242, %h241};
cvt.rn.f16.f32 %h243, %f438;
cvt.rn.f16.f32 %h244, %f437;
mov.b32 %hh314, {%h244, %h243};
cvt.rn.f16.f32 %h245, %f440;
cvt.rn.f16.f32 %h246, %f439;
mov.b32 %hh315, {%h246, %h245};
cvt.rn.f16.f32 %h247, %f442;
cvt.rn.f16.f32 %h248, %f441;
mov.b32 %hh316, {%h248, %h247};
cvt.rn.f16.f32 %h249, %f444;
cvt.rn.f16.f32 %h250, %f443;
mov.b32 %hh317, {%h250, %h249};
cvt.rn.f16.f32 %h251, %f446;
cvt.rn.f16.f32 %h252, %f445;
mov.b32 %hh318, {%h252, %h251};
cvt.rn.f16.f32 %h253, %f448;
cvt.rn.f16.f32 %h254, %f447;
mov.b32 %hh319, {%h254, %h253};
cvt.rn.f16.f32 %h255, %f450;
cvt.rn.f16.f32 %h256, %f449;
mov.b32 %hh320, {%h256, %h255};
// Four more %p19-guarded 16-byte global stores, then return from the kernel.
mov.b32 %r3563, %hh305;
mov.b32 %r3564, %hh306;
mov.b32 %r3565, %hh307;
mov.b32 %r3566, %hh308;
@%p19 st.global.v4.b32 [ %rd58 + 0 ], { %r3563, %r3564, %r3565, %r3566 };
mov.b32 %r3567, %hh309;
mov.b32 %r3568, %hh310;
mov.b32 %r3569, %hh311;
mov.b32 %r3570, %hh312;
@%p19 st.global.v4.b32 [ %rd59 + 0 ], { %r3567, %r3568, %r3569, %r3570 };
mov.b32 %r3571, %hh313;
mov.b32 %r3572, %hh314;
mov.b32 %r3573, %hh315;
mov.b32 %r3574, %hh316;
@%p19 st.global.v4.b32 [ %rd60 + 0 ], { %r3571, %r3572, %r3573, %r3574 };
mov.b32 %r3575, %hh317;
mov.b32 %r3576, %hh318;
mov.b32 %r3577, %hh319;
mov.b32 %r3578, %hh320;
@%p19 st.global.v4.b32 [ %rd61 + 0 ], { %r3575, %r3576, %r3577, %r3578 };
ret;
}