// File-viewer metadata: 2033 lines, 96 KiB, Plaintext
//
|
|
// Generated by LLVM NVPTX Back-End
|
|
//
|
|
|
|
// Module-level directives. These apply to the whole PTX module and precede
// the kernel entry point.

// PTX ISA version this module is written against.
.version 7.4
|
|
// Target architecture for code generation (sm_86).
.target sm_86
|
|
// Addresses are 64 bits wide throughout this module.
.address_size 64
|
|
|
|
// .globl _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27
|
|
// Shared-memory byte array declared extern with no size given in this module.
// The kernel takes its address (mov.u32 ..., global_smem) and uses it as the
// base for its st.shared / ldmatrix accesses.
// NOTE(review): the actual size is presumably supplied at launch; confirm
// against the host-side launch code.
.extern .shared .align 1 .b8 global_smem[];
|
|
|
|
.visible .entry _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2,
|
|
.param .f32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_4,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_9,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_15,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_16,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_18,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_19,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_20,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_21,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24
|
|
)
|
|
.maxntid 256, 1, 1
|
|
{
|
|
.reg .pred %p<27>;
|
|
.reg .b16 %h<257>;
|
|
.reg .b32 %r<3848>;
|
|
.reg .b32 %hh<321>;
|
|
.reg .f32 %f<547>;
|
|
.reg .b64 %rd<70>;
|
|
|
|
mov.u32 %r1, %tid.x;
|
|
ld.param.u64 %rd28, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1];
|
|
and.b32 %r447, %r1, 31;
|
|
ld.param.u64 %rd29, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2];
|
|
shr.u32 %r2, %r1, 5;
|
|
bfe.u32 %r448, %r1, 3, 2;
|
|
shr.u32 %r449, %r1, 3;
|
|
and.b32 %r450, %r449, 124;
|
|
or.b32 %r3, %r450, %r448;
|
|
ld.param.u64 %rd31, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7];
|
|
ld.param.u64 %rd32, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8];
|
|
and.b32 %r4, %r1, 7;
|
|
shl.b32 %r5, %r4, 3;
|
|
ld.param.u32 %r451, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12];
|
|
shl.b32 %r6, %r2, 4;
|
|
ld.param.u32 %r452, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13];
|
|
ld.param.u32 %r453, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14];
|
|
bfe.u32 %r8, %r1, 2, 3;
|
|
shl.b32 %r454, %r1, 1;
|
|
ld.param.u32 %r455, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17];
|
|
and.b32 %r10, %r454, 6;
|
|
mov.u32 %r456, %ctaid.x;
|
|
ld.param.u32 %r458, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22];
|
|
div.s32 %r460, %r456, %r458;
|
|
mul.lo.s32 %r461, %r460, %r458;
|
|
sub.s32 %r462, %r456, %r461;
|
|
ld.param.u32 %r463, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24];
|
|
mul.lo.s32 %r464, %r460, %r451;
|
|
mad.lo.s32 %r465, %r462, %r452, %r464;
|
|
mul.wide.s32 %rd33, %r465, 2;
|
|
add.s64 %rd35, %rd28, %rd33;
|
|
add.s64 %rd36, %rd29, %rd33;
|
|
add.s64 %rd1, %rd31, %rd33;
|
|
add.s64 %rd2, %rd32, %rd33;
|
|
shl.b32 %r466, %r455, 5;
|
|
mad.lo.s32 %r467, %r3, %r455, %r5;
|
|
add.s32 %r468, %r467, %r466;
|
|
add.s32 %r469, %r468, %r466;
|
|
add.s32 %r470, %r469, %r466;
|
|
cvt.s64.s32 %rd3, %r467;
|
|
mul.wide.s32 %rd38, %r467, 2;
|
|
add.s64 %rd19, %rd35, %rd38;
|
|
cvt.s64.s32 %rd4, %r468;
|
|
mul.wide.s32 %rd39, %r468, 2;
|
|
add.s64 %rd20, %rd35, %rd39;
|
|
cvt.s64.s32 %rd5, %r469;
|
|
mul.wide.s32 %rd40, %r469, 2;
|
|
add.s64 %rd21, %rd35, %rd40;
|
|
cvt.s64.s32 %rd6, %r470;
|
|
mul.wide.s32 %rd41, %r470, 2;
|
|
add.s64 %rd22, %rd35, %rd41;
|
|
mov.pred %p19, -1;
|
|
@%p19 ld.global.v4.b32 { %r95, %r96, %r97, %r98 }, [ %rd19 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r99, %r100, %r101, %r102 }, [ %rd20 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r103, %r104, %r105, %r106 }, [ %rd21 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r107, %r108, %r109, %r110 }, [ %rd22 + 0 ];
|
|
shl.b32 %r471, %r453, 5;
|
|
mad.lo.s32 %r472, %r3, %r453, %r5;
|
|
add.s32 %r473, %r472, %r471;
|
|
add.s32 %r474, %r473, %r471;
|
|
add.s32 %r475, %r474, %r471;
|
|
cvt.s64.s32 %rd7, %r472;
|
|
mul.wide.s32 %rd42, %r472, 2;
|
|
add.s64 %rd23, %rd36, %rd42;
|
|
cvt.s64.s32 %rd8, %r473;
|
|
mul.wide.s32 %rd43, %r473, 2;
|
|
add.s64 %rd24, %rd36, %rd43;
|
|
cvt.s64.s32 %rd9, %r474;
|
|
mul.wide.s32 %rd44, %r474, 2;
|
|
add.s64 %rd25, %rd36, %rd44;
|
|
cvt.s64.s32 %rd10, %r475;
|
|
mul.wide.s32 %rd45, %r475, 2;
|
|
add.s64 %rd26, %rd36, %rd45;
|
|
@%p19 ld.global.v4.b32 { %r111, %r112, %r113, %r114 }, [ %rd23 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r115, %r116, %r117, %r118 }, [ %rd24 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r119, %r120, %r121, %r122 }, [ %rd25 + 0 ];
|
|
@%p19 ld.global.v4.b32 { %r123, %r124, %r125, %r126 }, [ %rd26 + 0 ];
|
|
shl.b32 %r11, %r463, 7;
|
|
mov.b32 {%h1, %h2}, %r95;
|
|
shl.b32 %r12, %r3, 6;
|
|
or.b32 %r476, %r12, %r5;
|
|
shl.b32 %r477, %r476, 1;
|
|
mov.u32 %r478, global_smem;
|
|
add.s32 %r479, %r478, %r477;
|
|
st.shared.b16 [%r479], %h1;
|
|
st.shared.b16 [%r479+2], %h2;
|
|
mov.b32 {%h3, %h4}, %r96;
|
|
st.shared.b16 [%r479+4], %h3;
|
|
st.shared.b16 [%r479+6], %h4;
|
|
mov.b32 {%h5, %h6}, %r97;
|
|
st.shared.b16 [%r479+8], %h5;
|
|
st.shared.b16 [%r479+10], %h6;
|
|
mov.b32 {%h7, %h8}, %r98;
|
|
st.shared.b16 [%r479+12], %h7;
|
|
st.shared.b16 [%r479+14], %h8;
|
|
mov.b32 {%h9, %h10}, %r99;
|
|
add.s32 %r13, %r12, 2048;
|
|
or.b32 %r480, %r13, %r5;
|
|
shl.b32 %r481, %r480, 1;
|
|
add.s32 %r482, %r478, %r481;
|
|
st.shared.b16 [%r482], %h9;
|
|
st.shared.b16 [%r482+2], %h10;
|
|
mov.b32 {%h11, %h12}, %r100;
|
|
st.shared.b16 [%r482+4], %h11;
|
|
st.shared.b16 [%r482+6], %h12;
|
|
mov.b32 {%h13, %h14}, %r101;
|
|
st.shared.b16 [%r482+8], %h13;
|
|
st.shared.b16 [%r482+10], %h14;
|
|
mov.b32 {%h15, %h16}, %r102;
|
|
st.shared.b16 [%r482+12], %h15;
|
|
st.shared.b16 [%r482+14], %h16;
|
|
mov.b32 {%h17, %h18}, %r103;
|
|
add.s32 %r14, %r12, 4096;
|
|
or.b32 %r483, %r14, %r5;
|
|
shl.b32 %r484, %r483, 1;
|
|
add.s32 %r485, %r478, %r484;
|
|
st.shared.b16 [%r485], %h17;
|
|
st.shared.b16 [%r485+2], %h18;
|
|
mov.b32 {%h19, %h20}, %r104;
|
|
st.shared.b16 [%r485+4], %h19;
|
|
st.shared.b16 [%r485+6], %h20;
|
|
mov.b32 {%h21, %h22}, %r105;
|
|
st.shared.b16 [%r485+8], %h21;
|
|
st.shared.b16 [%r485+10], %h22;
|
|
mov.b32 {%h23, %h24}, %r106;
|
|
st.shared.b16 [%r485+12], %h23;
|
|
st.shared.b16 [%r485+14], %h24;
|
|
mov.b32 {%h25, %h26}, %r107;
|
|
add.s32 %r15, %r12, 6144;
|
|
or.b32 %r486, %r15, %r5;
|
|
shl.b32 %r487, %r486, 1;
|
|
add.s32 %r488, %r478, %r487;
|
|
st.shared.b16 [%r488], %h25;
|
|
st.shared.b16 [%r488+2], %h26;
|
|
mov.b32 {%h27, %h28}, %r108;
|
|
st.shared.b16 [%r488+4], %h27;
|
|
st.shared.b16 [%r488+6], %h28;
|
|
mov.b32 {%h29, %h30}, %r109;
|
|
st.shared.b16 [%r488+8], %h29;
|
|
st.shared.b16 [%r488+10], %h30;
|
|
mov.b32 {%h31, %h32}, %r110;
|
|
st.shared.b16 [%r488+12], %h31;
|
|
st.shared.b16 [%r488+14], %h32;
|
|
bar.sync 0;
|
|
mov.b32 {%h33, %h34}, %r111;
|
|
add.s32 %r489, %r478, 16384;
|
|
add.s32 %r490, %r489, %r477;
|
|
st.shared.b16 [%r490], %h33;
|
|
st.shared.b16 [%r490+2], %h34;
|
|
mov.b32 {%h35, %h36}, %r112;
|
|
st.shared.b16 [%r490+4], %h35;
|
|
st.shared.b16 [%r490+6], %h36;
|
|
mov.b32 {%h37, %h38}, %r113;
|
|
st.shared.b16 [%r490+8], %h37;
|
|
st.shared.b16 [%r490+10], %h38;
|
|
mov.b32 {%h39, %h40}, %r114;
|
|
st.shared.b16 [%r490+12], %h39;
|
|
st.shared.b16 [%r490+14], %h40;
|
|
mov.b32 {%h41, %h42}, %r115;
|
|
add.s32 %r491, %r489, %r481;
|
|
st.shared.b16 [%r491], %h41;
|
|
st.shared.b16 [%r491+2], %h42;
|
|
mov.b32 {%h43, %h44}, %r116;
|
|
st.shared.b16 [%r491+4], %h43;
|
|
st.shared.b16 [%r491+6], %h44;
|
|
mov.b32 {%h45, %h46}, %r117;
|
|
st.shared.b16 [%r491+8], %h45;
|
|
st.shared.b16 [%r491+10], %h46;
|
|
mov.b32 {%h47, %h48}, %r118;
|
|
st.shared.b16 [%r491+12], %h47;
|
|
st.shared.b16 [%r491+14], %h48;
|
|
mov.b32 {%h49, %h50}, %r119;
|
|
add.s32 %r492, %r489, %r484;
|
|
st.shared.b16 [%r492], %h49;
|
|
st.shared.b16 [%r492+2], %h50;
|
|
mov.b32 {%h51, %h52}, %r120;
|
|
st.shared.b16 [%r492+4], %h51;
|
|
st.shared.b16 [%r492+6], %h52;
|
|
mov.b32 {%h53, %h54}, %r121;
|
|
st.shared.b16 [%r492+8], %h53;
|
|
st.shared.b16 [%r492+10], %h54;
|
|
mov.b32 {%h55, %h56}, %r122;
|
|
st.shared.b16 [%r492+12], %h55;
|
|
st.shared.b16 [%r492+14], %h56;
|
|
mov.b32 {%h57, %h58}, %r123;
|
|
add.s32 %r493, %r489, %r487;
|
|
st.shared.b16 [%r493], %h57;
|
|
st.shared.b16 [%r493+2], %h58;
|
|
mov.b32 {%h59, %h60}, %r124;
|
|
st.shared.b16 [%r493+4], %h59;
|
|
st.shared.b16 [%r493+6], %h60;
|
|
mov.b32 {%h61, %h62}, %r125;
|
|
st.shared.b16 [%r493+8], %h61;
|
|
st.shared.b16 [%r493+10], %h62;
|
|
mov.b32 {%h63, %h64}, %r126;
|
|
st.shared.b16 [%r493+12], %h63;
|
|
st.shared.b16 [%r493+14], %h64;
|
|
bar.sync 0;
|
|
bfe.u32 %r16, %r447, 3, 1;
|
|
bfe.u32 %r17, %r1, 4, 1;
|
|
shl.b32 %r18, %r17, 3;
|
|
or.b32 %r494, %r18, %r4;
|
|
shl.b32 %r495, %r494, 6;
|
|
shl.b32 %r19, %r16, 3;
|
|
or.b32 %r496, %r495, %r19;
|
|
shl.b32 %r497, %r496, 1;
|
|
add.s32 %r131, %r478, %r497;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3591, %r3592, %r3593, %r3594 }, [ %r131 + 0 ];
|
|
and.b32 %r498, %r1, 8;
|
|
or.b32 %r499, %r495, %r498;
|
|
shl.b32 %r500, %r499, 1;
|
|
add.s32 %r501, %r478, %r500;
|
|
add.s32 %r136, %r501, 32;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3623, %r3624, %r3625, %r3626 }, [ %r136 + 0 ];
|
|
add.s32 %r141, %r131, 64;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3655, %r3656, %r3657, %r3658 }, [ %r141 + 0 ];
|
|
add.s32 %r146, %r501, 96;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3687, %r3688, %r3689, %r3690 }, [ %r146 + 0 ];
|
|
add.s32 %r151, %r131, 2048;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3595, %r3596, %r3597, %r3598 }, [ %r151 + 0 ];
|
|
add.s32 %r156, %r501, 2080;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3627, %r3628, %r3629, %r3630 }, [ %r156 + 0 ];
|
|
add.s32 %r161, %r131, 2112;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3659, %r3660, %r3661, %r3662 }, [ %r161 + 0 ];
|
|
add.s32 %r166, %r501, 2144;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3691, %r3692, %r3693, %r3694 }, [ %r166 + 0 ];
|
|
add.s32 %r171, %r131, 4096;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3599, %r3600, %r3601, %r3602 }, [ %r171 + 0 ];
|
|
add.s32 %r176, %r501, 4128;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3631, %r3632, %r3633, %r3634 }, [ %r176 + 0 ];
|
|
add.s32 %r181, %r131, 4160;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3663, %r3664, %r3665, %r3666 }, [ %r181 + 0 ];
|
|
add.s32 %r186, %r501, 4192;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3695, %r3696, %r3697, %r3698 }, [ %r186 + 0 ];
|
|
add.s32 %r191, %r131, 6144;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3603, %r3604, %r3605, %r3606 }, [ %r191 + 0 ];
|
|
add.s32 %r196, %r501, 6176;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3635, %r3636, %r3637, %r3638 }, [ %r196 + 0 ];
|
|
add.s32 %r201, %r131, 6208;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3667, %r3668, %r3669, %r3670 }, [ %r201 + 0 ];
|
|
add.s32 %r206, %r501, 6240;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3699, %r3700, %r3701, %r3702 }, [ %r206 + 0 ];
|
|
add.s32 %r211, %r131, 8192;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3607, %r3608, %r3609, %r3610 }, [ %r211 + 0 ];
|
|
add.s32 %r216, %r501, 8224;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3639, %r3640, %r3641, %r3642 }, [ %r216 + 0 ];
|
|
add.s32 %r221, %r131, 8256;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3671, %r3672, %r3673, %r3674 }, [ %r221 + 0 ];
|
|
add.s32 %r226, %r501, 8288;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3703, %r3704, %r3705, %r3706 }, [ %r226 + 0 ];
|
|
add.s32 %r231, %r131, 10240;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3611, %r3612, %r3613, %r3614 }, [ %r231 + 0 ];
|
|
add.s32 %r236, %r501, 10272;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3643, %r3644, %r3645, %r3646 }, [ %r236 + 0 ];
|
|
add.s32 %r241, %r131, 10304;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3675, %r3676, %r3677, %r3678 }, [ %r241 + 0 ];
|
|
add.s32 %r246, %r501, 10336;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3707, %r3708, %r3709, %r3710 }, [ %r246 + 0 ];
|
|
add.s32 %r251, %r131, 12288;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3615, %r3616, %r3617, %r3618 }, [ %r251 + 0 ];
|
|
add.s32 %r256, %r501, 12320;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3647, %r3648, %r3649, %r3650 }, [ %r256 + 0 ];
|
|
add.s32 %r261, %r131, 12352;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3679, %r3680, %r3681, %r3682 }, [ %r261 + 0 ];
|
|
add.s32 %r266, %r501, 12384;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3711, %r3712, %r3713, %r3714 }, [ %r266 + 0 ];
|
|
add.s32 %r271, %r131, 14336;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3619, %r3620, %r3621, %r3622 }, [ %r271 + 0 ];
|
|
add.s32 %r276, %r501, 14368;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3651, %r3652, %r3653, %r3654 }, [ %r276 + 0 ];
|
|
add.s32 %r281, %r131, 14400;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3683, %r3684, %r3685, %r3686 }, [ %r281 + 0 ];
|
|
add.s32 %r286, %r501, 14432;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3715, %r3716, %r3717, %r3718 }, [ %r286 + 0 ];
|
|
add.s32 %r291, %r489, %r497;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3719, %r3720, %r3721, %r3722 }, [ %r291 + 0 ];
|
|
add.s32 %r502, %r489, %r500;
|
|
add.s32 %r296, %r502, 32;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3751, %r3752, %r3753, %r3754 }, [ %r296 + 0 ];
|
|
add.s32 %r301, %r291, 64;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3783, %r3784, %r3785, %r3786 }, [ %r301 + 0 ];
|
|
add.s32 %r306, %r502, 96;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3815, %r3816, %r3817, %r3818 }, [ %r306 + 0 ];
|
|
add.s32 %r311, %r291, 2048;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3723, %r3724, %r3725, %r3726 }, [ %r311 + 0 ];
|
|
add.s32 %r316, %r502, 2080;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3755, %r3756, %r3757, %r3758 }, [ %r316 + 0 ];
|
|
add.s32 %r321, %r291, 2112;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3787, %r3788, %r3789, %r3790 }, [ %r321 + 0 ];
|
|
add.s32 %r326, %r502, 2144;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3819, %r3820, %r3821, %r3822 }, [ %r326 + 0 ];
|
|
add.s32 %r331, %r291, 4096;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3727, %r3728, %r3729, %r3730 }, [ %r331 + 0 ];
|
|
add.s32 %r336, %r502, 4128;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3759, %r3760, %r3761, %r3762 }, [ %r336 + 0 ];
|
|
add.s32 %r341, %r291, 4160;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3791, %r3792, %r3793, %r3794 }, [ %r341 + 0 ];
|
|
add.s32 %r346, %r502, 4192;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3823, %r3824, %r3825, %r3826 }, [ %r346 + 0 ];
|
|
add.s32 %r351, %r291, 6144;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3731, %r3732, %r3733, %r3734 }, [ %r351 + 0 ];
|
|
add.s32 %r356, %r502, 6176;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3763, %r3764, %r3765, %r3766 }, [ %r356 + 0 ];
|
|
add.s32 %r361, %r291, 6208;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3795, %r3796, %r3797, %r3798 }, [ %r361 + 0 ];
|
|
add.s32 %r366, %r502, 6240;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3827, %r3828, %r3829, %r3830 }, [ %r366 + 0 ];
|
|
add.s32 %r371, %r291, 8192;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3735, %r3736, %r3737, %r3738 }, [ %r371 + 0 ];
|
|
add.s32 %r376, %r502, 8224;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3767, %r3768, %r3769, %r3770 }, [ %r376 + 0 ];
|
|
add.s32 %r381, %r291, 8256;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3799, %r3800, %r3801, %r3802 }, [ %r381 + 0 ];
|
|
add.s32 %r386, %r502, 8288;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3831, %r3832, %r3833, %r3834 }, [ %r386 + 0 ];
|
|
add.s32 %r391, %r291, 10240;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3739, %r3740, %r3741, %r3742 }, [ %r391 + 0 ];
|
|
add.s32 %r396, %r502, 10272;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3771, %r3772, %r3773, %r3774 }, [ %r396 + 0 ];
|
|
add.s32 %r401, %r291, 10304;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3803, %r3804, %r3805, %r3806 }, [ %r401 + 0 ];
|
|
add.s32 %r406, %r502, 10336;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3835, %r3836, %r3837, %r3838 }, [ %r406 + 0 ];
|
|
add.s32 %r411, %r291, 12288;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3743, %r3744, %r3745, %r3746 }, [ %r411 + 0 ];
|
|
add.s32 %r416, %r502, 12320;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3775, %r3776, %r3777, %r3778 }, [ %r416 + 0 ];
|
|
add.s32 %r421, %r291, 12352;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3807, %r3808, %r3809, %r3810 }, [ %r421 + 0 ];
|
|
add.s32 %r426, %r502, 12384;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3839, %r3840, %r3841, %r3842 }, [ %r426 + 0 ];
|
|
add.s32 %r431, %r291, 14336;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3747, %r3748, %r3749, %r3750 }, [ %r431 + 0 ];
|
|
add.s32 %r436, %r502, 14368;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3779, %r3780, %r3781, %r3782 }, [ %r436 + 0 ];
|
|
add.s32 %r441, %r291, 14400;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3811, %r3812, %r3813, %r3814 }, [ %r441 + 0 ];
|
|
add.s32 %r446, %r502, 14432;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3843, %r3844, %r3845, %r3846 }, [ %r446 + 0 ];
|
|
setp.lt.s32 %p9, %r11, 1;
|
|
mov.f32 %f129, 0f00000000;
|
|
mov.f32 %f515, %f129;
|
|
mov.f32 %f516, %f129;
|
|
mov.f32 %f517, %f129;
|
|
mov.f32 %f518, %f129;
|
|
mov.f32 %f519, %f129;
|
|
mov.f32 %f520, %f129;
|
|
mov.f32 %f521, %f129;
|
|
mov.f32 %f522, %f129;
|
|
mov.f32 %f523, %f129;
|
|
mov.f32 %f524, %f129;
|
|
mov.f32 %f525, %f129;
|
|
mov.f32 %f526, %f129;
|
|
mov.f32 %f527, %f129;
|
|
mov.f32 %f528, %f129;
|
|
mov.f32 %f529, %f129;
|
|
mov.f32 %f530, %f129;
|
|
mov.f32 %f531, %f129;
|
|
mov.f32 %f532, %f129;
|
|
mov.f32 %f533, %f129;
|
|
mov.f32 %f534, %f129;
|
|
mov.f32 %f535, %f129;
|
|
mov.f32 %f536, %f129;
|
|
mov.f32 %f537, %f129;
|
|
mov.f32 %f538, %f129;
|
|
mov.f32 %f539, %f129;
|
|
mov.f32 %f540, %f129;
|
|
mov.f32 %f541, %f129;
|
|
mov.f32 %f542, %f129;
|
|
mov.f32 %f543, %f129;
|
|
mov.f32 %f544, %f129;
|
|
mov.f32 %f545, %f129;
|
|
mov.f32 %f546, %f129;
|
|
@%p9 bra LBB0_3;
|
|
ld.param.f32 %f97, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3];
|
|
ld.param.u64 %rd27, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0];
|
|
ld.param.u64 %rd30, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5];
|
|
and.b32 %r7, %r6, 112;
|
|
or.b32 %r9, %r7, %r8;
|
|
add.s64 %rd34, %rd27, %rd33;
|
|
add.s64 %rd37, %rd30, %rd33;
|
|
add.s64 %rd46, %rd34, %rd42;
|
|
add.s64 %rd47, %rd34, %rd43;
|
|
add.s64 %rd48, %rd34, %rd44;
|
|
add.s64 %rd49, %rd34, %rd45;
|
|
add.s64 %rd50, %rd37, %rd42;
|
|
add.s64 %rd51, %rd37, %rd43;
|
|
add.s64 %rd52, %rd37, %rd44;
|
|
add.s64 %rd53, %rd37, %rd45;
|
|
mov.b32 %hh1, %r3591;
|
|
mov.b32 %hh2, %r3592;
|
|
mov.b32 %hh3, %r3593;
|
|
mov.b32 %hh4, %r3594;
|
|
mov.b32 %hh5, %r3623;
|
|
mov.b32 %hh6, %r3624;
|
|
mov.b32 %hh7, %r3625;
|
|
mov.b32 %hh8, %r3626;
|
|
mov.b32 %hh9, %r3655;
|
|
mov.b32 %hh10, %r3656;
|
|
mov.b32 %hh11, %r3657;
|
|
mov.b32 %hh12, %r3658;
|
|
mov.b32 %hh13, %r3687;
|
|
mov.b32 %hh14, %r3688;
|
|
mov.b32 %hh15, %r3689;
|
|
mov.b32 %hh16, %r3690;
|
|
mov.b32 %hh17, %r3595;
|
|
mov.b32 %hh18, %r3596;
|
|
mov.b32 %hh19, %r3597;
|
|
mov.b32 %hh20, %r3598;
|
|
mov.b32 %hh21, %r3627;
|
|
mov.b32 %hh22, %r3628;
|
|
mov.b32 %hh23, %r3629;
|
|
mov.b32 %hh24, %r3630;
|
|
mov.b32 %hh25, %r3659;
|
|
mov.b32 %hh26, %r3660;
|
|
mov.b32 %hh27, %r3661;
|
|
mov.b32 %hh28, %r3662;
|
|
mov.b32 %hh29, %r3691;
|
|
mov.b32 %hh30, %r3692;
|
|
mov.b32 %hh31, %r3693;
|
|
mov.b32 %hh32, %r3694;
|
|
mov.b32 %hh33, %r3599;
|
|
mov.b32 %hh34, %r3600;
|
|
mov.b32 %hh35, %r3601;
|
|
mov.b32 %hh36, %r3602;
|
|
mov.b32 %hh37, %r3631;
|
|
mov.b32 %hh38, %r3632;
|
|
mov.b32 %hh39, %r3633;
|
|
mov.b32 %hh40, %r3634;
|
|
mov.b32 %hh41, %r3663;
|
|
mov.b32 %hh42, %r3664;
|
|
mov.b32 %hh43, %r3665;
|
|
mov.b32 %hh44, %r3666;
|
|
mov.b32 %hh45, %r3695;
|
|
mov.b32 %hh46, %r3696;
|
|
mov.b32 %hh47, %r3697;
|
|
mov.b32 %hh48, %r3698;
|
|
mov.b32 %hh49, %r3603;
|
|
mov.b32 %hh50, %r3604;
|
|
mov.b32 %hh51, %r3605;
|
|
mov.b32 %hh52, %r3606;
|
|
mov.b32 %hh53, %r3635;
|
|
mov.b32 %hh54, %r3636;
|
|
mov.b32 %hh55, %r3637;
|
|
mov.b32 %hh56, %r3638;
|
|
mov.b32 %hh57, %r3667;
|
|
mov.b32 %hh58, %r3668;
|
|
mov.b32 %hh59, %r3669;
|
|
mov.b32 %hh60, %r3670;
|
|
mov.b32 %hh61, %r3699;
|
|
mov.b32 %hh62, %r3700;
|
|
mov.b32 %hh63, %r3701;
|
|
mov.b32 %hh64, %r3702;
|
|
mov.b32 %hh65, %r3607;
|
|
mov.b32 %hh66, %r3608;
|
|
mov.b32 %hh67, %r3609;
|
|
mov.b32 %hh68, %r3610;
|
|
mov.b32 %hh69, %r3639;
|
|
mov.b32 %hh70, %r3640;
|
|
mov.b32 %hh71, %r3641;
|
|
mov.b32 %hh72, %r3642;
|
|
mov.b32 %hh73, %r3671;
|
|
mov.b32 %hh74, %r3672;
|
|
mov.b32 %hh75, %r3673;
|
|
mov.b32 %hh76, %r3674;
|
|
mov.b32 %hh77, %r3703;
|
|
mov.b32 %hh78, %r3704;
|
|
mov.b32 %hh79, %r3705;
|
|
mov.b32 %hh80, %r3706;
|
|
mov.b32 %hh81, %r3611;
|
|
mov.b32 %hh82, %r3612;
|
|
mov.b32 %hh83, %r3613;
|
|
mov.b32 %hh84, %r3614;
|
|
mov.b32 %hh85, %r3643;
|
|
mov.b32 %hh86, %r3644;
|
|
mov.b32 %hh87, %r3645;
|
|
mov.b32 %hh88, %r3646;
|
|
mov.b32 %hh89, %r3675;
|
|
mov.b32 %hh90, %r3676;
|
|
mov.b32 %hh91, %r3677;
|
|
mov.b32 %hh92, %r3678;
|
|
mov.b32 %hh93, %r3707;
|
|
mov.b32 %hh94, %r3708;
|
|
mov.b32 %hh95, %r3709;
|
|
mov.b32 %hh96, %r3710;
|
|
mov.b32 %hh97, %r3615;
|
|
mov.b32 %hh98, %r3616;
|
|
mov.b32 %hh99, %r3617;
|
|
mov.b32 %hh100, %r3618;
|
|
mov.b32 %hh101, %r3647;
|
|
mov.b32 %hh102, %r3648;
|
|
mov.b32 %hh103, %r3649;
|
|
mov.b32 %hh104, %r3650;
|
|
mov.b32 %hh105, %r3679;
|
|
mov.b32 %hh106, %r3680;
|
|
mov.b32 %hh107, %r3681;
|
|
mov.b32 %hh108, %r3682;
|
|
mov.b32 %hh109, %r3711;
|
|
mov.b32 %hh110, %r3712;
|
|
mov.b32 %hh111, %r3713;
|
|
mov.b32 %hh112, %r3714;
|
|
mov.b32 %hh113, %r3619;
|
|
mov.b32 %hh114, %r3620;
|
|
mov.b32 %hh115, %r3621;
|
|
mov.b32 %hh116, %r3622;
|
|
mov.b32 %hh117, %r3651;
|
|
mov.b32 %hh118, %r3652;
|
|
mov.b32 %hh119, %r3653;
|
|
mov.b32 %hh120, %r3654;
|
|
mov.b32 %hh121, %r3683;
|
|
mov.b32 %hh122, %r3684;
|
|
mov.b32 %hh123, %r3685;
|
|
mov.b32 %hh124, %r3686;
|
|
mov.b32 %hh125, %r3715;
|
|
mov.b32 %hh126, %r3716;
|
|
mov.b32 %hh127, %r3717;
|
|
mov.b32 %hh128, %r3718;
|
|
mov.b32 %hh129, %r3719;
|
|
mov.b32 %hh130, %r3720;
|
|
mov.b32 %hh131, %r3721;
|
|
mov.b32 %hh132, %r3722;
|
|
mov.b32 %hh133, %r3751;
|
|
mov.b32 %hh134, %r3752;
|
|
mov.b32 %hh135, %r3753;
|
|
mov.b32 %hh136, %r3754;
|
|
mov.b32 %hh137, %r3783;
|
|
mov.b32 %hh138, %r3784;
|
|
mov.b32 %hh139, %r3785;
|
|
mov.b32 %hh140, %r3786;
|
|
mov.b32 %hh141, %r3815;
|
|
mov.b32 %hh142, %r3816;
|
|
mov.b32 %hh143, %r3817;
|
|
mov.b32 %hh144, %r3818;
|
|
mov.b32 %hh145, %r3723;
|
|
mov.b32 %hh146, %r3724;
|
|
mov.b32 %hh147, %r3725;
|
|
mov.b32 %hh148, %r3726;
|
|
mov.b32 %hh149, %r3755;
|
|
mov.b32 %hh150, %r3756;
|
|
mov.b32 %hh151, %r3757;
|
|
mov.b32 %hh152, %r3758;
|
|
mov.b32 %hh153, %r3787;
|
|
mov.b32 %hh154, %r3788;
|
|
mov.b32 %hh155, %r3789;
|
|
mov.b32 %hh156, %r3790;
|
|
mov.b32 %hh157, %r3819;
|
|
mov.b32 %hh158, %r3820;
|
|
mov.b32 %hh159, %r3821;
|
|
mov.b32 %hh160, %r3822;
|
|
mov.b32 %hh161, %r3727;
|
|
mov.b32 %hh162, %r3728;
|
|
mov.b32 %hh163, %r3729;
|
|
mov.b32 %hh164, %r3730;
|
|
mov.b32 %hh165, %r3759;
|
|
mov.b32 %hh166, %r3760;
|
|
mov.b32 %hh167, %r3761;
|
|
mov.b32 %hh168, %r3762;
|
|
mov.b32 %hh169, %r3791;
|
|
mov.b32 %hh170, %r3792;
|
|
mov.b32 %hh171, %r3793;
|
|
mov.b32 %hh172, %r3794;
|
|
mov.b32 %hh173, %r3823;
|
|
mov.b32 %hh174, %r3824;
|
|
mov.b32 %hh175, %r3825;
|
|
mov.b32 %hh176, %r3826;
|
|
mov.b32 %hh177, %r3731;
|
|
mov.b32 %hh178, %r3732;
|
|
mov.b32 %hh179, %r3733;
|
|
mov.b32 %hh180, %r3734;
|
|
mov.b32 %hh181, %r3763;
|
|
mov.b32 %hh182, %r3764;
|
|
mov.b32 %hh183, %r3765;
|
|
mov.b32 %hh184, %r3766;
|
|
mov.b32 %hh185, %r3795;
|
|
mov.b32 %hh186, %r3796;
|
|
mov.b32 %hh187, %r3797;
|
|
mov.b32 %hh188, %r3798;
|
|
mov.b32 %hh189, %r3827;
|
|
mov.b32 %hh190, %r3828;
|
|
mov.b32 %hh191, %r3829;
|
|
mov.b32 %hh192, %r3830;
|
|
mov.b32 %hh193, %r3735;
|
|
mov.b32 %hh194, %r3736;
|
|
mov.b32 %hh195, %r3737;
|
|
mov.b32 %hh196, %r3738;
|
|
mov.b32 %hh197, %r3767;
|
|
mov.b32 %hh198, %r3768;
|
|
mov.b32 %hh199, %r3769;
|
|
mov.b32 %hh200, %r3770;
|
|
mov.b32 %hh201, %r3799;
|
|
mov.b32 %hh202, %r3800;
|
|
mov.b32 %hh203, %r3801;
|
|
mov.b32 %hh204, %r3802;
|
|
mov.b32 %hh205, %r3831;
|
|
mov.b32 %hh206, %r3832;
|
|
mov.b32 %hh207, %r3833;
|
|
mov.b32 %hh208, %r3834;
|
|
mov.b32 %hh209, %r3739;
|
|
mov.b32 %hh210, %r3740;
|
|
mov.b32 %hh211, %r3741;
|
|
mov.b32 %hh212, %r3742;
|
|
mov.b32 %hh213, %r3771;
|
|
mov.b32 %hh214, %r3772;
|
|
mov.b32 %hh215, %r3773;
|
|
mov.b32 %hh216, %r3774;
|
|
mov.b32 %hh217, %r3803;
|
|
mov.b32 %hh218, %r3804;
|
|
mov.b32 %hh219, %r3805;
|
|
mov.b32 %hh220, %r3806;
|
|
mov.b32 %hh221, %r3835;
|
|
mov.b32 %hh222, %r3836;
|
|
mov.b32 %hh223, %r3837;
|
|
mov.b32 %hh224, %r3838;
|
|
mov.b32 %hh225, %r3743;
|
|
mov.b32 %hh226, %r3744;
|
|
mov.b32 %hh227, %r3745;
|
|
mov.b32 %hh228, %r3746;
|
|
mov.b32 %hh229, %r3775;
|
|
mov.b32 %hh230, %r3776;
|
|
mov.b32 %hh231, %r3777;
|
|
mov.b32 %hh232, %r3778;
|
|
mov.b32 %hh233, %r3807;
|
|
mov.b32 %hh234, %r3808;
|
|
mov.b32 %hh235, %r3809;
|
|
mov.b32 %hh236, %r3810;
|
|
mov.b32 %hh237, %r3839;
|
|
mov.b32 %hh238, %r3840;
|
|
mov.b32 %hh239, %r3841;
|
|
mov.b32 %hh240, %r3842;
|
|
mov.b32 %hh241, %r3747;
|
|
mov.b32 %hh242, %r3748;
|
|
mov.b32 %hh243, %r3749;
|
|
mov.b32 %hh244, %r3750;
|
|
mov.b32 %hh245, %r3779;
|
|
mov.b32 %hh246, %r3780;
|
|
mov.b32 %hh247, %r3781;
|
|
mov.b32 %hh248, %r3782;
|
|
mov.b32 %hh249, %r3811;
|
|
mov.b32 %hh250, %r3812;
|
|
mov.b32 %hh251, %r3813;
|
|
mov.b32 %hh252, %r3814;
|
|
mov.b32 %hh253, %r3843;
|
|
mov.b32 %hh254, %r3844;
|
|
mov.b32 %hh255, %r3845;
|
|
mov.b32 %hh256, %r3846;
|
|
and.b32 %r504, %r3, 7;
|
|
xor.b32 %r505, %r504, %r4;
|
|
shl.b32 %r506, %r505, 3;
|
|
or.b32 %r507, %r506, %r12;
|
|
shl.b32 %r508, %r507, 1;
|
|
add.s32 %r20, %r478, %r508;
|
|
or.b32 %r510, %r506, %r13;
|
|
shl.b32 %r511, %r510, 1;
|
|
add.s32 %r21, %r478, %r511;
|
|
or.b32 %r512, %r506, %r14;
|
|
shl.b32 %r513, %r512, 1;
|
|
add.s32 %r22, %r478, %r513;
|
|
or.b32 %r514, %r506, %r15;
|
|
shl.b32 %r515, %r514, 1;
|
|
add.s32 %r23, %r478, %r515;
|
|
or.b32 %r516, %r19, %r4;
|
|
or.b32 %r517, %r516, %r7;
|
|
xor.b32 %r518, %r17, %r4;
|
|
shl.b32 %r519, %r518, 4;
|
|
shl.b32 %r520, %r517, 7;
|
|
or.b32 %r521, %r520, %r519;
|
|
add.s32 %r599, %r478, %r521;
|
|
or.b32 %r522, %r17, 2;
|
|
xor.b32 %r523, %r522, %r4;
|
|
shl.b32 %r524, %r523, 4;
|
|
or.b32 %r525, %r524, %r520;
|
|
add.s32 %r604, %r478, %r525;
|
|
or.b32 %r526, %r17, 4;
|
|
xor.b32 %r527, %r526, %r4;
|
|
shl.b32 %r528, %r527, 4;
|
|
or.b32 %r529, %r528, %r520;
|
|
add.s32 %r609, %r478, %r529;
|
|
or.b32 %r530, %r17, 6;
|
|
xor.b32 %r531, %r530, %r4;
|
|
shl.b32 %r532, %r531, 4;
|
|
or.b32 %r533, %r532, %r520;
|
|
add.s32 %r614, %r478, %r533;
|
|
shl.b32 %r534, %r9, 1;
|
|
shl.b32 %r535, %r10, 8;
|
|
or.b32 %r536, %r535, %r534;
|
|
add.s32 %r28, %r478, %r536;
|
|
or.b32 %r537, %r536, 256;
|
|
add.s32 %r30, %r478, %r537;
|
|
add.s32 %r31, %r28, 2048;
|
|
or.b32 %r538, %r536, 2304;
|
|
add.s32 %r32, %r478, %r538;
|
|
add.s32 %r33, %r28, 4096;
|
|
or.b32 %r539, %r536, 4352;
|
|
add.s32 %r34, %r478, %r539;
|
|
add.s32 %r35, %r28, 6144;
|
|
or.b32 %r540, %r536, 6400;
|
|
add.s32 %r36, %r478, %r540;
|
|
add.s32 %r37, %r28, 8192;
|
|
or.b32 %r541, %r536, 8448;
|
|
add.s32 %r38, %r478, %r541;
|
|
add.s32 %r39, %r28, 10240;
|
|
or.b32 %r542, %r536, 10496;
|
|
add.s32 %r40, %r478, %r542;
|
|
add.s32 %r41, %r28, 12288;
|
|
or.b32 %r543, %r536, 12544;
|
|
add.s32 %r42, %r478, %r543;
|
|
add.s32 %r43, %r28, 14336;
|
|
or.b32 %r544, %r536, 14592;
|
|
add.s32 %r44, %r478, %r544;
|
|
add.s32 %r45, %r28, 16384;
|
|
or.b32 %r545, %r536, 16640;
|
|
add.s32 %r46, %r478, %r545;
|
|
add.s32 %r47, %r28, 18432;
|
|
or.b32 %r546, %r536, 18688;
|
|
add.s32 %r48, %r478, %r546;
|
|
add.s32 %r49, %r28, 20480;
|
|
or.b32 %r547, %r536, 20736;
|
|
add.s32 %r50, %r478, %r547;
|
|
add.s32 %r51, %r28, 22528;
|
|
or.b32 %r548, %r536, 22784;
|
|
add.s32 %r52, %r478, %r548;
|
|
add.s32 %r53, %r28, 24576;
|
|
or.b32 %r549, %r536, 24832;
|
|
add.s32 %r54, %r478, %r549;
|
|
add.s32 %r55, %r28, 26624;
|
|
or.b32 %r550, %r536, 26880;
|
|
add.s32 %r56, %r478, %r550;
|
|
add.s32 %r57, %r28, 28672;
|
|
or.b32 %r551, %r536, 28928;
|
|
add.s32 %r58, %r478, %r551;
|
|
add.s32 %r59, %r28, 30720;
|
|
or.b32 %r552, %r536, 30976;
|
|
add.s32 %r60, %r478, %r552;
|
|
shl.b32 %r553, %r2, 1;
|
|
and.b32 %r554, %r553, 6;
|
|
or.b32 %r555, %r554, %r16;
|
|
shl.b32 %r556, %r555, 10;
|
|
shl.b32 %r557, %r4, 7;
|
|
or.b32 %r558, %r556, %r557;
|
|
or.b32 %r559, %r558, %r18;
|
|
shl.b32 %r560, %r559, 1;
|
|
add.s32 %r1531, %r478, %r560;
|
|
shl.b32 %r561, %r522, 4;
|
|
shl.b32 %r562, %r558, 1;
|
|
or.b32 %r563, %r561, %r562;
|
|
add.s32 %r1536, %r478, %r563;
|
|
shl.b32 %r564, %r526, 4;
|
|
or.b32 %r565, %r564, %r562;
|
|
add.s32 %r1541, %r478, %r565;
|
|
shl.b32 %r566, %r530, 4;
|
|
or.b32 %r567, %r566, %r562;
|
|
add.s32 %r1546, %r478, %r567;
|
|
add.s32 %r1551, %r1531, 128;
|
|
add.s32 %r1556, %r1531, 160;
|
|
add.s32 %r1561, %r1531, 192;
|
|
add.s32 %r1566, %r1531, 224;
|
|
add.s32 %r1571, %r1531, 16384;
|
|
add.s32 %r1576, %r1536, 16384;
|
|
add.s32 %r1581, %r1541, 16384;
|
|
add.s32 %r1586, %r1546, 16384;
|
|
add.s32 %r1591, %r1531, 16512;
|
|
add.s32 %r1596, %r1531, 16544;
|
|
add.s32 %r1601, %r1531, 16576;
|
|
add.s32 %r1606, %r1531, 16608;
|
|
bfe.u32 %r568, %r1, 7, 1;
|
|
shl.b32 %r569, %r17, 1;
|
|
or.b32 %r570, %r569, %r568;
|
|
xor.b32 %r571, %r570, %r4;
|
|
shl.b32 %r572, %r571, 4;
|
|
shl.b32 %r573, %r516, 7;
|
|
or.b32 %r574, %r572, %r573;
|
|
add.s32 %r1611, %r478, %r574;
|
|
add.s32 %r1616, %r1611, 2048;
|
|
add.s32 %r1621, %r1611, 4096;
|
|
add.s32 %r1626, %r1611, 6144;
|
|
add.s32 %r1631, %r1611, 8192;
|
|
add.s32 %r1636, %r1611, 10240;
|
|
add.s32 %r1641, %r1611, 12288;
|
|
add.s32 %r1646, %r1611, 14336;
|
|
or.b32 %r575, %r570, 4;
|
|
xor.b32 %r576, %r575, %r4;
|
|
shl.b32 %r577, %r576, 4;
|
|
or.b32 %r578, %r577, %r573;
|
|
add.s32 %r1651, %r478, %r578;
|
|
add.s32 %r1656, %r1651, 2048;
|
|
add.s32 %r1661, %r1651, 4096;
|
|
add.s32 %r1666, %r1651, 6144;
|
|
add.s32 %r1671, %r1651, 8192;
|
|
add.s32 %r1676, %r1651, 10240;
|
|
add.s32 %r1681, %r1651, 12288;
|
|
add.s32 %r1686, %r1651, 14336;
|
|
mov.f32 %f515, 0f00000000;
|
|
mov.u32 %r625, 0;
|
|
mov.f32 %f516, %f515;
|
|
mov.f32 %f517, %f515;
|
|
mov.f32 %f518, %f515;
|
|
mov.f32 %f519, %f515;
|
|
mov.f32 %f520, %f515;
|
|
mov.f32 %f521, %f515;
|
|
mov.f32 %f522, %f515;
|
|
mov.f32 %f523, %f515;
|
|
mov.f32 %f524, %f515;
|
|
mov.f32 %f525, %f515;
|
|
mov.f32 %f526, %f515;
|
|
mov.f32 %f527, %f515;
|
|
mov.f32 %f528, %f515;
|
|
mov.f32 %f529, %f515;
|
|
mov.f32 %f530, %f515;
|
|
mov.f32 %f531, %f515;
|
|
mov.f32 %f532, %f515;
|
|
mov.f32 %f533, %f515;
|
|
mov.f32 %f534, %f515;
|
|
mov.f32 %f535, %f515;
|
|
mov.f32 %f536, %f515;
|
|
mov.f32 %f537, %f515;
|
|
mov.f32 %f538, %f515;
|
|
mov.f32 %f539, %f515;
|
|
mov.f32 %f540, %f515;
|
|
mov.f32 %f541, %f515;
|
|
mov.f32 %f542, %f515;
|
|
mov.f32 %f543, %f515;
|
|
mov.f32 %f544, %f515;
|
|
mov.f32 %f545, %f515;
|
|
mov.f32 %f546, %f515;
|
|
mov.u32 %r3847, %r625;
|
|
LBB0_2:
|
|
@%p19 ld.global.v4.b32 { %r3499, %r3500, %r3501, %r3502 }, [ %rd46 + 0 ];
|
|
mov.b32 %hh257, %r3499;
|
|
mov.b32 %hh258, %r3500;
|
|
mov.b32 %hh259, %r3501;
|
|
mov.b32 %hh260, %r3502;
|
|
@%p19 ld.global.v4.b32 { %r3503, %r3504, %r3505, %r3506 }, [ %rd47 + 0 ];
|
|
mov.b32 %hh261, %r3503;
|
|
mov.b32 %hh262, %r3504;
|
|
mov.b32 %hh263, %r3505;
|
|
mov.b32 %hh264, %r3506;
|
|
@%p19 ld.global.v4.b32 { %r3507, %r3508, %r3509, %r3510 }, [ %rd48 + 0 ];
|
|
mov.b32 %hh265, %r3507;
|
|
mov.b32 %hh266, %r3508;
|
|
mov.b32 %hh267, %r3509;
|
|
mov.b32 %hh268, %r3510;
|
|
@%p19 ld.global.v4.b32 { %r3511, %r3512, %r3513, %r3514 }, [ %rd49 + 0 ];
|
|
mov.b32 %hh269, %r3511;
|
|
mov.b32 %hh270, %r3512;
|
|
mov.b32 %hh271, %r3513;
|
|
mov.b32 %hh272, %r3514;
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r20], {%r3499, %r3500, %r3501, %r3502};
|
|
st.shared.v4.b32 [%r21], {%r3503, %r3504, %r3505, %r3506};
|
|
st.shared.v4.b32 [%r22], {%r3507, %r3508, %r3509, %r3510};
|
|
st.shared.v4.b32 [%r23], {%r3511, %r3512, %r3513, %r3514};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r619, %r620, %r621, %r622 }, [ %r599 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r843, %r844, %r845, %r846 }, [ %r604 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1067, %r1068, %r1069, %r1070 }, [ %r609 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1291, %r1292, %r1293, %r1294 }, [ %r614 + 0 ];
|
|
mov.u32 %r839, %r625;
|
|
mov.u32 %r840, %r625;
|
|
mov.u32 %r841, %r625;
|
|
mov.u32 %r842, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r619, %r620, %r621, %r622 }, { %r3591, %r3592 }, { %r839, %r840, %r841, %r842 };
|
|
mov.u32 %r853, %r625;
|
|
mov.u32 %r854, %r625;
|
|
mov.u32 %r855, %r625;
|
|
mov.u32 %r856, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r619, %r620, %r621, %r622 }, { %r3593, %r3594 }, { %r853, %r854, %r855, %r856 };
|
|
mov.u32 %r867, %r625;
|
|
mov.u32 %r868, %r625;
|
|
mov.u32 %r869, %r625;
|
|
mov.u32 %r870, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r619, %r620, %r621, %r622 }, { %r3595, %r3596 }, { %r867, %r868, %r869, %r870 };
|
|
mov.u32 %r881, %r625;
|
|
mov.u32 %r882, %r625;
|
|
mov.u32 %r883, %r625;
|
|
mov.u32 %r884, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r619, %r620, %r621, %r622 }, { %r3597, %r3598 }, { %r881, %r882, %r883, %r884 };
|
|
mov.u32 %r895, %r625;
|
|
mov.u32 %r896, %r625;
|
|
mov.u32 %r897, %r625;
|
|
mov.u32 %r898, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r619, %r620, %r621, %r622 }, { %r3599, %r3600 }, { %r895, %r896, %r897, %r898 };
|
|
mov.u32 %r909, %r625;
|
|
mov.u32 %r910, %r625;
|
|
mov.u32 %r911, %r625;
|
|
mov.u32 %r912, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r619, %r620, %r621, %r622 }, { %r3601, %r3602 }, { %r909, %r910, %r911, %r912 };
|
|
mov.u32 %r923, %r625;
|
|
mov.u32 %r924, %r625;
|
|
mov.u32 %r925, %r625;
|
|
mov.u32 %r926, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r619, %r620, %r621, %r622 }, { %r3603, %r3604 }, { %r923, %r924, %r925, %r926 };
|
|
mov.u32 %r937, %r625;
|
|
mov.u32 %r938, %r625;
|
|
mov.u32 %r939, %r625;
|
|
mov.u32 %r940, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r619, %r620, %r621, %r622 }, { %r3605, %r3606 }, { %r937, %r938, %r939, %r940 };
|
|
mov.u32 %r951, %r625;
|
|
mov.u32 %r952, %r625;
|
|
mov.u32 %r953, %r625;
|
|
mov.u32 %r954, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r619, %r620, %r621, %r622 }, { %r3607, %r3608 }, { %r951, %r952, %r953, %r954 };
|
|
mov.u32 %r965, %r625;
|
|
mov.u32 %r966, %r625;
|
|
mov.u32 %r967, %r625;
|
|
mov.u32 %r968, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r619, %r620, %r621, %r622 }, { %r3609, %r3610 }, { %r965, %r966, %r967, %r968 };
|
|
mov.u32 %r979, %r625;
|
|
mov.u32 %r980, %r625;
|
|
mov.u32 %r981, %r625;
|
|
mov.u32 %r982, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r619, %r620, %r621, %r622 }, { %r3611, %r3612 }, { %r979, %r980, %r981, %r982 };
|
|
mov.u32 %r993, %r625;
|
|
mov.u32 %r994, %r625;
|
|
mov.u32 %r995, %r625;
|
|
mov.u32 %r996, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r619, %r620, %r621, %r622 }, { %r3613, %r3614 }, { %r993, %r994, %r995, %r996 };
|
|
mov.u32 %r1007, %r625;
|
|
mov.u32 %r1008, %r625;
|
|
mov.u32 %r1009, %r625;
|
|
mov.u32 %r1010, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r619, %r620, %r621, %r622 }, { %r3615, %r3616 }, { %r1007, %r1008, %r1009, %r1010 };
|
|
mov.u32 %r1021, %r625;
|
|
mov.u32 %r1022, %r625;
|
|
mov.u32 %r1023, %r625;
|
|
mov.u32 %r1024, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r619, %r620, %r621, %r622 }, { %r3617, %r3618 }, { %r1021, %r1022, %r1023, %r1024 };
|
|
mov.u32 %r1035, %r625;
|
|
mov.u32 %r1036, %r625;
|
|
mov.u32 %r1037, %r625;
|
|
mov.u32 %r1038, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r619, %r620, %r621, %r622 }, { %r3619, %r3620 }, { %r1035, %r1036, %r1037, %r1038 };
|
|
mov.u32 %r1049, %r625;
|
|
mov.u32 %r1050, %r625;
|
|
mov.u32 %r1051, %r625;
|
|
mov.u32 %r1052, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r619, %r620, %r621, %r622 }, { %r3621, %r3622 }, { %r1049, %r1050, %r1051, %r1052 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r843, %r844, %r845, %r846 }, { %r3623, %r3624 }, { %r839, %r840, %r841, %r842 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r843, %r844, %r845, %r846 }, { %r3625, %r3626 }, { %r853, %r854, %r855, %r856 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r843, %r844, %r845, %r846 }, { %r3627, %r3628 }, { %r867, %r868, %r869, %r870 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r843, %r844, %r845, %r846 }, { %r3629, %r3630 }, { %r881, %r882, %r883, %r884 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r843, %r844, %r845, %r846 }, { %r3631, %r3632 }, { %r895, %r896, %r897, %r898 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r843, %r844, %r845, %r846 }, { %r3633, %r3634 }, { %r909, %r910, %r911, %r912 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r843, %r844, %r845, %r846 }, { %r3635, %r3636 }, { %r923, %r924, %r925, %r926 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r843, %r844, %r845, %r846 }, { %r3637, %r3638 }, { %r937, %r938, %r939, %r940 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r843, %r844, %r845, %r846 }, { %r3639, %r3640 }, { %r951, %r952, %r953, %r954 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r843, %r844, %r845, %r846 }, { %r3641, %r3642 }, { %r965, %r966, %r967, %r968 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r843, %r844, %r845, %r846 }, { %r3643, %r3644 }, { %r979, %r980, %r981, %r982 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r843, %r844, %r845, %r846 }, { %r3645, %r3646 }, { %r993, %r994, %r995, %r996 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r843, %r844, %r845, %r846 }, { %r3647, %r3648 }, { %r1007, %r1008, %r1009, %r1010 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r843, %r844, %r845, %r846 }, { %r3649, %r3650 }, { %r1021, %r1022, %r1023, %r1024 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r843, %r844, %r845, %r846 }, { %r3651, %r3652 }, { %r1035, %r1036, %r1037, %r1038 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r843, %r844, %r845, %r846 }, { %r3653, %r3654 }, { %r1049, %r1050, %r1051, %r1052 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3655, %r3656 }, { %r839, %r840, %r841, %r842 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3657, %r3658 }, { %r853, %r854, %r855, %r856 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3659, %r3660 }, { %r867, %r868, %r869, %r870 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3661, %r3662 }, { %r881, %r882, %r883, %r884 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3663, %r3664 }, { %r895, %r896, %r897, %r898 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3665, %r3666 }, { %r909, %r910, %r911, %r912 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3667, %r3668 }, { %r923, %r924, %r925, %r926 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3669, %r3670 }, { %r937, %r938, %r939, %r940 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3671, %r3672 }, { %r951, %r952, %r953, %r954 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3673, %r3674 }, { %r965, %r966, %r967, %r968 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3675, %r3676 }, { %r979, %r980, %r981, %r982 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3677, %r3678 }, { %r993, %r994, %r995, %r996 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3679, %r3680 }, { %r1007, %r1008, %r1009, %r1010 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3681, %r3682 }, { %r1021, %r1022, %r1023, %r1024 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3683, %r3684 }, { %r1035, %r1036, %r1037, %r1038 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1067, %r1068, %r1069, %r1070 }, { %r3685, %r3686 }, { %r1049, %r1050, %r1051, %r1052 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r839, %r840, %r841, %r842 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3687, %r3688 }, { %r839, %r840, %r841, %r842 };
|
|
mov.b32 %f162, %r842;
|
|
mov.b32 %f163, %r841;
|
|
mov.b32 %f164, %r840;
|
|
mov.b32 %f165, %r839;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r853, %r854, %r855, %r856 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3689, %r3690 }, { %r853, %r854, %r855, %r856 };
|
|
mov.b32 %f166, %r856;
|
|
mov.b32 %f167, %r855;
|
|
mov.b32 %f168, %r854;
|
|
mov.b32 %f169, %r853;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r867, %r868, %r869, %r870 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3691, %r3692 }, { %r867, %r868, %r869, %r870 };
|
|
mov.b32 %f170, %r870;
|
|
mov.b32 %f171, %r869;
|
|
mov.b32 %f172, %r868;
|
|
mov.b32 %f173, %r867;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r881, %r882, %r883, %r884 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3693, %r3694 }, { %r881, %r882, %r883, %r884 };
|
|
mov.b32 %f174, %r884;
|
|
mov.b32 %f175, %r883;
|
|
mov.b32 %f176, %r882;
|
|
mov.b32 %f177, %r881;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r895, %r896, %r897, %r898 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3695, %r3696 }, { %r895, %r896, %r897, %r898 };
|
|
mov.b32 %f178, %r898;
|
|
mov.b32 %f179, %r897;
|
|
mov.b32 %f180, %r896;
|
|
mov.b32 %f181, %r895;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r909, %r910, %r911, %r912 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3697, %r3698 }, { %r909, %r910, %r911, %r912 };
|
|
mov.b32 %f182, %r912;
|
|
mov.b32 %f183, %r911;
|
|
mov.b32 %f184, %r910;
|
|
mov.b32 %f185, %r909;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r923, %r924, %r925, %r926 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3699, %r3700 }, { %r923, %r924, %r925, %r926 };
|
|
mov.b32 %f186, %r926;
|
|
mov.b32 %f187, %r925;
|
|
mov.b32 %f188, %r924;
|
|
mov.b32 %f189, %r923;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r937, %r938, %r939, %r940 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3701, %r3702 }, { %r937, %r938, %r939, %r940 };
|
|
mov.b32 %f190, %r940;
|
|
mov.b32 %f191, %r939;
|
|
mov.b32 %f192, %r938;
|
|
mov.b32 %f193, %r937;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r951, %r952, %r953, %r954 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3703, %r3704 }, { %r951, %r952, %r953, %r954 };
|
|
mov.b32 %f194, %r954;
|
|
mov.b32 %f195, %r953;
|
|
mov.b32 %f196, %r952;
|
|
mov.b32 %f197, %r951;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r965, %r966, %r967, %r968 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3705, %r3706 }, { %r965, %r966, %r967, %r968 };
|
|
mov.b32 %f198, %r968;
|
|
mov.b32 %f199, %r967;
|
|
mov.b32 %f200, %r966;
|
|
mov.b32 %f201, %r965;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r979, %r980, %r981, %r982 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3707, %r3708 }, { %r979, %r980, %r981, %r982 };
|
|
mov.b32 %f202, %r982;
|
|
mov.b32 %f203, %r981;
|
|
mov.b32 %f204, %r980;
|
|
mov.b32 %f205, %r979;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r993, %r994, %r995, %r996 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3709, %r3710 }, { %r993, %r994, %r995, %r996 };
|
|
mov.b32 %f206, %r996;
|
|
mov.b32 %f207, %r995;
|
|
mov.b32 %f208, %r994;
|
|
mov.b32 %f209, %r993;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1007, %r1008, %r1009, %r1010 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3711, %r3712 }, { %r1007, %r1008, %r1009, %r1010 };
|
|
mov.b32 %f210, %r1010;
|
|
mov.b32 %f211, %r1009;
|
|
mov.b32 %f212, %r1008;
|
|
mov.b32 %f213, %r1007;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1021, %r1022, %r1023, %r1024 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3713, %r3714 }, { %r1021, %r1022, %r1023, %r1024 };
|
|
mov.b32 %f214, %r1024;
|
|
mov.b32 %f215, %r1023;
|
|
mov.b32 %f216, %r1022;
|
|
mov.b32 %f217, %r1021;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1035, %r1036, %r1037, %r1038 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3715, %r3716 }, { %r1035, %r1036, %r1037, %r1038 };
|
|
mov.b32 %f218, %r1038;
|
|
mov.b32 %f219, %r1037;
|
|
mov.b32 %f220, %r1036;
|
|
mov.b32 %f221, %r1035;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1049, %r1050, %r1051, %r1052 }, { %r1291, %r1292, %r1293, %r1294 }, { %r3717, %r3718 }, { %r1049, %r1050, %r1051, %r1052 };
|
|
mov.b32 %f222, %r1052;
|
|
mov.b32 %f223, %r1051;
|
|
mov.b32 %f224, %r1050;
|
|
mov.b32 %f225, %r1049;
|
|
@%p19 ld.global.v4.b32 { %r3531, %r3532, %r3533, %r3534 }, [ %rd50 + 0 ];
|
|
mov.b32 %hh273, %r3531;
|
|
mov.b32 %hh274, %r3532;
|
|
mov.b32 %hh275, %r3533;
|
|
mov.b32 %hh276, %r3534;
|
|
@%p19 ld.global.v4.b32 { %r3535, %r3536, %r3537, %r3538 }, [ %rd51 + 0 ];
|
|
mov.b32 %hh277, %r3535;
|
|
mov.b32 %hh278, %r3536;
|
|
mov.b32 %hh279, %r3537;
|
|
mov.b32 %hh280, %r3538;
|
|
@%p19 ld.global.v4.b32 { %r3539, %r3540, %r3541, %r3542 }, [ %rd52 + 0 ];
|
|
mov.b32 %hh281, %r3539;
|
|
mov.b32 %hh282, %r3540;
|
|
mov.b32 %hh283, %r3541;
|
|
mov.b32 %hh284, %r3542;
|
|
@%p19 ld.global.v4.b32 { %r3543, %r3544, %r3545, %r3546 }, [ %rd53 + 0 ];
|
|
mov.b32 %hh285, %r3543;
|
|
mov.b32 %hh286, %r3544;
|
|
mov.b32 %hh287, %r3545;
|
|
mov.b32 %hh288, %r3546;
|
|
cvt.rn.f16.f32 %h65, %f165;
|
|
cvt.rn.f16.f32 %h66, %f164;
|
|
cvt.rn.f16.f32 %h67, %f163;
|
|
cvt.rn.f16.f32 %h68, %f162;
|
|
cvt.rn.f16.f32 %h69, %f169;
|
|
cvt.rn.f16.f32 %h70, %f168;
|
|
cvt.rn.f16.f32 %h71, %f167;
|
|
cvt.rn.f16.f32 %h72, %f166;
|
|
cvt.rn.f16.f32 %h73, %f173;
|
|
cvt.rn.f16.f32 %h74, %f172;
|
|
cvt.rn.f16.f32 %h75, %f171;
|
|
cvt.rn.f16.f32 %h76, %f170;
|
|
cvt.rn.f16.f32 %h77, %f177;
|
|
cvt.rn.f16.f32 %h78, %f176;
|
|
cvt.rn.f16.f32 %h79, %f175;
|
|
cvt.rn.f16.f32 %h80, %f174;
|
|
cvt.rn.f16.f32 %h81, %f181;
|
|
cvt.rn.f16.f32 %h82, %f180;
|
|
cvt.rn.f16.f32 %h83, %f179;
|
|
cvt.rn.f16.f32 %h84, %f178;
|
|
cvt.rn.f16.f32 %h85, %f185;
|
|
cvt.rn.f16.f32 %h86, %f184;
|
|
cvt.rn.f16.f32 %h87, %f183;
|
|
cvt.rn.f16.f32 %h88, %f182;
|
|
cvt.rn.f16.f32 %h89, %f189;
|
|
cvt.rn.f16.f32 %h90, %f188;
|
|
cvt.rn.f16.f32 %h91, %f187;
|
|
cvt.rn.f16.f32 %h92, %f186;
|
|
cvt.rn.f16.f32 %h93, %f193;
|
|
cvt.rn.f16.f32 %h94, %f192;
|
|
cvt.rn.f16.f32 %h95, %f191;
|
|
cvt.rn.f16.f32 %h96, %f190;
|
|
cvt.rn.f16.f32 %h97, %f197;
|
|
cvt.rn.f16.f32 %h98, %f196;
|
|
cvt.rn.f16.f32 %h99, %f195;
|
|
cvt.rn.f16.f32 %h100, %f194;
|
|
cvt.rn.f16.f32 %h101, %f201;
|
|
cvt.rn.f16.f32 %h102, %f200;
|
|
cvt.rn.f16.f32 %h103, %f199;
|
|
cvt.rn.f16.f32 %h104, %f198;
|
|
cvt.rn.f16.f32 %h105, %f205;
|
|
cvt.rn.f16.f32 %h106, %f204;
|
|
cvt.rn.f16.f32 %h107, %f203;
|
|
cvt.rn.f16.f32 %h108, %f202;
|
|
cvt.rn.f16.f32 %h109, %f209;
|
|
cvt.rn.f16.f32 %h110, %f208;
|
|
cvt.rn.f16.f32 %h111, %f207;
|
|
cvt.rn.f16.f32 %h112, %f206;
|
|
cvt.rn.f16.f32 %h113, %f213;
|
|
cvt.rn.f16.f32 %h114, %f212;
|
|
cvt.rn.f16.f32 %h115, %f211;
|
|
cvt.rn.f16.f32 %h116, %f210;
|
|
cvt.rn.f16.f32 %h117, %f217;
|
|
cvt.rn.f16.f32 %h118, %f216;
|
|
cvt.rn.f16.f32 %h119, %f215;
|
|
cvt.rn.f16.f32 %h120, %f214;
|
|
cvt.rn.f16.f32 %h121, %f221;
|
|
cvt.rn.f16.f32 %h122, %f220;
|
|
cvt.rn.f16.f32 %h123, %f219;
|
|
cvt.rn.f16.f32 %h124, %f218;
|
|
cvt.rn.f16.f32 %h125, %f225;
|
|
cvt.rn.f16.f32 %h126, %f224;
|
|
cvt.rn.f16.f32 %h127, %f223;
|
|
cvt.rn.f16.f32 %h128, %f222;
|
|
bar.sync 0;
|
|
st.shared.b16 [%r28], %h65;
|
|
st.shared.b16 [%r28+256], %h66;
|
|
st.shared.b16 [%r28+16], %h67;
|
|
st.shared.b16 [%r30+16], %h68;
|
|
st.shared.b16 [%r28+2048], %h69;
|
|
st.shared.b16 [%r28+2304], %h70;
|
|
st.shared.b16 [%r31+16], %h71;
|
|
st.shared.b16 [%r32+16], %h72;
|
|
st.shared.b16 [%r28+4096], %h73;
|
|
st.shared.b16 [%r28+4352], %h74;
|
|
st.shared.b16 [%r33+16], %h75;
|
|
st.shared.b16 [%r34+16], %h76;
|
|
st.shared.b16 [%r28+6144], %h77;
|
|
st.shared.b16 [%r28+6400], %h78;
|
|
st.shared.b16 [%r35+16], %h79;
|
|
st.shared.b16 [%r36+16], %h80;
|
|
st.shared.b16 [%r28+8192], %h81;
|
|
st.shared.b16 [%r28+8448], %h82;
|
|
st.shared.b16 [%r37+16], %h83;
|
|
st.shared.b16 [%r38+16], %h84;
|
|
st.shared.b16 [%r28+10240], %h85;
|
|
st.shared.b16 [%r28+10496], %h86;
|
|
st.shared.b16 [%r39+16], %h87;
|
|
st.shared.b16 [%r40+16], %h88;
|
|
st.shared.b16 [%r28+12288], %h89;
|
|
st.shared.b16 [%r28+12544], %h90;
|
|
st.shared.b16 [%r41+16], %h91;
|
|
st.shared.b16 [%r42+16], %h92;
|
|
st.shared.b16 [%r28+14336], %h93;
|
|
st.shared.b16 [%r28+14592], %h94;
|
|
st.shared.b16 [%r43+16], %h95;
|
|
st.shared.b16 [%r44+16], %h96;
|
|
st.shared.b16 [%r28+16384], %h97;
|
|
st.shared.b16 [%r28+16640], %h98;
|
|
st.shared.b16 [%r45+16], %h99;
|
|
st.shared.b16 [%r46+16], %h100;
|
|
st.shared.b16 [%r28+18432], %h101;
|
|
st.shared.b16 [%r28+18688], %h102;
|
|
st.shared.b16 [%r47+16], %h103;
|
|
st.shared.b16 [%r48+16], %h104;
|
|
st.shared.b16 [%r28+20480], %h105;
|
|
st.shared.b16 [%r28+20736], %h106;
|
|
st.shared.b16 [%r49+16], %h107;
|
|
st.shared.b16 [%r50+16], %h108;
|
|
st.shared.b16 [%r28+22528], %h109;
|
|
st.shared.b16 [%r28+22784], %h110;
|
|
st.shared.b16 [%r51+16], %h111;
|
|
st.shared.b16 [%r52+16], %h112;
|
|
st.shared.b16 [%r28+24576], %h113;
|
|
st.shared.b16 [%r28+24832], %h114;
|
|
st.shared.b16 [%r53+16], %h115;
|
|
st.shared.b16 [%r54+16], %h116;
|
|
st.shared.b16 [%r28+26624], %h117;
|
|
st.shared.b16 [%r28+26880], %h118;
|
|
st.shared.b16 [%r55+16], %h119;
|
|
st.shared.b16 [%r56+16], %h120;
|
|
st.shared.b16 [%r28+28672], %h121;
|
|
st.shared.b16 [%r28+28928], %h122;
|
|
st.shared.b16 [%r57+16], %h123;
|
|
st.shared.b16 [%r58+16], %h124;
|
|
st.shared.b16 [%r28+30720], %h125;
|
|
st.shared.b16 [%r28+30976], %h126;
|
|
st.shared.b16 [%r59+16], %h127;
|
|
st.shared.b16 [%r60+16], %h128;
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1691, %r1692, %r1693, %r1694 }, [ %r1531 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1803, %r1804, %r1805, %r1806 }, [ %r1536 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1915, %r1916, %r1917, %r1918 }, [ %r1541 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2027, %r2028, %r2029, %r2030 }, [ %r1546 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2139, %r2140, %r2141, %r2142 }, [ %r1551 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2251, %r2252, %r2253, %r2254 }, [ %r1556 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2363, %r2364, %r2365, %r2366 }, [ %r1561 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2475, %r2476, %r2477, %r2478 }, [ %r1566 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1747, %r1748, %r1749, %r1750 }, [ %r1571 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1859, %r1860, %r1861, %r1862 }, [ %r1576 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1971, %r1972, %r1973, %r1974 }, [ %r1581 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2083, %r2084, %r2085, %r2086 }, [ %r1586 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2195, %r2196, %r2197, %r2198 }, [ %r1591 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2307, %r2308, %r2309, %r2310 }, [ %r1596 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2419, %r2420, %r2421, %r2422 }, [ %r1601 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2531, %r2532, %r2533, %r2534 }, [ %r1606 + 0 ];
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534};
|
|
st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538};
|
|
st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542};
|
|
st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1695, %r1696, %r1709, %r1710 }, [ %r1611 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1807, %r1808, %r1821, %r1822 }, [ %r1616 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1919, %r1920, %r1933, %r1934 }, [ %r1621 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2031, %r2032, %r2045, %r2046 }, [ %r1626 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2143, %r2144, %r2157, %r2158 }, [ %r1631 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2255, %r2256, %r2269, %r2270 }, [ %r1636 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2367, %r2368, %r2381, %r2382 }, [ %r1641 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2479, %r2480, %r2493, %r2494 }, [ %r1646 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1723, %r1724, %r1737, %r1738 }, [ %r1651 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1835, %r1836, %r1849, %r1850 }, [ %r1656 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1947, %r1948, %r1961, %r1962 }, [ %r1661 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2059, %r2060, %r2073, %r2074 }, [ %r1666 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2171, %r2172, %r2185, %r2186 }, [ %r1671 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2283, %r2284, %r2297, %r2298 }, [ %r1676 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2395, %r2396, %r2409, %r2410 }, [ %r1681 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2507, %r2508, %r2521, %r2522 }, [ %r1686 + 0 ];
|
|
mov.b32 %r1799, %f515;
|
|
mov.b32 %r1800, %f516;
|
|
mov.b32 %r1801, %f517;
|
|
mov.b32 %r1802, %f518;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1695, %r1696 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mov.b32 %r1813, %f519;
|
|
mov.b32 %r1814, %f520;
|
|
mov.b32 %r1815, %f521;
|
|
mov.b32 %r1816, %f522;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1709, %r1710 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mov.b32 %r1827, %f523;
|
|
mov.b32 %r1828, %f524;
|
|
mov.b32 %r1829, %f525;
|
|
mov.b32 %r1830, %f526;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1723, %r1724 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mov.b32 %r1841, %f527;
|
|
mov.b32 %r1842, %f528;
|
|
mov.b32 %r1843, %f529;
|
|
mov.b32 %r1844, %f530;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1691, %r1692, %r1693, %r1694 }, { %r1737, %r1738 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mov.b32 %r1855, %f531;
|
|
mov.b32 %r1856, %f532;
|
|
mov.b32 %r1857, %f533;
|
|
mov.b32 %r1858, %f534;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1695, %r1696 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mov.b32 %r1869, %f535;
|
|
mov.b32 %r1870, %f536;
|
|
mov.b32 %r1871, %f537;
|
|
mov.b32 %r1872, %f538;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1709, %r1710 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mov.b32 %r1883, %f539;
|
|
mov.b32 %r1884, %f540;
|
|
mov.b32 %r1885, %f541;
|
|
mov.b32 %r1886, %f542;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1723, %r1724 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mov.b32 %r1897, %f543;
|
|
mov.b32 %r1898, %f544;
|
|
mov.b32 %r1899, %f545;
|
|
mov.b32 %r1900, %f546;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1747, %r1748, %r1749, %r1750 }, { %r1737, %r1738 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1807, %r1808 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1821, %r1822 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1835, %r1836 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1803, %r1804, %r1805, %r1806 }, { %r1849, %r1850 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1807, %r1808 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1821, %r1822 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1835, %r1836 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1859, %r1860, %r1861, %r1862 }, { %r1849, %r1850 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1919, %r1920 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1933, %r1934 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1947, %r1948 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r1915, %r1916, %r1917, %r1918 }, { %r1961, %r1962 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1919, %r1920 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1933, %r1934 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1947, %r1948 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r1971, %r1972, %r1973, %r1974 }, { %r1961, %r1962 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2031, %r2032 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2045, %r2046 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2059, %r2060 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2027, %r2028, %r2029, %r2030 }, { %r2073, %r2074 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2031, %r2032 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2045, %r2046 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2059, %r2060 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2083, %r2084, %r2085, %r2086 }, { %r2073, %r2074 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2143, %r2144 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2157, %r2158 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2171, %r2172 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2139, %r2140, %r2141, %r2142 }, { %r2185, %r2186 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2143, %r2144 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2157, %r2158 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2171, %r2172 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2195, %r2196, %r2197, %r2198 }, { %r2185, %r2186 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2255, %r2256 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2269, %r2270 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2283, %r2284 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2251, %r2252, %r2253, %r2254 }, { %r2297, %r2298 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2255, %r2256 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2269, %r2270 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2283, %r2284 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2307, %r2308, %r2309, %r2310 }, { %r2297, %r2298 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2367, %r2368 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2381, %r2382 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2395, %r2396 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2363, %r2364, %r2365, %r2366 }, { %r2409, %r2410 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2367, %r2368 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2381, %r2382 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2395, %r2396 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2419, %r2420, %r2421, %r2422 }, { %r2409, %r2410 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1799, %r1800, %r1801, %r1802 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2479, %r2480 }, { %r1799, %r1800, %r1801, %r1802 };
|
|
mov.b32 %f518, %r1802;
|
|
mov.b32 %f517, %r1801;
|
|
mov.b32 %f516, %r1800;
|
|
mov.b32 %f515, %r1799;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1813, %r1814, %r1815, %r1816 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2493, %r2494 }, { %r1813, %r1814, %r1815, %r1816 };
|
|
mov.b32 %f522, %r1816;
|
|
mov.b32 %f521, %r1815;
|
|
mov.b32 %f520, %r1814;
|
|
mov.b32 %f519, %r1813;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1827, %r1828, %r1829, %r1830 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2507, %r2508 }, { %r1827, %r1828, %r1829, %r1830 };
|
|
mov.b32 %f526, %r1830;
|
|
mov.b32 %f525, %r1829;
|
|
mov.b32 %f524, %r1828;
|
|
mov.b32 %f523, %r1827;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1841, %r1842, %r1843, %r1844 }, { %r2475, %r2476, %r2477, %r2478 }, { %r2521, %r2522 }, { %r1841, %r1842, %r1843, %r1844 };
|
|
mov.b32 %f530, %r1844;
|
|
mov.b32 %f529, %r1843;
|
|
mov.b32 %f528, %r1842;
|
|
mov.b32 %f527, %r1841;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1855, %r1856, %r1857, %r1858 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2479, %r2480 }, { %r1855, %r1856, %r1857, %r1858 };
|
|
mov.b32 %f534, %r1858;
|
|
mov.b32 %f533, %r1857;
|
|
mov.b32 %f532, %r1856;
|
|
mov.b32 %f531, %r1855;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1869, %r1870, %r1871, %r1872 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2493, %r2494 }, { %r1869, %r1870, %r1871, %r1872 };
|
|
mov.b32 %f538, %r1872;
|
|
mov.b32 %f537, %r1871;
|
|
mov.b32 %f536, %r1870;
|
|
mov.b32 %f535, %r1869;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1883, %r1884, %r1885, %r1886 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2507, %r2508 }, { %r1883, %r1884, %r1885, %r1886 };
|
|
mov.b32 %f542, %r1886;
|
|
mov.b32 %f541, %r1885;
|
|
mov.b32 %f540, %r1884;
|
|
mov.b32 %f539, %r1883;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1897, %r1898, %r1899, %r1900 }, { %r2531, %r2532, %r2533, %r2534 }, { %r2521, %r2522 }, { %r1897, %r1898, %r1899, %r1900 };
|
|
mov.b32 %f546, %r1900;
|
|
mov.b32 %f545, %r1899;
|
|
mov.b32 %f544, %r1898;
|
|
mov.b32 %f543, %r1897;
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r20], {%r3531, %r3532, %r3533, %r3534};
|
|
st.shared.v4.b32 [%r21], {%r3535, %r3536, %r3537, %r3538};
|
|
st.shared.v4.b32 [%r22], {%r3539, %r3540, %r3541, %r3542};
|
|
st.shared.v4.b32 [%r23], {%r3543, %r3544, %r3545, %r3546};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2607, %r2608, %r2609, %r2610 }, [ %r599 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2831, %r2832, %r2833, %r2834 }, [ %r604 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3055, %r3056, %r3057, %r3058 }, [ %r609 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3279, %r3280, %r3281, %r3282 }, [ %r614 + 0 ];
|
|
mov.u32 %r2827, %r625;
|
|
mov.u32 %r2828, %r625;
|
|
mov.u32 %r2829, %r625;
|
|
mov.u32 %r2830, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3719, %r3720 }, { %r2827, %r2828, %r2829, %r2830 };
|
|
mov.u32 %r2841, %r625;
|
|
mov.u32 %r2842, %r625;
|
|
mov.u32 %r2843, %r625;
|
|
mov.u32 %r2844, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3721, %r3722 }, { %r2841, %r2842, %r2843, %r2844 };
|
|
mov.u32 %r2855, %r625;
|
|
mov.u32 %r2856, %r625;
|
|
mov.u32 %r2857, %r625;
|
|
mov.u32 %r2858, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3723, %r3724 }, { %r2855, %r2856, %r2857, %r2858 };
|
|
mov.u32 %r2869, %r625;
|
|
mov.u32 %r2870, %r625;
|
|
mov.u32 %r2871, %r625;
|
|
mov.u32 %r2872, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3725, %r3726 }, { %r2869, %r2870, %r2871, %r2872 };
|
|
mov.u32 %r2883, %r625;
|
|
mov.u32 %r2884, %r625;
|
|
mov.u32 %r2885, %r625;
|
|
mov.u32 %r2886, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3727, %r3728 }, { %r2883, %r2884, %r2885, %r2886 };
|
|
mov.u32 %r2897, %r625;
|
|
mov.u32 %r2898, %r625;
|
|
mov.u32 %r2899, %r625;
|
|
mov.u32 %r2900, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3729, %r3730 }, { %r2897, %r2898, %r2899, %r2900 };
|
|
mov.u32 %r2911, %r625;
|
|
mov.u32 %r2912, %r625;
|
|
mov.u32 %r2913, %r625;
|
|
mov.u32 %r2914, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3731, %r3732 }, { %r2911, %r2912, %r2913, %r2914 };
|
|
mov.u32 %r2925, %r625;
|
|
mov.u32 %r2926, %r625;
|
|
mov.u32 %r2927, %r625;
|
|
mov.u32 %r2928, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3733, %r3734 }, { %r2925, %r2926, %r2927, %r2928 };
|
|
mov.u32 %r2939, %r625;
|
|
mov.u32 %r2940, %r625;
|
|
mov.u32 %r2941, %r625;
|
|
mov.u32 %r2942, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3735, %r3736 }, { %r2939, %r2940, %r2941, %r2942 };
|
|
mov.u32 %r2953, %r625;
|
|
mov.u32 %r2954, %r625;
|
|
mov.u32 %r2955, %r625;
|
|
mov.u32 %r2956, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3737, %r3738 }, { %r2953, %r2954, %r2955, %r2956 };
|
|
mov.u32 %r2967, %r625;
|
|
mov.u32 %r2968, %r625;
|
|
mov.u32 %r2969, %r625;
|
|
mov.u32 %r2970, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3739, %r3740 }, { %r2967, %r2968, %r2969, %r2970 };
|
|
mov.u32 %r2981, %r625;
|
|
mov.u32 %r2982, %r625;
|
|
mov.u32 %r2983, %r625;
|
|
mov.u32 %r2984, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3741, %r3742 }, { %r2981, %r2982, %r2983, %r2984 };
|
|
mov.u32 %r2995, %r625;
|
|
mov.u32 %r2996, %r625;
|
|
mov.u32 %r2997, %r625;
|
|
mov.u32 %r2998, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3743, %r3744 }, { %r2995, %r2996, %r2997, %r2998 };
|
|
mov.u32 %r3009, %r625;
|
|
mov.u32 %r3010, %r625;
|
|
mov.u32 %r3011, %r625;
|
|
mov.u32 %r3012, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3745, %r3746 }, { %r3009, %r3010, %r3011, %r3012 };
|
|
mov.u32 %r3023, %r625;
|
|
mov.u32 %r3024, %r625;
|
|
mov.u32 %r3025, %r625;
|
|
mov.u32 %r3026, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3747, %r3748 }, { %r3023, %r3024, %r3025, %r3026 };
|
|
mov.u32 %r3040, %r625;
|
|
mov.u32 %r3037, %r625;
|
|
mov.u32 %r3038, %r625;
|
|
mov.u32 %r3039, %r625;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2607, %r2608, %r2609, %r2610 }, { %r3749, %r3750 }, { %r3037, %r3038, %r3039, %r3040 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3751, %r3752 }, { %r2827, %r2828, %r2829, %r2830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3753, %r3754 }, { %r2841, %r2842, %r2843, %r2844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3755, %r3756 }, { %r2855, %r2856, %r2857, %r2858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3757, %r3758 }, { %r2869, %r2870, %r2871, %r2872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3759, %r3760 }, { %r2883, %r2884, %r2885, %r2886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3761, %r3762 }, { %r2897, %r2898, %r2899, %r2900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3763, %r3764 }, { %r2911, %r2912, %r2913, %r2914 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3765, %r3766 }, { %r2925, %r2926, %r2927, %r2928 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3767, %r3768 }, { %r2939, %r2940, %r2941, %r2942 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3769, %r3770 }, { %r2953, %r2954, %r2955, %r2956 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3771, %r3772 }, { %r2967, %r2968, %r2969, %r2970 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3773, %r3774 }, { %r2981, %r2982, %r2983, %r2984 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3775, %r3776 }, { %r2995, %r2996, %r2997, %r2998 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3777, %r3778 }, { %r3009, %r3010, %r3011, %r3012 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3779, %r3780 }, { %r3023, %r3024, %r3025, %r3026 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r2831, %r2832, %r2833, %r2834 }, { %r3781, %r3782 }, { %r3037, %r3038, %r3039, %r3040 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3783, %r3784 }, { %r2827, %r2828, %r2829, %r2830 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3785, %r3786 }, { %r2841, %r2842, %r2843, %r2844 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3787, %r3788 }, { %r2855, %r2856, %r2857, %r2858 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3789, %r3790 }, { %r2869, %r2870, %r2871, %r2872 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3791, %r3792 }, { %r2883, %r2884, %r2885, %r2886 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3793, %r3794 }, { %r2897, %r2898, %r2899, %r2900 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3795, %r3796 }, { %r2911, %r2912, %r2913, %r2914 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3797, %r3798 }, { %r2925, %r2926, %r2927, %r2928 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3799, %r3800 }, { %r2939, %r2940, %r2941, %r2942 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3801, %r3802 }, { %r2953, %r2954, %r2955, %r2956 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3803, %r3804 }, { %r2967, %r2968, %r2969, %r2970 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3805, %r3806 }, { %r2981, %r2982, %r2983, %r2984 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3807, %r3808 }, { %r2995, %r2996, %r2997, %r2998 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3809, %r3810 }, { %r3009, %r3010, %r3011, %r3012 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3811, %r3812 }, { %r3023, %r3024, %r3025, %r3026 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3055, %r3056, %r3057, %r3058 }, { %r3813, %r3814 }, { %r3037, %r3038, %r3039, %r3040 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2827, %r2828, %r2829, %r2830 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3815, %r3816 }, { %r2827, %r2828, %r2829, %r2830 };
|
|
mov.b32 %f226, %r2830;
|
|
mov.b32 %f227, %r2829;
|
|
mov.b32 %f228, %r2828;
|
|
mov.b32 %f229, %r2827;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2841, %r2842, %r2843, %r2844 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3817, %r3818 }, { %r2841, %r2842, %r2843, %r2844 };
|
|
mov.b32 %f230, %r2844;
|
|
mov.b32 %f231, %r2843;
|
|
mov.b32 %f232, %r2842;
|
|
mov.b32 %f233, %r2841;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2855, %r2856, %r2857, %r2858 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3819, %r3820 }, { %r2855, %r2856, %r2857, %r2858 };
|
|
mov.b32 %f234, %r2858;
|
|
mov.b32 %f235, %r2857;
|
|
mov.b32 %f236, %r2856;
|
|
mov.b32 %f237, %r2855;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2869, %r2870, %r2871, %r2872 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3821, %r3822 }, { %r2869, %r2870, %r2871, %r2872 };
|
|
mov.b32 %f238, %r2872;
|
|
mov.b32 %f239, %r2871;
|
|
mov.b32 %f240, %r2870;
|
|
mov.b32 %f241, %r2869;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2883, %r2884, %r2885, %r2886 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3823, %r3824 }, { %r2883, %r2884, %r2885, %r2886 };
|
|
mov.b32 %f242, %r2886;
|
|
mov.b32 %f243, %r2885;
|
|
mov.b32 %f244, %r2884;
|
|
mov.b32 %f245, %r2883;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2897, %r2898, %r2899, %r2900 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3825, %r3826 }, { %r2897, %r2898, %r2899, %r2900 };
|
|
mov.b32 %f246, %r2900;
|
|
mov.b32 %f247, %r2899;
|
|
mov.b32 %f248, %r2898;
|
|
mov.b32 %f249, %r2897;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2911, %r2912, %r2913, %r2914 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3827, %r3828 }, { %r2911, %r2912, %r2913, %r2914 };
|
|
mov.b32 %f250, %r2914;
|
|
mov.b32 %f251, %r2913;
|
|
mov.b32 %f252, %r2912;
|
|
mov.b32 %f253, %r2911;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2925, %r2926, %r2927, %r2928 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3829, %r3830 }, { %r2925, %r2926, %r2927, %r2928 };
|
|
mov.b32 %f254, %r2928;
|
|
mov.b32 %f255, %r2927;
|
|
mov.b32 %f256, %r2926;
|
|
mov.b32 %f257, %r2925;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2939, %r2940, %r2941, %r2942 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3831, %r3832 }, { %r2939, %r2940, %r2941, %r2942 };
|
|
mov.b32 %f258, %r2942;
|
|
mov.b32 %f259, %r2941;
|
|
mov.b32 %f260, %r2940;
|
|
mov.b32 %f261, %r2939;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2953, %r2954, %r2955, %r2956 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3833, %r3834 }, { %r2953, %r2954, %r2955, %r2956 };
|
|
mov.b32 %f262, %r2956;
|
|
mov.b32 %f263, %r2955;
|
|
mov.b32 %f264, %r2954;
|
|
mov.b32 %f265, %r2953;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2967, %r2968, %r2969, %r2970 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3835, %r3836 }, { %r2967, %r2968, %r2969, %r2970 };
|
|
mov.b32 %f266, %r2970;
|
|
mov.b32 %f267, %r2969;
|
|
mov.b32 %f268, %r2968;
|
|
mov.b32 %f269, %r2967;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2981, %r2982, %r2983, %r2984 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3837, %r3838 }, { %r2981, %r2982, %r2983, %r2984 };
|
|
mov.b32 %f270, %r2984;
|
|
mov.b32 %f271, %r2983;
|
|
mov.b32 %f272, %r2982;
|
|
mov.b32 %f273, %r2981;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2995, %r2996, %r2997, %r2998 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3839, %r3840 }, { %r2995, %r2996, %r2997, %r2998 };
|
|
mov.b32 %f274, %r2998;
|
|
mov.b32 %f275, %r2997;
|
|
mov.b32 %f276, %r2996;
|
|
mov.b32 %f277, %r2995;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3009, %r3010, %r3011, %r3012 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3841, %r3842 }, { %r3009, %r3010, %r3011, %r3012 };
|
|
mov.b32 %f278, %r3012;
|
|
mov.b32 %f279, %r3011;
|
|
mov.b32 %f280, %r3010;
|
|
mov.b32 %f281, %r3009;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3023, %r3024, %r3025, %r3026 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3843, %r3844 }, { %r3023, %r3024, %r3025, %r3026 };
|
|
mov.b32 %f282, %r3026;
|
|
mov.b32 %f283, %r3025;
|
|
mov.b32 %f284, %r3024;
|
|
mov.b32 %f285, %r3023;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3037, %r3038, %r3039, %r3040 }, { %r3279, %r3280, %r3281, %r3282 }, { %r3845, %r3846 }, { %r3037, %r3038, %r3039, %r3040 };
|
|
mov.b32 %f286, %r3040;
|
|
mov.b32 %f287, %r3039;
|
|
mov.b32 %f288, %r3038;
|
|
mov.b32 %f289, %r3037;
|
|
mul.f32 %f290, %f165, %f229;
|
|
mul.f32 %f291, %f164, %f228;
|
|
mul.f32 %f292, %f163, %f227;
|
|
mul.f32 %f293, %f162, %f226;
|
|
mul.f32 %f294, %f169, %f233;
|
|
mul.f32 %f295, %f168, %f232;
|
|
mul.f32 %f296, %f167, %f231;
|
|
mul.f32 %f297, %f166, %f230;
|
|
mul.f32 %f298, %f173, %f237;
|
|
mul.f32 %f299, %f172, %f236;
|
|
mul.f32 %f300, %f171, %f235;
|
|
mul.f32 %f301, %f170, %f234;
|
|
mul.f32 %f302, %f177, %f241;
|
|
mul.f32 %f303, %f176, %f240;
|
|
mul.f32 %f304, %f175, %f239;
|
|
mul.f32 %f305, %f174, %f238;
|
|
mul.f32 %f306, %f181, %f245;
|
|
mul.f32 %f307, %f180, %f244;
|
|
mul.f32 %f308, %f179, %f243;
|
|
mul.f32 %f309, %f178, %f242;
|
|
mul.f32 %f310, %f185, %f249;
|
|
mul.f32 %f311, %f184, %f248;
|
|
mul.f32 %f312, %f183, %f247;
|
|
mul.f32 %f313, %f182, %f246;
|
|
mul.f32 %f314, %f189, %f253;
|
|
mul.f32 %f315, %f188, %f252;
|
|
mul.f32 %f316, %f187, %f251;
|
|
mul.f32 %f317, %f186, %f250;
|
|
mul.f32 %f318, %f193, %f257;
|
|
mul.f32 %f319, %f192, %f256;
|
|
mul.f32 %f320, %f191, %f255;
|
|
mul.f32 %f321, %f190, %f254;
|
|
mul.f32 %f322, %f197, %f261;
|
|
mul.f32 %f323, %f196, %f260;
|
|
mul.f32 %f324, %f195, %f259;
|
|
mul.f32 %f325, %f194, %f258;
|
|
mul.f32 %f326, %f201, %f265;
|
|
mul.f32 %f327, %f200, %f264;
|
|
mul.f32 %f328, %f199, %f263;
|
|
mul.f32 %f329, %f198, %f262;
|
|
mul.f32 %f330, %f205, %f269;
|
|
mul.f32 %f331, %f204, %f268;
|
|
mul.f32 %f332, %f203, %f267;
|
|
mul.f32 %f333, %f202, %f266;
|
|
mul.f32 %f334, %f209, %f273;
|
|
mul.f32 %f335, %f208, %f272;
|
|
mul.f32 %f336, %f207, %f271;
|
|
mul.f32 %f337, %f206, %f270;
|
|
mul.f32 %f338, %f213, %f277;
|
|
mul.f32 %f339, %f212, %f276;
|
|
mul.f32 %f340, %f211, %f275;
|
|
mul.f32 %f341, %f210, %f274;
|
|
mul.f32 %f342, %f217, %f281;
|
|
mul.f32 %f343, %f216, %f280;
|
|
mul.f32 %f344, %f215, %f279;
|
|
mul.f32 %f345, %f214, %f278;
|
|
mul.f32 %f346, %f221, %f285;
|
|
mul.f32 %f347, %f220, %f284;
|
|
mul.f32 %f348, %f219, %f283;
|
|
mul.f32 %f349, %f218, %f282;
|
|
mul.f32 %f350, %f225, %f289;
|
|
mul.f32 %f351, %f224, %f288;
|
|
mul.f32 %f352, %f223, %f287;
|
|
mul.f32 %f353, %f222, %f286;
|
|
mul.f32 %f354, %f290, %f97;
|
|
mul.f32 %f355, %f291, %f97;
|
|
mul.f32 %f356, %f292, %f97;
|
|
mul.f32 %f357, %f293, %f97;
|
|
mul.f32 %f358, %f294, %f97;
|
|
mul.f32 %f359, %f295, %f97;
|
|
mul.f32 %f360, %f296, %f97;
|
|
mul.f32 %f361, %f297, %f97;
|
|
mul.f32 %f362, %f298, %f97;
|
|
mul.f32 %f363, %f299, %f97;
|
|
mul.f32 %f364, %f300, %f97;
|
|
mul.f32 %f365, %f301, %f97;
|
|
mul.f32 %f366, %f302, %f97;
|
|
mul.f32 %f367, %f303, %f97;
|
|
mul.f32 %f368, %f304, %f97;
|
|
mul.f32 %f369, %f305, %f97;
|
|
mul.f32 %f370, %f306, %f97;
|
|
mul.f32 %f371, %f307, %f97;
|
|
mul.f32 %f372, %f308, %f97;
|
|
mul.f32 %f373, %f309, %f97;
|
|
mul.f32 %f374, %f310, %f97;
|
|
mul.f32 %f375, %f311, %f97;
|
|
mul.f32 %f376, %f312, %f97;
|
|
mul.f32 %f377, %f313, %f97;
|
|
mul.f32 %f378, %f314, %f97;
|
|
mul.f32 %f379, %f315, %f97;
|
|
mul.f32 %f380, %f316, %f97;
|
|
mul.f32 %f381, %f317, %f97;
|
|
mul.f32 %f382, %f318, %f97;
|
|
mul.f32 %f383, %f319, %f97;
|
|
mul.f32 %f384, %f320, %f97;
|
|
mul.f32 %f385, %f321, %f97;
|
|
mul.f32 %f386, %f322, %f97;
|
|
mul.f32 %f387, %f323, %f97;
|
|
mul.f32 %f388, %f324, %f97;
|
|
mul.f32 %f389, %f325, %f97;
|
|
mul.f32 %f390, %f326, %f97;
|
|
mul.f32 %f391, %f327, %f97;
|
|
mul.f32 %f392, %f328, %f97;
|
|
mul.f32 %f393, %f329, %f97;
|
|
mul.f32 %f394, %f330, %f97;
|
|
mul.f32 %f395, %f331, %f97;
|
|
mul.f32 %f396, %f332, %f97;
|
|
mul.f32 %f397, %f333, %f97;
|
|
mul.f32 %f398, %f334, %f97;
|
|
mul.f32 %f399, %f335, %f97;
|
|
mul.f32 %f400, %f336, %f97;
|
|
mul.f32 %f401, %f337, %f97;
|
|
mul.f32 %f402, %f338, %f97;
|
|
mul.f32 %f403, %f339, %f97;
|
|
mul.f32 %f404, %f340, %f97;
|
|
mul.f32 %f405, %f341, %f97;
|
|
mul.f32 %f406, %f342, %f97;
|
|
mul.f32 %f407, %f343, %f97;
|
|
mul.f32 %f408, %f344, %f97;
|
|
mul.f32 %f409, %f345, %f97;
|
|
mul.f32 %f410, %f346, %f97;
|
|
mul.f32 %f411, %f347, %f97;
|
|
mul.f32 %f412, %f348, %f97;
|
|
mul.f32 %f413, %f349, %f97;
|
|
mul.f32 %f414, %f350, %f97;
|
|
mul.f32 %f415, %f351, %f97;
|
|
mul.f32 %f416, %f352, %f97;
|
|
mul.f32 %f417, %f353, %f97;
|
|
bar.sync 0;
|
|
st.shared.b32 [%r28], %f254;
|
|
st.shared.b32 [%r28+256], %f255;
|
|
st.shared.b32 [%r28+16], %f256;
|
|
st.shared.b32 [%r28+16], %f257;
|
|
st.shared.b32 [%r28+2048], %f258;
|
|
st.shared.b32 [%r28+2304], %f259;
|
|
st.shared.b32 [%r28+16], %f260;
|
|
st.shared.b32 [%r28+16], %f261;
|
|
st.shared.b32 [%r28+4096], %f262;
|
|
st.shared.b32 [%r28+4352], %f263;
|
|
st.shared.b32 [%r28+16], %f264;
|
|
st.shared.b32 [%r28+16], %f265;
|
|
st.shared.b32 [%r28+6144], %f266;
|
|
st.shared.b32 [%r28+6400], %f267;
|
|
st.shared.b32 [%r28+16], %f268;
|
|
st.shared.b32 [%r28+16], %f269;
|
|
st.shared.b32 [%r28+8192], %f270;
|
|
st.shared.b32 [%r28+8448], %f271;
|
|
st.shared.b32 [%r28+16], %f272;
|
|
st.shared.b32 [%r28+16], %f273;
|
|
st.shared.b32 [%r28+10240], %f274;
|
|
st.shared.b32 [%r28+10496], %f275;
|
|
st.shared.b32 [%r28+16], %f276;
|
|
st.shared.b32 [%r28+16], %f277;
|
|
st.shared.b32 [%r28+12288], %f278;
|
|
st.shared.b32 [%r28+12544], %f279;
|
|
st.shared.b32 [%r28+16], %f280;
|
|
st.shared.b32 [%r28+16], %f281;
|
|
st.shared.b32 [%r28+14336], %f282;
|
|
st.shared.b32 [%r28+14592], %f283;
|
|
st.shared.b32 [%r28+16], %f284;
|
|
st.shared.b32 [%r28+16], %f285;
|
|
st.shared.b32 [%r28+16384], %f286;
|
|
st.shared.b32 [%r28+16640], %f287;
|
|
st.shared.b32 [%r28+16], %f288;
|
|
st.shared.b32 [%r28+16], %f289;
|
|
st.shared.b32 [%r28+18432], %f290;
|
|
st.shared.b32 [%r28+18688], %f291;
|
|
st.shared.b32 [%r28+16], %f292;
|
|
st.shared.b32 [%r28+16], %f293;
|
|
st.shared.b32 [%r28+20480], %f294;
|
|
st.shared.b32 [%r28+20736], %f295;
|
|
st.shared.b32 [%r28+16], %f296;
|
|
st.shared.b32 [%r28+16], %f297;
|
|
st.shared.b32 [%r28+22528], %f298;
|
|
st.shared.b32 [%r28+22784], %f299;
|
|
st.shared.b32 [%r28+16], %f300;
|
|
st.shared.b32 [%r28+16], %f301;
|
|
st.shared.b32 [%r28+24576], %f302;
|
|
st.shared.b32 [%r28+24832], %f303;
|
|
st.shared.b32 [%r28+16], %f304;
|
|
st.shared.b32 [%r28+16], %f305;
|
|
st.shared.b32 [%r28+26624], %f306;
|
|
st.shared.b32 [%r28+26880], %f307;
|
|
st.shared.b32 [%r28+16], %f308;
|
|
st.shared.b32 [%r28+16], %f309;
|
|
st.shared.b32 [%r28+28672], %f310;
|
|
st.shared.b32 [%r28+28928], %f311;
|
|
st.shared.b32 [%r28+16], %f312;
|
|
st.shared.b32 [%r28+16], %f313;
|
|
st.shared.b32 [%r28+30720], %f314;
|
|
st.shared.b32 [%r28+30976], %f315;
|
|
st.shared.b32 [%r28+16], %f316;
|
|
st.shared.b32 [%r28+16], %f317;
|
|
add.s32 %r3847, %r3847, 128;
|
|
setp.lt.s32 %p18, %r3847, %r11;
|
|
@%p18 bra LBB0_2;
|
|
LBB0_3:
|
|
bar.sync 0;
|
|
and.b32 %r3579, %r6, 48;
|
|
or.b32 %r3580, %r3579, %r8;
|
|
shr.u32 %r3581, %r1, 4;
|
|
and.b32 %r3582, %r3581, 56;
|
|
or.b32 %r3583, %r10, %r3582;
|
|
mad.lo.s32 %r3584, %r3580, 72, %r3583;
|
|
shl.b32 %r3585, %r3584, 2;
|
|
add.s32 %r3587, %r478, %r3585;
|
|
st.shared.v2.f32 [%r3587], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2304], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+64], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2368], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+128], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2432], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+192], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2496], {%f129, %f129};
|
|
bar.sync 0;
|
|
mad.lo.s32 %r3588, %r3, 72, %r5;
|
|
shl.b32 %r3589, %r3588, 2;
|
|
add.s32 %r3590, %r478, %r3589;
|
|
ld.shared.v4.f32 {%f419, %f420, %f421, %f422}, [%r3590];
|
|
ld.shared.v4.f32 {%f423, %f424, %f425, %f426}, [%r3590+16];
|
|
ld.shared.v4.f32 {%f427, %f428, %f429, %f430}, [%r3590+9216];
|
|
ld.shared.v4.f32 {%f431, %f432, %f433, %f434}, [%r3590+9232];
|
|
bar.sync 0;
|
|
st.shared.v2.f32 [%r3587], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2304], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+64], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2368], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+128], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2432], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+192], {%f129, %f129};
|
|
st.shared.v2.f32 [%r3587+2496], {%f129, %f129};
|
|
bar.sync 0;
|
|
ld.shared.v4.f32 {%f435, %f436, %f437, %f438}, [%r3590];
|
|
ld.shared.v4.f32 {%f439, %f440, %f441, %f442}, [%r3590+16];
|
|
ld.shared.v4.f32 {%f443, %f444, %f445, %f446}, [%r3590+9216];
|
|
ld.shared.v4.f32 {%f447, %f448, %f449, %f450}, [%r3590+9232];
|
|
bar.sync 0;
|
|
st.shared.v2.f32 [%r3587], {%f515, %f516};
|
|
st.shared.v2.f32 [%r3587+2304], {%f517, %f518};
|
|
st.shared.v2.f32 [%r3587+64], {%f519, %f520};
|
|
st.shared.v2.f32 [%r3587+2368], {%f521, %f522};
|
|
st.shared.v2.f32 [%r3587+128], {%f523, %f524};
|
|
st.shared.v2.f32 [%r3587+2432], {%f525, %f526};
|
|
st.shared.v2.f32 [%r3587+192], {%f527, %f528};
|
|
st.shared.v2.f32 [%r3587+2496], {%f529, %f530};
|
|
bar.sync 0;
|
|
ld.shared.v4.f32 {%f451, %f452, %f453, %f454}, [%r3590];
|
|
ld.shared.v4.f32 {%f455, %f456, %f457, %f458}, [%r3590+16];
|
|
ld.shared.v4.f32 {%f459, %f460, %f461, %f462}, [%r3590+9216];
|
|
ld.shared.v4.f32 {%f463, %f464, %f465, %f466}, [%r3590+9232];
|
|
bar.sync 0;
|
|
st.shared.v2.f32 [%r3587], {%f531, %f532};
|
|
st.shared.v2.f32 [%r3587+2304], {%f533, %f534};
|
|
st.shared.v2.f32 [%r3587+64], {%f535, %f536};
|
|
st.shared.v2.f32 [%r3587+2368], {%f537, %f538};
|
|
st.shared.v2.f32 [%r3587+128], {%f539, %f540};
|
|
st.shared.v2.f32 [%r3587+2432], {%f541, %f542};
|
|
st.shared.v2.f32 [%r3587+192], {%f543, %f544};
|
|
st.shared.v2.f32 [%r3587+2496], {%f545, %f546};
|
|
bar.sync 0;
|
|
ld.shared.v4.f32 {%f467, %f468, %f469, %f470}, [%r3590];
|
|
ld.shared.v4.f32 {%f471, %f472, %f473, %f474}, [%r3590+16];
|
|
ld.shared.v4.f32 {%f475, %f476, %f477, %f478}, [%r3590+9216];
|
|
ld.shared.v4.f32 {%f479, %f480, %f481, %f482}, [%r3590+9232];
|
|
shl.b64 %rd62, %rd7, 1;
|
|
add.s64 %rd54, %rd2, %rd62;
|
|
shl.b64 %rd63, %rd8, 1;
|
|
add.s64 %rd55, %rd2, %rd63;
|
|
shl.b64 %rd64, %rd9, 1;
|
|
add.s64 %rd56, %rd2, %rd64;
|
|
shl.b64 %rd65, %rd10, 1;
|
|
add.s64 %rd57, %rd2, %rd65;
|
|
cvt.rn.f16.f32 %h193, %f452;
|
|
cvt.rn.f16.f32 %h194, %f451;
|
|
mov.b32 %hh289, {%h194, %h193};
|
|
cvt.rn.f16.f32 %h195, %f454;
|
|
cvt.rn.f16.f32 %h196, %f453;
|
|
mov.b32 %hh290, {%h196, %h195};
|
|
cvt.rn.f16.f32 %h197, %f456;
|
|
cvt.rn.f16.f32 %h198, %f455;
|
|
mov.b32 %hh291, {%h198, %h197};
|
|
cvt.rn.f16.f32 %h199, %f458;
|
|
cvt.rn.f16.f32 %h200, %f457;
|
|
mov.b32 %hh292, {%h200, %h199};
|
|
cvt.rn.f16.f32 %h201, %f460;
|
|
cvt.rn.f16.f32 %h202, %f459;
|
|
mov.b32 %hh293, {%h202, %h201};
|
|
cvt.rn.f16.f32 %h203, %f462;
|
|
cvt.rn.f16.f32 %h204, %f461;
|
|
mov.b32 %hh294, {%h204, %h203};
|
|
cvt.rn.f16.f32 %h205, %f464;
|
|
cvt.rn.f16.f32 %h206, %f463;
|
|
mov.b32 %hh295, {%h206, %h205};
|
|
cvt.rn.f16.f32 %h207, %f466;
|
|
cvt.rn.f16.f32 %h208, %f465;
|
|
mov.b32 %hh296, {%h208, %h207};
|
|
cvt.rn.f16.f32 %h209, %f468;
|
|
cvt.rn.f16.f32 %h210, %f467;
|
|
mov.b32 %hh297, {%h210, %h209};
|
|
cvt.rn.f16.f32 %h211, %f470;
|
|
cvt.rn.f16.f32 %h212, %f469;
|
|
mov.b32 %hh298, {%h212, %h211};
|
|
cvt.rn.f16.f32 %h213, %f472;
|
|
cvt.rn.f16.f32 %h214, %f471;
|
|
mov.b32 %hh299, {%h214, %h213};
|
|
cvt.rn.f16.f32 %h215, %f474;
|
|
cvt.rn.f16.f32 %h216, %f473;
|
|
mov.b32 %hh300, {%h216, %h215};
|
|
cvt.rn.f16.f32 %h217, %f476;
|
|
cvt.rn.f16.f32 %h218, %f475;
|
|
mov.b32 %hh301, {%h218, %h217};
|
|
cvt.rn.f16.f32 %h219, %f478;
|
|
cvt.rn.f16.f32 %h220, %f477;
|
|
mov.b32 %hh302, {%h220, %h219};
|
|
cvt.rn.f16.f32 %h221, %f480;
|
|
cvt.rn.f16.f32 %h222, %f479;
|
|
mov.b32 %hh303, {%h222, %h221};
|
|
cvt.rn.f16.f32 %h223, %f482;
|
|
cvt.rn.f16.f32 %h224, %f481;
|
|
mov.b32 %hh304, {%h224, %h223};
|
|
mov.b32 %r3547, %hh289;
|
|
mov.b32 %r3548, %hh290;
|
|
mov.b32 %r3549, %hh291;
|
|
mov.b32 %r3550, %hh292;
|
|
@%p19 st.global.v4.b32 [ %rd54 + 0 ], { %r3547, %r3548, %r3549, %r3550 };
|
|
mov.b32 %r3551, %hh293;
|
|
mov.b32 %r3552, %hh294;
|
|
mov.b32 %r3553, %hh295;
|
|
mov.b32 %r3554, %hh296;
|
|
@%p19 st.global.v4.b32 [ %rd55 + 0 ], { %r3551, %r3552, %r3553, %r3554 };
|
|
mov.b32 %r3555, %hh297;
|
|
mov.b32 %r3556, %hh298;
|
|
mov.b32 %r3557, %hh299;
|
|
mov.b32 %r3558, %hh300;
|
|
@%p19 st.global.v4.b32 [ %rd56 + 0 ], { %r3555, %r3556, %r3557, %r3558 };
|
|
mov.b32 %r3559, %hh301;
|
|
mov.b32 %r3560, %hh302;
|
|
mov.b32 %r3561, %hh303;
|
|
mov.b32 %r3562, %hh304;
|
|
@%p19 st.global.v4.b32 [ %rd57 + 0 ], { %r3559, %r3560, %r3561, %r3562 };
|
|
shl.b64 %rd66, %rd3, 1;
|
|
add.s64 %rd58, %rd1, %rd66;
|
|
shl.b64 %rd67, %rd4, 1;
|
|
add.s64 %rd59, %rd1, %rd67;
|
|
shl.b64 %rd68, %rd5, 1;
|
|
add.s64 %rd60, %rd1, %rd68;
|
|
shl.b64 %rd69, %rd6, 1;
|
|
add.s64 %rd61, %rd1, %rd69;
|
|
cvt.rn.f16.f32 %h225, %f420;
|
|
cvt.rn.f16.f32 %h226, %f419;
|
|
mov.b32 %hh305, {%h226, %h225};
|
|
cvt.rn.f16.f32 %h227, %f422;
|
|
cvt.rn.f16.f32 %h228, %f421;
|
|
mov.b32 %hh306, {%h228, %h227};
|
|
cvt.rn.f16.f32 %h229, %f424;
|
|
cvt.rn.f16.f32 %h230, %f423;
|
|
mov.b32 %hh307, {%h230, %h229};
|
|
cvt.rn.f16.f32 %h231, %f426;
|
|
cvt.rn.f16.f32 %h232, %f425;
|
|
mov.b32 %hh308, {%h232, %h231};
|
|
cvt.rn.f16.f32 %h233, %f428;
|
|
cvt.rn.f16.f32 %h234, %f427;
|
|
mov.b32 %hh309, {%h234, %h233};
|
|
cvt.rn.f16.f32 %h235, %f430;
|
|
cvt.rn.f16.f32 %h236, %f429;
|
|
mov.b32 %hh310, {%h236, %h235};
|
|
cvt.rn.f16.f32 %h237, %f432;
|
|
cvt.rn.f16.f32 %h238, %f431;
|
|
mov.b32 %hh311, {%h238, %h237};
|
|
cvt.rn.f16.f32 %h239, %f434;
|
|
cvt.rn.f16.f32 %h240, %f433;
|
|
mov.b32 %hh312, {%h240, %h239};
|
|
cvt.rn.f16.f32 %h241, %f436;
|
|
cvt.rn.f16.f32 %h242, %f435;
|
|
mov.b32 %hh313, {%h242, %h241};
|
|
cvt.rn.f16.f32 %h243, %f438;
|
|
cvt.rn.f16.f32 %h244, %f437;
|
|
mov.b32 %hh314, {%h244, %h243};
|
|
cvt.rn.f16.f32 %h245, %f440;
|
|
cvt.rn.f16.f32 %h246, %f439;
|
|
mov.b32 %hh315, {%h246, %h245};
|
|
cvt.rn.f16.f32 %h247, %f442;
|
|
cvt.rn.f16.f32 %h248, %f441;
|
|
mov.b32 %hh316, {%h248, %h247};
|
|
cvt.rn.f16.f32 %h249, %f444;
|
|
cvt.rn.f16.f32 %h250, %f443;
|
|
mov.b32 %hh317, {%h250, %h249};
|
|
cvt.rn.f16.f32 %h251, %f446;
|
|
cvt.rn.f16.f32 %h252, %f445;
|
|
mov.b32 %hh318, {%h252, %h251};
|
|
cvt.rn.f16.f32 %h253, %f448;
|
|
cvt.rn.f16.f32 %h254, %f447;
|
|
mov.b32 %hh319, {%h254, %h253};
|
|
cvt.rn.f16.f32 %h255, %f450;
|
|
cvt.rn.f16.f32 %h256, %f449;
|
|
mov.b32 %hh320, {%h256, %h255};
|
|
mov.b32 %r3563, %hh305;
|
|
mov.b32 %r3564, %hh306;
|
|
mov.b32 %r3565, %hh307;
|
|
mov.b32 %r3566, %hh308;
|
|
@%p19 st.global.v4.b32 [ %rd58 + 0 ], { %r3563, %r3564, %r3565, %r3566 };
|
|
mov.b32 %r3567, %hh309;
|
|
mov.b32 %r3568, %hh310;
|
|
mov.b32 %r3569, %hh311;
|
|
mov.b32 %r3570, %hh312;
|
|
@%p19 st.global.v4.b32 [ %rd59 + 0 ], { %r3567, %r3568, %r3569, %r3570 };
|
|
mov.b32 %r3571, %hh313;
|
|
mov.b32 %r3572, %hh314;
|
|
mov.b32 %r3573, %hh315;
|
|
mov.b32 %r3574, %hh316;
|
|
@%p19 st.global.v4.b32 [ %rd60 + 0 ], { %r3571, %r3572, %r3573, %r3574 };
|
|
mov.b32 %r3575, %hh317;
|
|
mov.b32 %r3576, %hh318;
|
|
mov.b32 %r3577, %hh319;
|
|
mov.b32 %r3578, %hh320;
|
|
@%p19 st.global.v4.b32 [ %rd61 + 0 ], { %r3575, %r3576, %r3577, %r3578 };
|
|
ret;
|
|
|
|
}
|