2765 lines
145 KiB
Plaintext
2765 lines
145 KiB
Plaintext
//
// Generated by LLVM NVPTX Back-End
//
|
|
|
|
// Module-level PTX directives. They precede the kernel entry and apply to the
// rest of this file.
//
// PTX ISA version this module is written against.
.version 7.4
|
|
// Target architecture the module is compiled for.
.target sm_86
|
|
// Addresses are 64 bits wide; the kernel's pointer parameters are declared .u64
// accordingly.
.address_size 64
|
|
|
|
// .globl _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27
|
|
// Shared-state-space byte array, declared extern and unsized ([]): its storage is
// not defined in this module (presumably sized at kernel launch -- confirm against
// the host-side launcher). The kernel takes its address with
// `mov.u32 ..., global_smem` and addresses sub-regions at fixed byte offsets from
// that base (16384, 32768, 49152, 50176, 65536).
// NOTE(review): declared with .align 1, yet the kernel issues 16-byte vector
// accesses (st.shared.v4.b32 / ld.shared.v4.u32) relative to this symbol; that
// relies on the actual base being at least 16-byte aligned -- confirm.
.extern .shared .align 1 .b8 global_smem[];
|
|
|
|
.visible .entry _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27(
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2,
|
|
.param .f32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_4,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_9,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10,
|
|
.param .u64 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_15,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_16,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_18,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_19,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_20,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_21,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23,
|
|
.param .u32 _bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24
|
|
)
|
|
.maxntid 256, 1, 1
|
|
{
|
|
.reg .pred %p<111>;
|
|
.reg .b16 %h<193>;
|
|
.reg .b32 %r<6177>;
|
|
.reg .b32 %hh<65>;
|
|
.reg .f32 %f<973>;
|
|
.reg .b64 %rd<139>;
|
|
|
|
ld.param.u32 %r380, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_24];
|
|
setp.lt.s32 %p1, %r380, 1;
|
|
@%p1 bra LBB0_6;
|
|
ld.param.u32 %r379, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_17];
|
|
ld.param.u32 %r378, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_14];
|
|
ld.param.u64 %rd55, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_5];
|
|
ld.param.f32 %f195, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_3];
|
|
ld.param.u64 %rd54, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_0];
|
|
mov.u32 %r1, %tid.x;
|
|
ld.param.u64 %rd56, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_1];
|
|
ld.param.u64 %rd57, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_2];
|
|
bfe.u32 %r2, %r1, 5, 2;
|
|
and.b32 %r3, %r1, 127;
|
|
bfe.u32 %r4, %r1, 3, 2;
|
|
ld.param.u64 %rd58, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_6];
|
|
shr.u32 %r381, %r1, 3;
|
|
ld.param.u64 %rd59, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_7];
|
|
and.b32 %r382, %r381, 124;
|
|
ld.param.u64 %rd60, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_8];
|
|
or.b32 %r5, %r382, %r4;
|
|
add.s32 %r6, %r5, 32;
|
|
ld.param.u64 %rd61, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_10];
|
|
add.s32 %r7, %r5, 64;
|
|
ld.param.u64 %rd62, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_11];
|
|
add.s32 %r8, %r5, 96;
|
|
ld.param.u32 %r383, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_12];
|
|
shl.b32 %r384, %r1, 1;
|
|
ld.param.u32 %r385, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_13];
|
|
and.b32 %r9, %r384, 6;
|
|
bfe.u32 %r10, %r1, 4, 1;
|
|
shr.u32 %r386, %r1, 4;
|
|
and.b32 %r387, %r386, 126;
|
|
or.b32 %r11, %r387, %r10;
|
|
add.s32 %r12, %r11, 16;
|
|
add.s32 %r13, %r11, 32;
|
|
add.s32 %r14, %r11, 48;
|
|
shr.u32 %r388, %r1, 1;
|
|
ld.param.u32 %r389, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_22];
|
|
and.b32 %r15, %r388, 112;
|
|
ld.param.u32 %r390, [_bwd_kernel_0d1d2d34d5d6d7d8d9d10d11d12d13d14d15c16d17d18d19c20d21d22d23c2425d26d27_param_23];
|
|
bfe.u32 %r16, %r1, 2, 3;
|
|
or.b32 %r17, %r15, %r16;
|
|
or.b32 %r18, %r17, 8;
|
|
and.b32 %r19, %r1, 7;
|
|
shl.b32 %r20, %r19, 3;
|
|
shl.b32 %r391, %r1, 2;
|
|
and.b32 %r21, %r391, 60;
|
|
mov.u32 %r392, %ctaid.x;
|
|
div.s32 %r395, %r392, %r389;
|
|
mul.lo.s32 %r396, %r395, %r389;
|
|
sub.s32 %r397, %r392, %r396;
|
|
mul.lo.s32 %r398, %r395, %r383;
|
|
mad.lo.s32 %r399, %r397, %r385, %r398;
|
|
cvt.s64.s32 %rd1, %r399;
|
|
mul.wide.s32 %rd63, %r399, 2;
|
|
add.s64 %rd2, %rd56, %rd63;
|
|
add.s64 %rd3, %rd57, %rd63;
|
|
mul.wide.s32 %rd64, %r399, 4;
|
|
add.s64 %rd4, %rd58, %rd64;
|
|
add.s64 %rd5, %rd59, %rd63;
|
|
add.s64 %rd6, %rd60, %rd63;
|
|
mul.lo.s32 %r400, %r392, %r390;
|
|
mul.wide.s32 %rd65, %r400, 4;
|
|
add.s64 %rd7, %rd62, %rd65;
|
|
add.s64 %rd8, %rd61, %rd65;
|
|
shl.b32 %r22, %r380, 7;
|
|
shl.b32 %r23, %r378, 7;
|
|
and.b32 %r402, %r5, 7;
|
|
xor.b32 %r403, %r402, %r19;
|
|
shl.b32 %r404, %r5, 7;
|
|
shl.b32 %r405, %r403, 4;
|
|
or.b32 %r406, %r405, %r404;
|
|
mov.u32 %r407, global_smem;
|
|
add.s32 %r24, %r407, %r406;
|
|
shl.b32 %r408, %r6, 7;
|
|
or.b32 %r409, %r408, %r405;
|
|
add.s32 %r25, %r407, %r409;
|
|
shl.b32 %r410, %r7, 7;
|
|
or.b32 %r411, %r410, %r405;
|
|
add.s32 %r26, %r407, %r411;
|
|
shl.b32 %r412, %r8, 7;
|
|
or.b32 %r413, %r412, %r405;
|
|
add.s32 %r27, %r407, %r413;
|
|
add.s32 %r414, %r407, 16384;
|
|
add.s32 %r28, %r414, %r406;
|
|
add.s32 %r29, %r414, %r409;
|
|
add.s32 %r30, %r414, %r411;
|
|
add.s32 %r31, %r414, %r413;
|
|
add.s32 %r415, %r407, 32768;
|
|
add.s32 %r32, %r415, %r406;
|
|
add.s32 %r33, %r415, %r409;
|
|
add.s32 %r34, %r415, %r411;
|
|
add.s32 %r35, %r415, %r413;
|
|
and.b32 %r416, %r4, 1;
|
|
shl.b32 %r417, %r416, 3;
|
|
or.b32 %r418, %r417, %r15;
|
|
or.b32 %r419, %r418, %r19;
|
|
xor.b32 %r420, %r10, %r19;
|
|
shl.b32 %r421, %r419, 6;
|
|
shl.b32 %r422, %r420, 3;
|
|
or.b32 %r423, %r421, %r422;
|
|
shl.b32 %r424, %r423, 1;
|
|
add.s32 %r704, %r415, %r424;
|
|
or.b32 %r425, %r10, 2;
|
|
xor.b32 %r426, %r425, %r19;
|
|
shl.b32 %r427, %r426, 3;
|
|
or.b32 %r428, %r421, %r427;
|
|
shl.b32 %r429, %r428, 1;
|
|
add.s32 %r709, %r415, %r429;
|
|
or.b32 %r430, %r10, 4;
|
|
xor.b32 %r431, %r430, %r19;
|
|
shl.b32 %r432, %r431, 3;
|
|
or.b32 %r433, %r421, %r432;
|
|
shl.b32 %r434, %r433, 1;
|
|
add.s32 %r714, %r415, %r434;
|
|
or.b32 %r435, %r10, 6;
|
|
xor.b32 %r436, %r435, %r19;
|
|
shl.b32 %r437, %r436, 3;
|
|
or.b32 %r438, %r421, %r437;
|
|
shl.b32 %r439, %r438, 1;
|
|
add.s32 %r719, %r415, %r439;
|
|
shl.b32 %r440, %r10, 3;
|
|
or.b32 %r441, %r440, %r19;
|
|
xor.b32 %r442, %r416, %r19;
|
|
shl.b32 %r443, %r442, 4;
|
|
shl.b32 %r444, %r441, 7;
|
|
or.b32 %r445, %r443, %r444;
|
|
add.s32 %r724, %r407, %r445;
|
|
or.b32 %r446, %r4, 2;
|
|
xor.b32 %r447, %r446, %r19;
|
|
shl.b32 %r448, %r447, 4;
|
|
or.b32 %r449, %r448, %r444;
|
|
add.s32 %r729, %r407, %r449;
|
|
or.b32 %r450, %r416, 4;
|
|
xor.b32 %r451, %r450, %r19;
|
|
shl.b32 %r452, %r451, 4;
|
|
or.b32 %r453, %r452, %r444;
|
|
add.s32 %r734, %r407, %r453;
|
|
or.b32 %r454, %r4, 6;
|
|
xor.b32 %r455, %r454, %r19;
|
|
shl.b32 %r456, %r455, 4;
|
|
or.b32 %r457, %r456, %r444;
|
|
add.s32 %r739, %r407, %r457;
|
|
add.s32 %r744, %r724, 2048;
|
|
add.s32 %r749, %r729, 2048;
|
|
add.s32 %r754, %r734, 2048;
|
|
add.s32 %r759, %r739, 2048;
|
|
add.s32 %r764, %r724, 4096;
|
|
add.s32 %r769, %r729, 4096;
|
|
add.s32 %r774, %r734, 4096;
|
|
add.s32 %r779, %r739, 4096;
|
|
add.s32 %r784, %r724, 6144;
|
|
add.s32 %r789, %r729, 6144;
|
|
add.s32 %r794, %r734, 6144;
|
|
add.s32 %r799, %r739, 6144;
|
|
add.s32 %r804, %r724, 8192;
|
|
add.s32 %r809, %r729, 8192;
|
|
add.s32 %r814, %r734, 8192;
|
|
add.s32 %r819, %r739, 8192;
|
|
add.s32 %r824, %r724, 10240;
|
|
add.s32 %r829, %r729, 10240;
|
|
add.s32 %r834, %r734, 10240;
|
|
add.s32 %r839, %r739, 10240;
|
|
add.s32 %r844, %r724, 12288;
|
|
add.s32 %r849, %r729, 12288;
|
|
add.s32 %r854, %r734, 12288;
|
|
add.s32 %r859, %r739, 12288;
|
|
add.s32 %r864, %r724, 14336;
|
|
add.s32 %r869, %r729, 14336;
|
|
add.s32 %r874, %r734, 14336;
|
|
add.s32 %r879, %r739, 14336;
|
|
shl.b32 %r458, %r3, 2;
|
|
add.s32 %r459, %r407, 49152;
|
|
add.s32 %r72, %r459, %r458;
|
|
shl.b32 %r460, %r17, 2;
|
|
add.s32 %r73, %r459, %r460;
|
|
add.s32 %r461, %r16, %r15;
|
|
shl.b32 %r462, %r461, 2;
|
|
add.s32 %r74, %r459, %r462;
|
|
add.s32 %r75, %r459, %r406;
|
|
add.s32 %r76, %r459, %r409;
|
|
add.s32 %r77, %r459, %r411;
|
|
add.s32 %r78, %r459, %r413;
|
|
shl.b32 %r463, %r17, 7;
|
|
shl.b32 %r464, %r16, 3;
|
|
or.b32 %r465, %r464, %r9;
|
|
or.b32 %r466, %r463, %r465;
|
|
shl.b32 %r467, %r466, 1;
|
|
add.s32 %r468, %r407, 65536;
|
|
add.s32 %r79, %r468, %r467;
|
|
shl.b32 %r469, %r18, 7;
|
|
or.b32 %r470, %r469, %r465;
|
|
shl.b32 %r471, %r470, 1;
|
|
add.s32 %r80, %r468, %r471;
|
|
xor.b32 %r472, %r466, 8;
|
|
shl.b32 %r473, %r472, 1;
|
|
add.s32 %r81, %r468, %r473;
|
|
xor.b32 %r474, %r470, 8;
|
|
shl.b32 %r475, %r474, 1;
|
|
add.s32 %r82, %r468, %r475;
|
|
xor.b32 %r476, %r466, 16;
|
|
shl.b32 %r477, %r476, 1;
|
|
add.s32 %r83, %r468, %r477;
|
|
xor.b32 %r478, %r470, 16;
|
|
shl.b32 %r479, %r478, 1;
|
|
add.s32 %r84, %r468, %r479;
|
|
xor.b32 %r480, %r466, 24;
|
|
shl.b32 %r481, %r480, 1;
|
|
add.s32 %r85, %r468, %r481;
|
|
xor.b32 %r482, %r470, 24;
|
|
shl.b32 %r483, %r482, 1;
|
|
add.s32 %r86, %r468, %r483;
|
|
xor.b32 %r484, %r466, 32;
|
|
shl.b32 %r485, %r484, 1;
|
|
add.s32 %r87, %r468, %r485;
|
|
xor.b32 %r486, %r470, 32;
|
|
shl.b32 %r487, %r486, 1;
|
|
add.s32 %r88, %r468, %r487;
|
|
xor.b32 %r488, %r466, 40;
|
|
shl.b32 %r489, %r488, 1;
|
|
add.s32 %r89, %r468, %r489;
|
|
xor.b32 %r490, %r470, 40;
|
|
shl.b32 %r491, %r490, 1;
|
|
add.s32 %r90, %r468, %r491;
|
|
xor.b32 %r492, %r466, 48;
|
|
shl.b32 %r493, %r492, 1;
|
|
add.s32 %r91, %r468, %r493;
|
|
xor.b32 %r494, %r470, 48;
|
|
shl.b32 %r495, %r494, 1;
|
|
add.s32 %r92, %r468, %r495;
|
|
xor.b32 %r496, %r466, 56;
|
|
shl.b32 %r497, %r496, 1;
|
|
add.s32 %r93, %r468, %r497;
|
|
xor.b32 %r498, %r470, 56;
|
|
shl.b32 %r499, %r498, 1;
|
|
add.s32 %r94, %r468, %r499;
|
|
xor.b32 %r500, %r466, 72;
|
|
shl.b32 %r501, %r500, 1;
|
|
add.s32 %r97, %r468, %r501;
|
|
xor.b32 %r502, %r470, 72;
|
|
shl.b32 %r503, %r502, 1;
|
|
add.s32 %r98, %r468, %r503;
|
|
xor.b32 %r504, %r466, 80;
|
|
shl.b32 %r505, %r504, 1;
|
|
add.s32 %r99, %r468, %r505;
|
|
xor.b32 %r506, %r470, 80;
|
|
shl.b32 %r507, %r506, 1;
|
|
add.s32 %r100, %r468, %r507;
|
|
xor.b32 %r508, %r466, 88;
|
|
shl.b32 %r509, %r508, 1;
|
|
add.s32 %r101, %r468, %r509;
|
|
xor.b32 %r510, %r470, 88;
|
|
shl.b32 %r511, %r510, 1;
|
|
add.s32 %r102, %r468, %r511;
|
|
xor.b32 %r512, %r466, 96;
|
|
shl.b32 %r513, %r512, 1;
|
|
add.s32 %r103, %r468, %r513;
|
|
xor.b32 %r514, %r470, 96;
|
|
shl.b32 %r515, %r514, 1;
|
|
add.s32 %r104, %r468, %r515;
|
|
xor.b32 %r516, %r466, 104;
|
|
shl.b32 %r517, %r516, 1;
|
|
add.s32 %r105, %r468, %r517;
|
|
xor.b32 %r518, %r470, 104;
|
|
shl.b32 %r519, %r518, 1;
|
|
add.s32 %r106, %r468, %r519;
|
|
xor.b32 %r520, %r466, 112;
|
|
shl.b32 %r521, %r520, 1;
|
|
add.s32 %r107, %r468, %r521;
|
|
xor.b32 %r522, %r470, 112;
|
|
shl.b32 %r523, %r522, 1;
|
|
add.s32 %r108, %r468, %r523;
|
|
xor.b32 %r524, %r466, 120;
|
|
shl.b32 %r525, %r524, 1;
|
|
add.s32 %r109, %r468, %r525;
|
|
xor.b32 %r526, %r470, 120;
|
|
shl.b32 %r527, %r526, 1;
|
|
add.s32 %r110, %r468, %r527;
|
|
shl.b32 %r528, %r2, 1;
|
|
or.b32 %r529, %r528, %r416;
|
|
xor.b32 %r530, %r529, %r19;
|
|
shl.b32 %r531, %r530, 4;
|
|
shl.b32 %r532, %r441, 8;
|
|
or.b32 %r533, %r531, %r532;
|
|
add.s32 %r1797, %r468, %r533;
|
|
add.s32 %r1802, %r1797, 4096;
|
|
add.s32 %r1807, %r1797, 8192;
|
|
add.s32 %r1812, %r1797, 12288;
|
|
add.s32 %r1817, %r1797, 16384;
|
|
add.s32 %r1822, %r1797, 20480;
|
|
add.s32 %r1827, %r1797, 24576;
|
|
add.s32 %r1832, %r1797, 28672;
|
|
or.b32 %r534, %r529, 8;
|
|
xor.b32 %r535, %r534, %r19;
|
|
shl.b32 %r536, %r535, 4;
|
|
or.b32 %r537, %r536, %r532;
|
|
add.s32 %r1837, %r468, %r537;
|
|
add.s32 %r1842, %r1837, 4096;
|
|
add.s32 %r1847, %r1837, 8192;
|
|
add.s32 %r1852, %r1837, 12288;
|
|
add.s32 %r1857, %r1837, 16384;
|
|
add.s32 %r1862, %r1837, 20480;
|
|
add.s32 %r1867, %r1837, 24576;
|
|
add.s32 %r1872, %r1837, 28672;
|
|
bfe.u32 %r538, %r1, 7, 1;
|
|
shl.b32 %r539, %r10, 1;
|
|
or.b32 %r540, %r539, %r538;
|
|
xor.b32 %r541, %r540, %r19;
|
|
shl.b32 %r542, %r416, 9;
|
|
shl.b32 %r543, %r19, 6;
|
|
or.b32 %r544, %r542, %r543;
|
|
shl.b32 %r545, %r541, 4;
|
|
shl.b32 %r546, %r544, 1;
|
|
or.b32 %r547, %r545, %r546;
|
|
add.s32 %r1877, %r459, %r547;
|
|
add.s32 %r1882, %r1877, 2048;
|
|
add.s32 %r1887, %r1877, 4096;
|
|
add.s32 %r1892, %r1877, 6144;
|
|
add.s32 %r1897, %r1877, 8192;
|
|
add.s32 %r1902, %r1877, 10240;
|
|
add.s32 %r1907, %r1877, 12288;
|
|
add.s32 %r1912, %r1877, 14336;
|
|
or.b32 %r548, %r540, 4;
|
|
xor.b32 %r549, %r548, %r19;
|
|
shl.b32 %r550, %r549, 4;
|
|
or.b32 %r551, %r550, %r546;
|
|
add.s32 %r1917, %r459, %r551;
|
|
add.s32 %r1922, %r1917, 2048;
|
|
add.s32 %r1927, %r1917, 4096;
|
|
add.s32 %r1932, %r1917, 6144;
|
|
add.s32 %r1937, %r1917, 8192;
|
|
add.s32 %r1942, %r1917, 10240;
|
|
add.s32 %r1947, %r1917, 12288;
|
|
add.s32 %r1952, %r1917, 14336;
|
|
add.s32 %r143, %r468, %r458;
|
|
add.s32 %r144, %r468, %r460;
|
|
add.s32 %r145, %r468, %r462;
|
|
add.s32 %r2854, %r459, %r424;
|
|
add.s32 %r2859, %r459, %r429;
|
|
add.s32 %r2864, %r459, %r434;
|
|
add.s32 %r2869, %r459, %r439;
|
|
add.s32 %r2874, %r414, %r445;
|
|
add.s32 %r2879, %r414, %r449;
|
|
add.s32 %r2884, %r414, %r453;
|
|
add.s32 %r2889, %r414, %r457;
|
|
add.s32 %r2894, %r2874, 2048;
|
|
add.s32 %r2899, %r2879, 2048;
|
|
add.s32 %r2904, %r2884, 2048;
|
|
add.s32 %r2909, %r2889, 2048;
|
|
add.s32 %r2914, %r2874, 4096;
|
|
add.s32 %r2919, %r2879, 4096;
|
|
add.s32 %r2924, %r2884, 4096;
|
|
add.s32 %r2929, %r2889, 4096;
|
|
add.s32 %r2934, %r2874, 6144;
|
|
add.s32 %r2939, %r2879, 6144;
|
|
add.s32 %r2944, %r2884, 6144;
|
|
add.s32 %r2949, %r2889, 6144;
|
|
add.s32 %r2954, %r2874, 8192;
|
|
add.s32 %r2959, %r2879, 8192;
|
|
add.s32 %r2964, %r2884, 8192;
|
|
add.s32 %r2969, %r2889, 8192;
|
|
add.s32 %r2974, %r2874, 10240;
|
|
add.s32 %r2979, %r2879, 10240;
|
|
add.s32 %r2984, %r2884, 10240;
|
|
add.s32 %r2989, %r2889, 10240;
|
|
add.s32 %r2994, %r2874, 12288;
|
|
add.s32 %r2999, %r2879, 12288;
|
|
add.s32 %r3004, %r2884, 12288;
|
|
add.s32 %r3009, %r2889, 12288;
|
|
add.s32 %r3014, %r2874, 14336;
|
|
add.s32 %r3019, %r2879, 14336;
|
|
add.s32 %r3024, %r2884, 14336;
|
|
add.s32 %r3029, %r2889, 14336;
|
|
add.s32 %r552, %r407, 50176;
|
|
add.s32 %r182, %r552, %r467;
|
|
add.s32 %r183, %r552, %r471;
|
|
add.s32 %r184, %r552, %r473;
|
|
add.s32 %r185, %r552, %r475;
|
|
add.s32 %r186, %r552, %r477;
|
|
add.s32 %r187, %r552, %r479;
|
|
add.s32 %r188, %r552, %r481;
|
|
add.s32 %r189, %r552, %r483;
|
|
add.s32 %r190, %r552, %r485;
|
|
add.s32 %r191, %r552, %r487;
|
|
add.s32 %r192, %r552, %r489;
|
|
add.s32 %r193, %r552, %r491;
|
|
add.s32 %r194, %r552, %r493;
|
|
add.s32 %r195, %r552, %r495;
|
|
add.s32 %r196, %r552, %r497;
|
|
add.s32 %r197, %r552, %r499;
|
|
add.s32 %r200, %r552, %r501;
|
|
add.s32 %r201, %r552, %r503;
|
|
add.s32 %r202, %r552, %r505;
|
|
add.s32 %r203, %r552, %r507;
|
|
add.s32 %r204, %r552, %r509;
|
|
add.s32 %r205, %r552, %r511;
|
|
add.s32 %r206, %r552, %r513;
|
|
add.s32 %r207, %r552, %r515;
|
|
add.s32 %r208, %r552, %r517;
|
|
add.s32 %r209, %r552, %r519;
|
|
add.s32 %r210, %r552, %r521;
|
|
add.s32 %r211, %r552, %r523;
|
|
add.s32 %r212, %r552, %r525;
|
|
add.s32 %r213, %r552, %r527;
|
|
add.s32 %r3930, %r552, %r533;
|
|
add.s32 %r3935, %r3930, 4096;
|
|
add.s32 %r3940, %r3930, 8192;
|
|
add.s32 %r3945, %r3930, 12288;
|
|
add.s32 %r3950, %r3930, 16384;
|
|
add.s32 %r3955, %r3930, 20480;
|
|
add.s32 %r3960, %r3930, 24576;
|
|
add.s32 %r3965, %r3930, 28672;
|
|
add.s32 %r3970, %r552, %r537;
|
|
add.s32 %r3975, %r3970, 4096;
|
|
add.s32 %r3980, %r3970, 8192;
|
|
add.s32 %r3985, %r3970, 12288;
|
|
add.s32 %r3990, %r3970, 16384;
|
|
add.s32 %r3995, %r3970, 20480;
|
|
add.s32 %r4000, %r3970, 24576;
|
|
add.s32 %r4005, %r3970, 28672;
|
|
add.s32 %r4010, %r415, %r547;
|
|
add.s32 %r4015, %r4010, 2048;
|
|
add.s32 %r4020, %r4010, 4096;
|
|
add.s32 %r4025, %r4010, 6144;
|
|
add.s32 %r4030, %r4010, 8192;
|
|
add.s32 %r4035, %r4010, 10240;
|
|
add.s32 %r4040, %r4010, 12288;
|
|
add.s32 %r4045, %r4010, 14336;
|
|
add.s32 %r4050, %r415, %r551;
|
|
add.s32 %r4055, %r4050, 2048;
|
|
add.s32 %r4060, %r4050, 4096;
|
|
add.s32 %r4065, %r4050, 6144;
|
|
add.s32 %r4070, %r4050, 8192;
|
|
add.s32 %r4075, %r4050, 10240;
|
|
add.s32 %r4080, %r4050, 12288;
|
|
add.s32 %r4085, %r4050, 14336;
|
|
mad.lo.s32 %r553, %r11, 68, %r21;
|
|
shl.b32 %r554, %r553, 2;
|
|
add.s32 %r246, %r415, %r554;
|
|
shl.b32 %r555, %r2, 4;
|
|
or.b32 %r556, %r555, %r16;
|
|
and.b32 %r558, %r386, 56;
|
|
or.b32 %r559, %r9, %r558;
|
|
mad.lo.s32 %r560, %r556, 68, %r559;
|
|
shl.b32 %r561, %r560, 2;
|
|
add.s32 %r247, %r415, %r561;
|
|
or.b32 %r562, %r556, 8;
|
|
mad.lo.s32 %r563, %r562, 68, %r559;
|
|
shl.b32 %r564, %r563, 2;
|
|
add.s32 %r248, %r415, %r564;
|
|
shl.b32 %r565, %r529, 10;
|
|
shl.b32 %r566, %r19, 7;
|
|
or.b32 %r567, %r565, %r566;
|
|
or.b32 %r568, %r567, %r422;
|
|
shl.b32 %r569, %r568, 1;
|
|
add.s32 %r5018, %r552, %r569;
|
|
or.b32 %r570, %r567, %r427;
|
|
shl.b32 %r571, %r570, 1;
|
|
add.s32 %r5023, %r552, %r571;
|
|
or.b32 %r572, %r567, %r432;
|
|
shl.b32 %r573, %r572, 1;
|
|
add.s32 %r5028, %r552, %r573;
|
|
or.b32 %r574, %r567, %r437;
|
|
shl.b32 %r575, %r574, 1;
|
|
add.s32 %r5033, %r552, %r575;
|
|
or.b32 %r576, %r10, 8;
|
|
xor.b32 %r577, %r576, %r19;
|
|
shl.b32 %r578, %r577, 4;
|
|
shl.b32 %r579, %r567, 1;
|
|
or.b32 %r580, %r578, %r579;
|
|
add.s32 %r5038, %r552, %r580;
|
|
or.b32 %r581, %r10, 10;
|
|
xor.b32 %r582, %r581, %r19;
|
|
shl.b32 %r583, %r582, 4;
|
|
or.b32 %r584, %r583, %r579;
|
|
add.s32 %r5043, %r552, %r584;
|
|
or.b32 %r585, %r10, 12;
|
|
xor.b32 %r586, %r585, %r19;
|
|
shl.b32 %r587, %r586, 4;
|
|
or.b32 %r588, %r587, %r579;
|
|
add.s32 %r5048, %r552, %r588;
|
|
or.b32 %r589, %r10, 14;
|
|
xor.b32 %r590, %r589, %r19;
|
|
shl.b32 %r591, %r590, 4;
|
|
or.b32 %r592, %r591, %r579;
|
|
add.s32 %r5053, %r552, %r592;
|
|
add.s32 %r5058, %r5018, 16384;
|
|
add.s32 %r5063, %r5023, 16384;
|
|
add.s32 %r5068, %r5028, 16384;
|
|
add.s32 %r5073, %r5033, 16384;
|
|
add.s32 %r5078, %r5038, 16384;
|
|
add.s32 %r5083, %r5043, 16384;
|
|
add.s32 %r5088, %r5048, 16384;
|
|
add.s32 %r5093, %r5053, 16384;
|
|
add.s32 %r5098, %r407, %r547;
|
|
add.s32 %r5103, %r5098, 2048;
|
|
add.s32 %r5108, %r5098, 4096;
|
|
add.s32 %r5113, %r5098, 6144;
|
|
add.s32 %r5118, %r5098, 8192;
|
|
add.s32 %r5123, %r5098, 10240;
|
|
add.s32 %r5128, %r5098, 12288;
|
|
add.s32 %r5133, %r5098, 14336;
|
|
add.s32 %r5138, %r407, %r551;
|
|
add.s32 %r5143, %r5138, 2048;
|
|
add.s32 %r5148, %r5138, 4096;
|
|
add.s32 %r5153, %r5138, 6144;
|
|
add.s32 %r5158, %r5138, 8192;
|
|
add.s32 %r5163, %r5138, 10240;
|
|
add.s32 %r5168, %r5138, 12288;
|
|
add.s32 %r5173, %r5138, 14336;
|
|
mad.lo.s32 %r593, %r556, 72, %r559;
|
|
shl.b32 %r594, %r593, 1;
|
|
add.s32 %r281, %r407, %r594;
|
|
add.s32 %r282, %r281, 1152;
|
|
mad.lo.s32 %r595, %r5, 72, %r20;
|
|
shl.b32 %r596, %r595, 1;
|
|
add.s32 %r283, %r407, %r596;
|
|
shl.b64 %rd9, %rd1, 1;
|
|
mad.lo.s32 %r6174, %r378, %r8, %r20;
|
|
mul.wide.s32 %rd10, %r23, 2;
|
|
mad.lo.s32 %r6173, %r378, %r7, %r20;
|
|
mad.lo.s32 %r6172, %r378, %r6, %r20;
|
|
mul.wide.s32 %rd11, %r23, 4;
|
|
mad.lo.s32 %r6171, %r378, %r14, %r21;
|
|
mad.lo.s32 %r6170, %r378, %r13, %r21;
|
|
mad.lo.s32 %r6169, %r378, %r12, %r21;
|
|
mov.u32 %r6175, 0;
|
|
mov.pred %p102, -1;
|
|
mov.f32 %f227, 0f00000000;
|
|
bra.uni LBB0_2;
|
|
LBB0_5:
|
|
add.s64 %rd113, %rd6, %rd129;
|
|
shl.b64 %rd122, %rd26, 1;
|
|
add.s64 %rd114, %rd6, %rd122;
|
|
shl.b64 %rd123, %rd27, 1;
|
|
add.s64 %rd115, %rd6, %rd123;
|
|
shl.b64 %rd124, %rd28, 1;
|
|
add.s64 %rd116, %rd6, %rd124;
|
|
cvt.rn.f16.f32 %h129, %f909;
|
|
cvt.rn.f16.f32 %h130, %f910;
|
|
cvt.rn.f16.f32 %h131, %f911;
|
|
cvt.rn.f16.f32 %h132, %f912;
|
|
cvt.rn.f16.f32 %h133, %f913;
|
|
cvt.rn.f16.f32 %h134, %f914;
|
|
cvt.rn.f16.f32 %h135, %f915;
|
|
cvt.rn.f16.f32 %h136, %f916;
|
|
cvt.rn.f16.f32 %h137, %f917;
|
|
cvt.rn.f16.f32 %h138, %f918;
|
|
cvt.rn.f16.f32 %h139, %f919;
|
|
cvt.rn.f16.f32 %h140, %f920;
|
|
cvt.rn.f16.f32 %h141, %f921;
|
|
cvt.rn.f16.f32 %h142, %f922;
|
|
cvt.rn.f16.f32 %h143, %f923;
|
|
cvt.rn.f16.f32 %h144, %f924;
|
|
cvt.rn.f16.f32 %h145, %f925;
|
|
cvt.rn.f16.f32 %h146, %f926;
|
|
cvt.rn.f16.f32 %h147, %f927;
|
|
cvt.rn.f16.f32 %h148, %f928;
|
|
cvt.rn.f16.f32 %h149, %f929;
|
|
cvt.rn.f16.f32 %h150, %f930;
|
|
cvt.rn.f16.f32 %h151, %f931;
|
|
cvt.rn.f16.f32 %h152, %f932;
|
|
cvt.rn.f16.f32 %h153, %f933;
|
|
cvt.rn.f16.f32 %h154, %f934;
|
|
cvt.rn.f16.f32 %h155, %f935;
|
|
cvt.rn.f16.f32 %h156, %f936;
|
|
cvt.rn.f16.f32 %h157, %f937;
|
|
cvt.rn.f16.f32 %h158, %f938;
|
|
cvt.rn.f16.f32 %h159, %f939;
|
|
cvt.rn.f16.f32 %h160, %f940;
|
|
st.shared.v2.b16 [%r281], {%h129, %h130};
|
|
st.shared.v2.b16 [%r282], {%h131, %h132};
|
|
st.shared.v2.b16 [%r281+32], {%h133, %h134};
|
|
st.shared.v2.b16 [%r282+32], {%h135, %h136};
|
|
st.shared.v2.b16 [%r281+64], {%h137, %h138};
|
|
st.shared.v2.b16 [%r282+64], {%h139, %h140};
|
|
st.shared.v2.b16 [%r281+96], {%h141, %h142};
|
|
st.shared.v2.b16 [%r282+96], {%h143, %h144};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6137, %r6138, %r6139, %r6140}, [%r283];
|
|
ld.shared.v4.u32 {%r6141, %r6142, %r6143, %r6144}, [%r283+4608];
|
|
bar.sync 0;
|
|
st.shared.v2.b16 [%r281], {%h145, %h146};
|
|
st.shared.v2.b16 [%r282], {%h147, %h148};
|
|
st.shared.v2.b16 [%r281+32], {%h149, %h150};
|
|
st.shared.v2.b16 [%r282+32], {%h151, %h152};
|
|
st.shared.v2.b16 [%r281+64], {%h153, %h154};
|
|
st.shared.v2.b16 [%r282+64], {%h155, %h156};
|
|
st.shared.v2.b16 [%r281+96], {%h157, %h158};
|
|
st.shared.v2.b16 [%r282+96], {%h159, %h160};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6145, %r6146, %r6147, %r6148}, [%r283];
|
|
ld.shared.v4.u32 {%r6149, %r6150, %r6151, %r6152}, [%r283+4608];
|
|
@%p102 st.global.v4.b32 [ %rd113 + 0 ], { %r6137, %r6138, %r6139, %r6140 };
|
|
@%p102 st.global.v4.b32 [ %rd114 + 0 ], { %r6141, %r6142, %r6143, %r6144 };
|
|
@%p102 st.global.v4.b32 [ %rd115 + 0 ], { %r6145, %r6146, %r6147, %r6148 };
|
|
@%p102 st.global.v4.b32 [ %rd116 + 0 ], { %r6149, %r6150, %r6151, %r6152 };
|
|
shl.b64 %rd125, %rd21, 1;
|
|
add.s64 %rd117, %rd5, %rd125;
|
|
shl.b64 %rd126, %rd22, 1;
|
|
add.s64 %rd118, %rd5, %rd126;
|
|
shl.b64 %rd127, %rd23, 1;
|
|
add.s64 %rd119, %rd5, %rd127;
|
|
shl.b64 %rd128, %rd24, 1;
|
|
add.s64 %rd120, %rd5, %rd128;
|
|
cvt.rn.f16.f32 %h161, %f941;
|
|
cvt.rn.f16.f32 %h162, %f942;
|
|
cvt.rn.f16.f32 %h163, %f943;
|
|
cvt.rn.f16.f32 %h164, %f944;
|
|
cvt.rn.f16.f32 %h165, %f945;
|
|
cvt.rn.f16.f32 %h166, %f946;
|
|
cvt.rn.f16.f32 %h167, %f947;
|
|
cvt.rn.f16.f32 %h168, %f948;
|
|
cvt.rn.f16.f32 %h169, %f949;
|
|
cvt.rn.f16.f32 %h170, %f950;
|
|
cvt.rn.f16.f32 %h171, %f951;
|
|
cvt.rn.f16.f32 %h172, %f952;
|
|
cvt.rn.f16.f32 %h173, %f953;
|
|
cvt.rn.f16.f32 %h174, %f954;
|
|
cvt.rn.f16.f32 %h175, %f955;
|
|
cvt.rn.f16.f32 %h176, %f956;
|
|
cvt.rn.f16.f32 %h177, %f957;
|
|
cvt.rn.f16.f32 %h178, %f958;
|
|
cvt.rn.f16.f32 %h179, %f959;
|
|
cvt.rn.f16.f32 %h180, %f960;
|
|
cvt.rn.f16.f32 %h181, %f961;
|
|
cvt.rn.f16.f32 %h182, %f962;
|
|
cvt.rn.f16.f32 %h183, %f963;
|
|
cvt.rn.f16.f32 %h184, %f964;
|
|
cvt.rn.f16.f32 %h185, %f965;
|
|
cvt.rn.f16.f32 %h186, %f966;
|
|
cvt.rn.f16.f32 %h187, %f967;
|
|
cvt.rn.f16.f32 %h188, %f968;
|
|
cvt.rn.f16.f32 %h189, %f969;
|
|
cvt.rn.f16.f32 %h190, %f970;
|
|
cvt.rn.f16.f32 %h191, %f971;
|
|
cvt.rn.f16.f32 %h192, %f972;
|
|
bar.sync 0;
|
|
st.shared.v2.b16 [%r281], {%h161, %h162};
|
|
st.shared.v2.b16 [%r282], {%h163, %h164};
|
|
st.shared.v2.b16 [%r281+32], {%h165, %h166};
|
|
st.shared.v2.b16 [%r282+32], {%h167, %h168};
|
|
st.shared.v2.b16 [%r281+64], {%h169, %h170};
|
|
st.shared.v2.b16 [%r282+64], {%h171, %h172};
|
|
st.shared.v2.b16 [%r281+96], {%h173, %h174};
|
|
st.shared.v2.b16 [%r282+96], {%h175, %h176};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6153, %r6154, %r6155, %r6156}, [%r283];
|
|
ld.shared.v4.u32 {%r6157, %r6158, %r6159, %r6160}, [%r283+4608];
|
|
bar.sync 0;
|
|
st.shared.v2.b16 [%r281], {%h177, %h178};
|
|
st.shared.v2.b16 [%r282], {%h179, %h180};
|
|
st.shared.v2.b16 [%r281+32], {%h181, %h182};
|
|
st.shared.v2.b16 [%r282+32], {%h183, %h184};
|
|
st.shared.v2.b16 [%r281+64], {%h185, %h186};
|
|
st.shared.v2.b16 [%r282+64], {%h187, %h188};
|
|
st.shared.v2.b16 [%r281+96], {%h189, %h190};
|
|
st.shared.v2.b16 [%r282+96], {%h191, %h192};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6161, %r6162, %r6163, %r6164}, [%r283];
|
|
ld.shared.v4.u32 {%r6165, %r6166, %r6167, %r6168}, [%r283+4608];
|
|
@%p102 st.global.v4.b32 [ %rd117 + 0 ], { %r6153, %r6154, %r6155, %r6156 };
|
|
@%p102 st.global.v4.b32 [ %rd118 + 0 ], { %r6157, %r6158, %r6159, %r6160 };
|
|
@%p102 st.global.v4.b32 [ %rd119 + 0 ], { %r6161, %r6162, %r6163, %r6164 };
|
|
@%p102 st.global.v4.b32 [ %rd120 + 0 ], { %r6165, %r6166, %r6167, %r6168 };
|
|
add.s32 %r6175, %r6175, 1;
|
|
add.s32 %r6174, %r6174, %r23;
|
|
add.s32 %r6173, %r6173, %r23;
|
|
add.s32 %r6172, %r6172, %r23;
|
|
add.s32 %r6171, %r6171, %r23;
|
|
add.s32 %r6170, %r6170, %r23;
|
|
add.s32 %r6169, %r6169, %r23;
|
|
setp.lt.s32 %p110, %r6175, %r380;
|
|
@%p110 bra LBB0_2;
|
|
bra.uni LBB0_6;
|
|
LBB0_2:
|
|
shl.b32 %r6176, %r6175, 7;
|
|
or.b32 %r629, %r6176, %r5;
|
|
add.s32 %r630, %r6176, %r6;
|
|
add.s32 %r631, %r6176, %r7;
|
|
add.s32 %r632, %r6176, %r8;
|
|
mad.lo.s32 %r633, %r629, %r379, %r20;
|
|
mad.lo.s32 %r634, %r630, %r379, %r20;
|
|
mad.lo.s32 %r635, %r631, %r379, %r20;
|
|
mad.lo.s32 %r636, %r632, %r379, %r20;
|
|
cvt.s64.s32 %rd21, %r633;
|
|
mul.wide.s32 %rd77, %r633, 2;
|
|
add.s64 %rd66, %rd2, %rd77;
|
|
cvt.s64.s32 %rd22, %r634;
|
|
mul.wide.s32 %rd78, %r634, 2;
|
|
add.s64 %rd67, %rd2, %rd78;
|
|
cvt.s64.s32 %rd23, %r635;
|
|
mul.wide.s32 %rd79, %r635, 2;
|
|
add.s64 %rd68, %rd2, %rd79;
|
|
cvt.s64.s32 %rd24, %r636;
|
|
mul.wide.s32 %rd80, %r636, 2;
|
|
add.s64 %rd69, %rd2, %rd80;
|
|
@%p102 ld.global.v4.b32 { %r641, %r642, %r643, %r644 }, [ %rd66 + 0 ];
|
|
mov.b32 %hh1, %r641;
|
|
mov.b32 %hh2, %r642;
|
|
mov.b32 %hh3, %r643;
|
|
mov.b32 %hh4, %r644;
|
|
@%p102 ld.global.v4.b32 { %r645, %r646, %r647, %r648 }, [ %rd67 + 0 ];
|
|
mov.b32 %hh5, %r645;
|
|
mov.b32 %hh6, %r646;
|
|
mov.b32 %hh7, %r647;
|
|
mov.b32 %hh8, %r648;
|
|
@%p102 ld.global.v4.b32 { %r649, %r650, %r651, %r652 }, [ %rd68 + 0 ];
|
|
mov.b32 %hh9, %r649;
|
|
mov.b32 %hh10, %r650;
|
|
mov.b32 %hh11, %r651;
|
|
mov.b32 %hh12, %r652;
|
|
@%p102 ld.global.v4.b32 { %r653, %r654, %r655, %r656 }, [ %rd69 + 0 ];
|
|
mov.b32 %hh13, %r653;
|
|
mov.b32 %hh14, %r654;
|
|
mov.b32 %hh15, %r655;
|
|
mov.b32 %hh16, %r656;
|
|
mad.lo.s32 %r637, %r629, %r378, %r20;
|
|
mad.lo.s32 %r638, %r630, %r378, %r20;
|
|
mad.lo.s32 %r639, %r631, %r378, %r20;
|
|
mad.lo.s32 %r640, %r632, %r378, %r20;
|
|
cvt.s64.s32 %rd25, %r637;
|
|
mul.wide.s32 %rd81, %r637, 2;
|
|
add.s64 %rd70, %rd3, %rd81;
|
|
cvt.s64.s32 %rd26, %r638;
|
|
mul.wide.s32 %rd82, %r638, 2;
|
|
add.s64 %rd71, %rd3, %rd82;
|
|
cvt.s64.s32 %rd27, %r639;
|
|
mul.wide.s32 %rd83, %r639, 2;
|
|
add.s64 %rd72, %rd3, %rd83;
|
|
cvt.s64.s32 %rd28, %r640;
|
|
mul.wide.s32 %rd84, %r640, 2;
|
|
add.s64 %rd73, %rd3, %rd84;
|
|
@%p102 ld.global.v4.b32 { %r657, %r658, %r659, %r660 }, [ %rd70 + 0 ];
|
|
mov.b32 %hh17, %r657;
|
|
mov.b32 %hh18, %r658;
|
|
mov.b32 %hh19, %r659;
|
|
mov.b32 %hh20, %r660;
|
|
@%p102 ld.global.v4.b32 { %r661, %r662, %r663, %r664 }, [ %rd71 + 0 ];
|
|
mov.b32 %hh21, %r661;
|
|
mov.b32 %hh22, %r662;
|
|
mov.b32 %hh23, %r663;
|
|
mov.b32 %hh24, %r664;
|
|
@%p102 ld.global.v4.b32 { %r665, %r666, %r667, %r668 }, [ %rd72 + 0 ];
|
|
mov.b32 %hh25, %r665;
|
|
mov.b32 %hh26, %r666;
|
|
mov.b32 %hh27, %r667;
|
|
mov.b32 %hh28, %r668;
|
|
@%p102 ld.global.v4.b32 { %r669, %r670, %r671, %r672 }, [ %rd73 + 0 ];
|
|
mov.b32 %hh29, %r669;
|
|
mov.b32 %hh30, %r670;
|
|
mov.b32 %hh31, %r671;
|
|
mov.b32 %hh32, %r672;
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r24], {%r641, %r642, %r643, %r644};
|
|
st.shared.v4.b32 [%r25], {%r645, %r646, %r647, %r648};
|
|
st.shared.v4.b32 [%r26], {%r649, %r650, %r651, %r652};
|
|
st.shared.v4.b32 [%r27], {%r653, %r654, %r655, %r656};
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r28], {%r657, %r658, %r659, %r660};
|
|
st.shared.v4.b32 [%r29], {%r661, %r662, %r663, %r664};
|
|
st.shared.v4.b32 [%r30], {%r665, %r666, %r667, %r668};
|
|
st.shared.v4.b32 [%r31], {%r669, %r670, %r671, %r672};
|
|
bar.sync 0;
|
|
setp.ge.s32 %p10, %r6176, %r22;
|
|
shl.b64 %rd129, %rd25, 1;
|
|
mov.f32 %f909, %f227;
|
|
mov.f32 %f910, %f227;
|
|
mov.f32 %f911, %f227;
|
|
mov.f32 %f912, %f227;
|
|
mov.f32 %f913, %f227;
|
|
mov.f32 %f914, %f227;
|
|
mov.f32 %f915, %f227;
|
|
mov.f32 %f916, %f227;
|
|
mov.f32 %f917, %f227;
|
|
mov.f32 %f918, %f227;
|
|
mov.f32 %f919, %f227;
|
|
mov.f32 %f920, %f227;
|
|
mov.f32 %f921, %f227;
|
|
mov.f32 %f922, %f227;
|
|
mov.f32 %f923, %f227;
|
|
mov.f32 %f924, %f227;
|
|
mov.f32 %f925, %f227;
|
|
mov.f32 %f926, %f227;
|
|
mov.f32 %f927, %f227;
|
|
mov.f32 %f928, %f227;
|
|
mov.f32 %f929, %f227;
|
|
mov.f32 %f930, %f227;
|
|
mov.f32 %f931, %f227;
|
|
mov.f32 %f932, %f227;
|
|
mov.f32 %f933, %f227;
|
|
mov.f32 %f934, %f227;
|
|
mov.f32 %f935, %f227;
|
|
mov.f32 %f936, %f227;
|
|
mov.f32 %f937, %f227;
|
|
mov.f32 %f938, %f227;
|
|
mov.f32 %f939, %f227;
|
|
mov.f32 %f940, %f227;
|
|
mov.f32 %f941, %f227;
|
|
mov.f32 %f942, %f227;
|
|
mov.f32 %f943, %f227;
|
|
mov.f32 %f944, %f227;
|
|
mov.f32 %f945, %f227;
|
|
mov.f32 %f946, %f227;
|
|
mov.f32 %f947, %f227;
|
|
mov.f32 %f948, %f227;
|
|
mov.f32 %f949, %f227;
|
|
mov.f32 %f950, %f227;
|
|
mov.f32 %f951, %f227;
|
|
mov.f32 %f952, %f227;
|
|
mov.f32 %f953, %f227;
|
|
mov.f32 %f954, %f227;
|
|
mov.f32 %f955, %f227;
|
|
mov.f32 %f956, %f227;
|
|
mov.f32 %f957, %f227;
|
|
mov.f32 %f958, %f227;
|
|
mov.f32 %f959, %f227;
|
|
mov.f32 %f960, %f227;
|
|
mov.f32 %f961, %f227;
|
|
mov.f32 %f962, %f227;
|
|
mov.f32 %f963, %f227;
|
|
mov.f32 %f964, %f227;
|
|
mov.f32 %f965, %f227;
|
|
mov.f32 %f966, %f227;
|
|
mov.f32 %f967, %f227;
|
|
mov.f32 %f968, %f227;
|
|
mov.f32 %f969, %f227;
|
|
mov.f32 %f970, %f227;
|
|
mov.f32 %f971, %f227;
|
|
mov.f32 %f972, %f227;
|
|
@%p10 bra LBB0_5;
|
|
mul.wide.s32 %rd74, %r6174, 2;
|
|
add.s64 %rd138, %rd55, %rd74;
|
|
mul.wide.s32 %rd75, %r6173, 2;
|
|
add.s64 %rd137, %rd55, %rd75;
|
|
mul.wide.s32 %rd76, %r6172, 2;
|
|
add.s64 %rd136, %rd55, %rd76;
|
|
add.s64 %rd134, %rd54, %rd74;
|
|
add.s64 %rd133, %rd54, %rd75;
|
|
add.s64 %rd132, %rd54, %rd76;
|
|
mul.wide.s32 %rd18, %r6171, 4;
|
|
mul.wide.s32 %rd19, %r6170, 4;
|
|
mul.wide.s32 %rd20, %r6169, 4;
|
|
or.b32 %r300, %r6176, %r11;
|
|
or.b32 %r301, %r6176, %r9;
|
|
or.b32 %r302, %r301, 1;
|
|
or.b32 %r303, %r301, 8;
|
|
or.b32 %r304, %r301, 9;
|
|
or.b32 %r336, %r301, 64;
|
|
or.b32 %r335, %r301, 65;
|
|
or.b32 %r332, %r301, 72;
|
|
or.b32 %r331, %r301, 73;
|
|
or.b32 %r328, %r301, 80;
|
|
or.b32 %r327, %r301, 81;
|
|
or.b32 %r324, %r301, 88;
|
|
or.b32 %r323, %r301, 89;
|
|
or.b32 %r320, %r301, 96;
|
|
or.b32 %r319, %r301, 97;
|
|
or.b32 %r316, %r301, 104;
|
|
or.b32 %r315, %r301, 105;
|
|
or.b32 %r312, %r301, 112;
|
|
or.b32 %r311, %r301, 113;
|
|
or.b32 %r308, %r301, 120;
|
|
or.b32 %r307, %r301, 121;
|
|
or.b32 %r352, %r301, 32;
|
|
or.b32 %r351, %r301, 33;
|
|
or.b32 %r348, %r301, 40;
|
|
or.b32 %r347, %r301, 41;
|
|
or.b32 %r344, %r301, 48;
|
|
or.b32 %r343, %r301, 49;
|
|
or.b32 %r340, %r301, 56;
|
|
or.b32 %r339, %r301, 57;
|
|
or.b32 %r360, %r301, 16;
|
|
or.b32 %r359, %r301, 17;
|
|
or.b32 %r356, %r301, 24;
|
|
or.b32 %r355, %r301, 25;
|
|
add.s32 %r673, %r300, 112;
|
|
mul.lo.s32 %r674, %r673, %r378;
|
|
add.s32 %r675, %r674, %r21;
|
|
shl.b32 %r676, %r378, 4;
|
|
sub.s32 %r677, %r674, %r676;
|
|
add.s32 %r678, %r677, %r21;
|
|
sub.s32 %r679, %r677, %r676;
|
|
add.s32 %r680, %r679, %r21;
|
|
sub.s32 %r681, %r679, %r676;
|
|
add.s32 %r682, %r681, %r21;
|
|
mad.lo.s32 %r683, %r300, %r378, %r21;
|
|
add.s64 %rd135, %rd55, %rd129;
|
|
add.s64 %rd131, %rd54, %rd129;
|
|
mul.wide.s32 %rd31, %r675, 4;
|
|
mul.wide.s32 %rd32, %r678, 4;
|
|
mul.wide.s32 %rd33, %r680, 4;
|
|
mul.wide.s32 %rd34, %r682, 4;
|
|
mul.wide.s32 %rd35, %r683, 4;
|
|
mov.f32 %f259, 0f00000000;
|
|
mov.u64 %rd130, %rd4;
|
|
mov.f32 %f941, %f259;
|
|
mov.f32 %f942, %f259;
|
|
mov.f32 %f943, %f259;
|
|
mov.f32 %f944, %f259;
|
|
mov.f32 %f945, %f259;
|
|
mov.f32 %f946, %f259;
|
|
mov.f32 %f947, %f259;
|
|
mov.f32 %f948, %f259;
|
|
mov.f32 %f949, %f259;
|
|
mov.f32 %f950, %f259;
|
|
mov.f32 %f951, %f259;
|
|
mov.f32 %f952, %f259;
|
|
mov.f32 %f953, %f259;
|
|
mov.f32 %f954, %f259;
|
|
mov.f32 %f955, %f259;
|
|
mov.f32 %f956, %f259;
|
|
mov.f32 %f957, %f259;
|
|
mov.f32 %f958, %f259;
|
|
mov.f32 %f959, %f259;
|
|
mov.f32 %f960, %f259;
|
|
mov.f32 %f961, %f259;
|
|
mov.f32 %f962, %f259;
|
|
mov.f32 %f963, %f259;
|
|
mov.f32 %f964, %f259;
|
|
mov.f32 %f965, %f259;
|
|
mov.f32 %f966, %f259;
|
|
mov.f32 %f967, %f259;
|
|
mov.f32 %f968, %f259;
|
|
mov.f32 %f969, %f259;
|
|
mov.f32 %f970, %f259;
|
|
mov.f32 %f971, %f259;
|
|
mov.f32 %f972, %f259;
|
|
mov.f32 %f909, %f259;
|
|
mov.f32 %f910, %f259;
|
|
mov.f32 %f911, %f259;
|
|
mov.f32 %f912, %f259;
|
|
mov.f32 %f913, %f259;
|
|
mov.f32 %f914, %f259;
|
|
mov.f32 %f915, %f259;
|
|
mov.f32 %f916, %f259;
|
|
mov.f32 %f917, %f259;
|
|
mov.f32 %f918, %f259;
|
|
mov.f32 %f919, %f259;
|
|
mov.f32 %f920, %f259;
|
|
mov.f32 %f921, %f259;
|
|
mov.f32 %f922, %f259;
|
|
mov.f32 %f923, %f259;
|
|
mov.f32 %f924, %f259;
|
|
mov.f32 %f925, %f259;
|
|
mov.f32 %f926, %f259;
|
|
mov.f32 %f927, %f259;
|
|
mov.f32 %f928, %f259;
|
|
mov.f32 %f929, %f259;
|
|
mov.f32 %f930, %f259;
|
|
mov.f32 %f931, %f259;
|
|
mov.f32 %f932, %f259;
|
|
mov.f32 %f933, %f259;
|
|
mov.f32 %f934, %f259;
|
|
mov.f32 %f935, %f259;
|
|
mov.f32 %f936, %f259;
|
|
mov.f32 %f937, %f259;
|
|
mov.f32 %f938, %f259;
|
|
mov.f32 %f939, %f259;
|
|
mov.f32 %f940, %f259;
|
|
LBB0_4:
|
|
add.s64 %rd94, %rd138, %rd9;
|
|
add.s64 %rd93, %rd137, %rd9;
|
|
add.s64 %rd92, %rd136, %rd9;
|
|
add.s64 %rd91, %rd135, %rd9;
|
|
add.s64 %rd89, %rd134, %rd9;
|
|
add.s64 %rd88, %rd133, %rd9;
|
|
add.s64 %rd87, %rd132, %rd9;
|
|
add.s64 %rd86, %rd131, %rd9;
|
|
add.s64 %rd103, %rd130, %rd31;
|
|
add.s64 %rd102, %rd130, %rd32;
|
|
add.s64 %rd101, %rd130, %rd33;
|
|
add.s64 %rd100, %rd130, %rd34;
|
|
add.s64 %rd99, %rd130, %rd18;
|
|
add.s64 %rd98, %rd130, %rd19;
|
|
add.s64 %rd97, %rd130, %rd20;
|
|
add.s64 %rd96, %rd130, %rd35;
|
|
or.b32 %r6102, %r6176, %r3;
|
|
@%p102 ld.global.v4.b32 { %r6103, %r6104, %r6105, %r6106 }, [ %rd86 + 0 ];
|
|
mov.b32 %hh33, %r6103;
|
|
mov.b32 %hh34, %r6104;
|
|
mov.b32 %hh35, %r6105;
|
|
mov.b32 %hh36, %r6106;
|
|
@%p102 ld.global.v4.b32 { %r6107, %r6108, %r6109, %r6110 }, [ %rd87 + 0 ];
|
|
mov.b32 %hh37, %r6107;
|
|
mov.b32 %hh38, %r6108;
|
|
mov.b32 %hh39, %r6109;
|
|
mov.b32 %hh40, %r6110;
|
|
@%p102 ld.global.v4.b32 { %r6111, %r6112, %r6113, %r6114 }, [ %rd88 + 0 ];
|
|
mov.b32 %hh41, %r6111;
|
|
mov.b32 %hh42, %r6112;
|
|
mov.b32 %hh43, %r6113;
|
|
mov.b32 %hh44, %r6114;
|
|
@%p102 ld.global.v4.b32 { %r6115, %r6116, %r6117, %r6118 }, [ %rd89 + 0 ];
|
|
mov.b32 %hh45, %r6115;
|
|
mov.b32 %hh46, %r6116;
|
|
mov.b32 %hh47, %r6117;
|
|
mov.b32 %hh48, %r6118;
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r32], {%r6103, %r6104, %r6105, %r6106};
|
|
st.shared.v4.b32 [%r33], {%r6107, %r6108, %r6109, %r6110};
|
|
st.shared.v4.b32 [%r34], {%r6111, %r6112, %r6113, %r6114};
|
|
st.shared.v4.b32 [%r35], {%r6115, %r6116, %r6117, %r6118};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r884, %r885, %r886, %r887 }, [ %r704 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1108, %r1109, %r1110, %r1111 }, [ %r709 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1332, %r1333, %r1334, %r1335 }, [ %r714 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r1556, %r1557, %r1558, %r1559 }, [ %r719 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r720, %r721, %r722, %r723 }, [ %r724 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r725, %r726, %r727, %r728 }, [ %r729 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r730, %r731, %r732, %r733 }, [ %r734 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r735, %r736, %r737, %r738 }, [ %r739 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r740, %r741, %r742, %r743 }, [ %r744 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r745, %r746, %r747, %r748 }, [ %r749 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r750, %r751, %r752, %r753 }, [ %r754 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r755, %r756, %r757, %r758 }, [ %r759 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r760, %r761, %r762, %r763 }, [ %r764 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r765, %r766, %r767, %r768 }, [ %r769 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r770, %r771, %r772, %r773 }, [ %r774 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r775, %r776, %r777, %r778 }, [ %r779 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r780, %r781, %r782, %r783 }, [ %r784 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r785, %r786, %r787, %r788 }, [ %r789 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r790, %r791, %r792, %r793 }, [ %r794 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r795, %r796, %r797, %r798 }, [ %r799 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r800, %r801, %r802, %r803 }, [ %r804 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r805, %r806, %r807, %r808 }, [ %r809 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r810, %r811, %r812, %r813 }, [ %r814 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r815, %r816, %r817, %r818 }, [ %r819 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r820, %r821, %r822, %r823 }, [ %r824 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r825, %r826, %r827, %r828 }, [ %r829 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r830, %r831, %r832, %r833 }, [ %r834 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r835, %r836, %r837, %r838 }, [ %r839 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r840, %r841, %r842, %r843 }, [ %r844 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r845, %r846, %r847, %r848 }, [ %r849 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r850, %r851, %r852, %r853 }, [ %r854 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r855, %r856, %r857, %r858 }, [ %r859 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r860, %r861, %r862, %r863 }, [ %r864 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r865, %r866, %r867, %r868 }, [ %r869 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r870, %r871, %r872, %r873 }, [ %r874 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r875, %r876, %r877, %r878 }, [ %r879 + 0 ];
|
|
mov.u32 %r1317, 0;
|
|
mov.u32 %r1104, %r1317;
|
|
mov.u32 %r1105, %r1317;
|
|
mov.u32 %r1106, %r1317;
|
|
mov.u32 %r1107, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r884, %r885, %r886, %r887 }, { %r720, %r721 }, { %r1104, %r1105, %r1106, %r1107 };
|
|
mov.u32 %r1118, %r1317;
|
|
mov.u32 %r1119, %r1317;
|
|
mov.u32 %r1120, %r1317;
|
|
mov.u32 %r1121, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r884, %r885, %r886, %r887 }, { %r722, %r723 }, { %r1118, %r1119, %r1120, %r1121 };
|
|
mov.u32 %r1132, %r1317;
|
|
mov.u32 %r1133, %r1317;
|
|
mov.u32 %r1134, %r1317;
|
|
mov.u32 %r1135, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r884, %r885, %r886, %r887 }, { %r740, %r741 }, { %r1132, %r1133, %r1134, %r1135 };
|
|
mov.u32 %r1146, %r1317;
|
|
mov.u32 %r1147, %r1317;
|
|
mov.u32 %r1148, %r1317;
|
|
mov.u32 %r1149, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r884, %r885, %r886, %r887 }, { %r742, %r743 }, { %r1146, %r1147, %r1148, %r1149 };
|
|
mov.u32 %r1160, %r1317;
|
|
mov.u32 %r1161, %r1317;
|
|
mov.u32 %r1162, %r1317;
|
|
mov.u32 %r1163, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r884, %r885, %r886, %r887 }, { %r760, %r761 }, { %r1160, %r1161, %r1162, %r1163 };
|
|
mov.u32 %r1174, %r1317;
|
|
mov.u32 %r1175, %r1317;
|
|
mov.u32 %r1176, %r1317;
|
|
mov.u32 %r1177, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r884, %r885, %r886, %r887 }, { %r762, %r763 }, { %r1174, %r1175, %r1176, %r1177 };
|
|
mov.u32 %r1188, %r1317;
|
|
mov.u32 %r1189, %r1317;
|
|
mov.u32 %r1190, %r1317;
|
|
mov.u32 %r1191, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r884, %r885, %r886, %r887 }, { %r780, %r781 }, { %r1188, %r1189, %r1190, %r1191 };
|
|
mov.u32 %r1202, %r1317;
|
|
mov.u32 %r1203, %r1317;
|
|
mov.u32 %r1204, %r1317;
|
|
mov.u32 %r1205, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r884, %r885, %r886, %r887 }, { %r782, %r783 }, { %r1202, %r1203, %r1204, %r1205 };
|
|
mov.u32 %r1216, %r1317;
|
|
mov.u32 %r1217, %r1317;
|
|
mov.u32 %r1218, %r1317;
|
|
mov.u32 %r1219, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r884, %r885, %r886, %r887 }, { %r800, %r801 }, { %r1216, %r1217, %r1218, %r1219 };
|
|
mov.u32 %r1230, %r1317;
|
|
mov.u32 %r1231, %r1317;
|
|
mov.u32 %r1232, %r1317;
|
|
mov.u32 %r1233, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r884, %r885, %r886, %r887 }, { %r802, %r803 }, { %r1230, %r1231, %r1232, %r1233 };
|
|
mov.u32 %r1244, %r1317;
|
|
mov.u32 %r1245, %r1317;
|
|
mov.u32 %r1246, %r1317;
|
|
mov.u32 %r1247, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r884, %r885, %r886, %r887 }, { %r820, %r821 }, { %r1244, %r1245, %r1246, %r1247 };
|
|
mov.u32 %r1258, %r1317;
|
|
mov.u32 %r1259, %r1317;
|
|
mov.u32 %r1260, %r1317;
|
|
mov.u32 %r1261, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r884, %r885, %r886, %r887 }, { %r822, %r823 }, { %r1258, %r1259, %r1260, %r1261 };
|
|
mov.u32 %r1272, %r1317;
|
|
mov.u32 %r1273, %r1317;
|
|
mov.u32 %r1274, %r1317;
|
|
mov.u32 %r1275, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r884, %r885, %r886, %r887 }, { %r840, %r841 }, { %r1272, %r1273, %r1274, %r1275 };
|
|
mov.u32 %r1286, %r1317;
|
|
mov.u32 %r1287, %r1317;
|
|
mov.u32 %r1288, %r1317;
|
|
mov.u32 %r1289, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r884, %r885, %r886, %r887 }, { %r842, %r843 }, { %r1286, %r1287, %r1288, %r1289 };
|
|
mov.u32 %r1300, %r1317;
|
|
mov.u32 %r1301, %r1317;
|
|
mov.u32 %r1302, %r1317;
|
|
mov.u32 %r1303, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r884, %r885, %r886, %r887 }, { %r860, %r861 }, { %r1300, %r1301, %r1302, %r1303 };
|
|
mov.u32 %r1314, %r1317;
|
|
mov.u32 %r1315, %r1317;
|
|
mov.u32 %r1316, %r1317;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r884, %r885, %r886, %r887 }, { %r862, %r863 }, { %r1314, %r1315, %r1316, %r1317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1108, %r1109, %r1110, %r1111 }, { %r725, %r726 }, { %r1104, %r1105, %r1106, %r1107 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1108, %r1109, %r1110, %r1111 }, { %r727, %r728 }, { %r1118, %r1119, %r1120, %r1121 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1108, %r1109, %r1110, %r1111 }, { %r745, %r746 }, { %r1132, %r1133, %r1134, %r1135 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1108, %r1109, %r1110, %r1111 }, { %r747, %r748 }, { %r1146, %r1147, %r1148, %r1149 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1108, %r1109, %r1110, %r1111 }, { %r765, %r766 }, { %r1160, %r1161, %r1162, %r1163 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1108, %r1109, %r1110, %r1111 }, { %r767, %r768 }, { %r1174, %r1175, %r1176, %r1177 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1108, %r1109, %r1110, %r1111 }, { %r785, %r786 }, { %r1188, %r1189, %r1190, %r1191 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1108, %r1109, %r1110, %r1111 }, { %r787, %r788 }, { %r1202, %r1203, %r1204, %r1205 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1108, %r1109, %r1110, %r1111 }, { %r805, %r806 }, { %r1216, %r1217, %r1218, %r1219 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1108, %r1109, %r1110, %r1111 }, { %r807, %r808 }, { %r1230, %r1231, %r1232, %r1233 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1108, %r1109, %r1110, %r1111 }, { %r825, %r826 }, { %r1244, %r1245, %r1246, %r1247 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1108, %r1109, %r1110, %r1111 }, { %r827, %r828 }, { %r1258, %r1259, %r1260, %r1261 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1108, %r1109, %r1110, %r1111 }, { %r845, %r846 }, { %r1272, %r1273, %r1274, %r1275 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1108, %r1109, %r1110, %r1111 }, { %r847, %r848 }, { %r1286, %r1287, %r1288, %r1289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1108, %r1109, %r1110, %r1111 }, { %r865, %r866 }, { %r1300, %r1301, %r1302, %r1303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1108, %r1109, %r1110, %r1111 }, { %r867, %r868 }, { %r1314, %r1315, %r1316, %r1317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1332, %r1333, %r1334, %r1335 }, { %r730, %r731 }, { %r1104, %r1105, %r1106, %r1107 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1332, %r1333, %r1334, %r1335 }, { %r732, %r733 }, { %r1118, %r1119, %r1120, %r1121 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1332, %r1333, %r1334, %r1335 }, { %r750, %r751 }, { %r1132, %r1133, %r1134, %r1135 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1332, %r1333, %r1334, %r1335 }, { %r752, %r753 }, { %r1146, %r1147, %r1148, %r1149 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1332, %r1333, %r1334, %r1335 }, { %r770, %r771 }, { %r1160, %r1161, %r1162, %r1163 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1332, %r1333, %r1334, %r1335 }, { %r772, %r773 }, { %r1174, %r1175, %r1176, %r1177 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1332, %r1333, %r1334, %r1335 }, { %r790, %r791 }, { %r1188, %r1189, %r1190, %r1191 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1332, %r1333, %r1334, %r1335 }, { %r792, %r793 }, { %r1202, %r1203, %r1204, %r1205 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1332, %r1333, %r1334, %r1335 }, { %r810, %r811 }, { %r1216, %r1217, %r1218, %r1219 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1332, %r1333, %r1334, %r1335 }, { %r812, %r813 }, { %r1230, %r1231, %r1232, %r1233 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1332, %r1333, %r1334, %r1335 }, { %r830, %r831 }, { %r1244, %r1245, %r1246, %r1247 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1332, %r1333, %r1334, %r1335 }, { %r832, %r833 }, { %r1258, %r1259, %r1260, %r1261 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1332, %r1333, %r1334, %r1335 }, { %r850, %r851 }, { %r1272, %r1273, %r1274, %r1275 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1332, %r1333, %r1334, %r1335 }, { %r852, %r853 }, { %r1286, %r1287, %r1288, %r1289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1332, %r1333, %r1334, %r1335 }, { %r870, %r871 }, { %r1300, %r1301, %r1302, %r1303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1332, %r1333, %r1334, %r1335 }, { %r872, %r873 }, { %r1314, %r1315, %r1316, %r1317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1104, %r1105, %r1106, %r1107 }, { %r1556, %r1557, %r1558, %r1559 }, { %r735, %r736 }, { %r1104, %r1105, %r1106, %r1107 };
|
|
mov.b32 %f388, %r1107;
|
|
mov.b32 %f389, %r1106;
|
|
mov.b32 %f390, %r1105;
|
|
mov.b32 %f391, %r1104;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1118, %r1119, %r1120, %r1121 }, { %r1556, %r1557, %r1558, %r1559 }, { %r737, %r738 }, { %r1118, %r1119, %r1120, %r1121 };
|
|
mov.b32 %f392, %r1121;
|
|
mov.b32 %f393, %r1120;
|
|
mov.b32 %f394, %r1119;
|
|
mov.b32 %f395, %r1118;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1132, %r1133, %r1134, %r1135 }, { %r1556, %r1557, %r1558, %r1559 }, { %r755, %r756 }, { %r1132, %r1133, %r1134, %r1135 };
|
|
mov.b32 %f396, %r1135;
|
|
mov.b32 %f397, %r1134;
|
|
mov.b32 %f398, %r1133;
|
|
mov.b32 %f399, %r1132;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1146, %r1147, %r1148, %r1149 }, { %r1556, %r1557, %r1558, %r1559 }, { %r757, %r758 }, { %r1146, %r1147, %r1148, %r1149 };
|
|
mov.b32 %f400, %r1149;
|
|
mov.b32 %f401, %r1148;
|
|
mov.b32 %f402, %r1147;
|
|
mov.b32 %f403, %r1146;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1160, %r1161, %r1162, %r1163 }, { %r1556, %r1557, %r1558, %r1559 }, { %r775, %r776 }, { %r1160, %r1161, %r1162, %r1163 };
|
|
mov.b32 %f404, %r1163;
|
|
mov.b32 %f405, %r1162;
|
|
mov.b32 %f406, %r1161;
|
|
mov.b32 %f407, %r1160;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1174, %r1175, %r1176, %r1177 }, { %r1556, %r1557, %r1558, %r1559 }, { %r777, %r778 }, { %r1174, %r1175, %r1176, %r1177 };
|
|
mov.b32 %f408, %r1177;
|
|
mov.b32 %f409, %r1176;
|
|
mov.b32 %f410, %r1175;
|
|
mov.b32 %f411, %r1174;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1188, %r1189, %r1190, %r1191 }, { %r1556, %r1557, %r1558, %r1559 }, { %r795, %r796 }, { %r1188, %r1189, %r1190, %r1191 };
|
|
mov.b32 %f412, %r1191;
|
|
mov.b32 %f413, %r1190;
|
|
mov.b32 %f414, %r1189;
|
|
mov.b32 %f415, %r1188;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1202, %r1203, %r1204, %r1205 }, { %r1556, %r1557, %r1558, %r1559 }, { %r797, %r798 }, { %r1202, %r1203, %r1204, %r1205 };
|
|
mov.b32 %f416, %r1205;
|
|
mov.b32 %f417, %r1204;
|
|
mov.b32 %f418, %r1203;
|
|
mov.b32 %f419, %r1202;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1216, %r1217, %r1218, %r1219 }, { %r1556, %r1557, %r1558, %r1559 }, { %r815, %r816 }, { %r1216, %r1217, %r1218, %r1219 };
|
|
mov.b32 %f420, %r1219;
|
|
mov.b32 %f421, %r1218;
|
|
mov.b32 %f422, %r1217;
|
|
mov.b32 %f423, %r1216;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1230, %r1231, %r1232, %r1233 }, { %r1556, %r1557, %r1558, %r1559 }, { %r817, %r818 }, { %r1230, %r1231, %r1232, %r1233 };
|
|
mov.b32 %f424, %r1233;
|
|
mov.b32 %f425, %r1232;
|
|
mov.b32 %f426, %r1231;
|
|
mov.b32 %f427, %r1230;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1244, %r1245, %r1246, %r1247 }, { %r1556, %r1557, %r1558, %r1559 }, { %r835, %r836 }, { %r1244, %r1245, %r1246, %r1247 };
|
|
mov.b32 %f428, %r1247;
|
|
mov.b32 %f429, %r1246;
|
|
mov.b32 %f430, %r1245;
|
|
mov.b32 %f431, %r1244;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1258, %r1259, %r1260, %r1261 }, { %r1556, %r1557, %r1558, %r1559 }, { %r837, %r838 }, { %r1258, %r1259, %r1260, %r1261 };
|
|
mov.b32 %f432, %r1261;
|
|
mov.b32 %f433, %r1260;
|
|
mov.b32 %f434, %r1259;
|
|
mov.b32 %f435, %r1258;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1272, %r1273, %r1274, %r1275 }, { %r1556, %r1557, %r1558, %r1559 }, { %r855, %r856 }, { %r1272, %r1273, %r1274, %r1275 };
|
|
mov.b32 %f436, %r1275;
|
|
mov.b32 %f437, %r1274;
|
|
mov.b32 %f438, %r1273;
|
|
mov.b32 %f439, %r1272;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1286, %r1287, %r1288, %r1289 }, { %r1556, %r1557, %r1558, %r1559 }, { %r857, %r858 }, { %r1286, %r1287, %r1288, %r1289 };
|
|
mov.b32 %f440, %r1289;
|
|
mov.b32 %f441, %r1288;
|
|
mov.b32 %f442, %r1287;
|
|
mov.b32 %f443, %r1286;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1300, %r1301, %r1302, %r1303 }, { %r1556, %r1557, %r1558, %r1559 }, { %r875, %r876 }, { %r1300, %r1301, %r1302, %r1303 };
|
|
mov.b32 %f444, %r1303;
|
|
mov.b32 %f445, %r1302;
|
|
mov.b32 %f446, %r1301;
|
|
mov.b32 %f447, %r1300;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r1314, %r1315, %r1316, %r1317 }, { %r1556, %r1557, %r1558, %r1559 }, { %r877, %r878 }, { %r1314, %r1315, %r1316, %r1317 };
|
|
mov.b32 %f448, %r1317;
|
|
mov.b32 %f449, %r1316;
|
|
mov.b32 %f450, %r1315;
|
|
mov.b32 %f451, %r1314;
|
|
or.b32 %r6119, %r6176, %r17;
|
|
or.b32 %r6120, %r6176, %r18;
|
|
setp.lt.s32 %p37, %r6120, %r307;
|
|
setp.lt.s32 %p38, %r6120, %r308;
|
|
setp.lt.s32 %p39, %r6119, %r307;
|
|
setp.lt.s32 %p40, %r6119, %r308;
|
|
setp.lt.s32 %p41, %r6120, %r311;
|
|
setp.lt.s32 %p42, %r6120, %r312;
|
|
setp.lt.s32 %p43, %r6119, %r311;
|
|
setp.lt.s32 %p44, %r6119, %r312;
|
|
setp.lt.s32 %p45, %r6120, %r315;
|
|
setp.lt.s32 %p46, %r6120, %r316;
|
|
setp.lt.s32 %p47, %r6119, %r315;
|
|
setp.lt.s32 %p48, %r6119, %r316;
|
|
setp.lt.s32 %p49, %r6120, %r319;
|
|
setp.lt.s32 %p50, %r6120, %r320;
|
|
setp.lt.s32 %p51, %r6119, %r319;
|
|
setp.lt.s32 %p52, %r6119, %r320;
|
|
setp.lt.s32 %p53, %r6120, %r323;
|
|
setp.lt.s32 %p54, %r6120, %r324;
|
|
setp.lt.s32 %p55, %r6119, %r323;
|
|
setp.lt.s32 %p56, %r6119, %r324;
|
|
setp.lt.s32 %p57, %r6120, %r327;
|
|
setp.lt.s32 %p58, %r6120, %r328;
|
|
setp.lt.s32 %p59, %r6119, %r327;
|
|
setp.lt.s32 %p60, %r6119, %r328;
|
|
setp.lt.s32 %p61, %r6120, %r331;
|
|
setp.lt.s32 %p62, %r6120, %r332;
|
|
setp.lt.s32 %p63, %r6119, %r331;
|
|
setp.lt.s32 %p64, %r6119, %r332;
|
|
setp.lt.s32 %p65, %r6120, %r335;
|
|
setp.lt.s32 %p66, %r6120, %r336;
|
|
setp.lt.s32 %p67, %r6119, %r335;
|
|
setp.lt.s32 %p68, %r6119, %r336;
|
|
setp.lt.s32 %p69, %r6120, %r339;
|
|
setp.lt.s32 %p70, %r6120, %r340;
|
|
setp.lt.s32 %p71, %r6119, %r339;
|
|
setp.lt.s32 %p72, %r6119, %r340;
|
|
setp.lt.s32 %p73, %r6120, %r343;
|
|
setp.lt.s32 %p74, %r6120, %r344;
|
|
setp.lt.s32 %p75, %r6119, %r343;
|
|
setp.lt.s32 %p76, %r6119, %r344;
|
|
setp.lt.s32 %p77, %r6120, %r347;
|
|
setp.lt.s32 %p78, %r6120, %r348;
|
|
setp.lt.s32 %p79, %r6119, %r347;
|
|
setp.lt.s32 %p80, %r6119, %r348;
|
|
setp.lt.s32 %p81, %r6120, %r351;
|
|
setp.lt.s32 %p82, %r6120, %r352;
|
|
setp.lt.s32 %p83, %r6119, %r351;
|
|
setp.lt.s32 %p84, %r6119, %r352;
|
|
setp.lt.s32 %p85, %r6120, %r355;
|
|
setp.lt.s32 %p86, %r6120, %r356;
|
|
setp.lt.s32 %p87, %r6119, %r355;
|
|
setp.lt.s32 %p88, %r6119, %r356;
|
|
setp.lt.s32 %p89, %r6120, %r359;
|
|
setp.lt.s32 %p90, %r6120, %r360;
|
|
setp.lt.s32 %p91, %r6119, %r359;
|
|
setp.lt.s32 %p92, %r6119, %r360;
|
|
setp.lt.s32 %p93, %r6120, %r304;
|
|
setp.lt.s32 %p94, %r6120, %r303;
|
|
setp.lt.s32 %p95, %r6119, %r304;
|
|
setp.lt.s32 %p96, %r6119, %r303;
|
|
setp.lt.s32 %p97, %r6120, %r302;
|
|
setp.lt.s32 %p98, %r6120, %r301;
|
|
setp.lt.s32 %p99, %r6119, %r302;
|
|
setp.lt.s32 %p100, %r6119, %r301;
|
|
selp.f32 %f452, 0fFF800000, %f391, %p100;
|
|
selp.f32 %f453, 0fFF800000, %f390, %p99;
|
|
selp.f32 %f454, 0fFF800000, %f389, %p98;
|
|
selp.f32 %f455, 0fFF800000, %f388, %p97;
|
|
selp.f32 %f456, 0fFF800000, %f395, %p96;
|
|
selp.f32 %f457, 0fFF800000, %f394, %p95;
|
|
selp.f32 %f458, 0fFF800000, %f393, %p94;
|
|
selp.f32 %f459, 0fFF800000, %f392, %p93;
|
|
selp.f32 %f460, 0fFF800000, %f399, %p92;
|
|
selp.f32 %f461, 0fFF800000, %f398, %p91;
|
|
selp.f32 %f462, 0fFF800000, %f397, %p90;
|
|
selp.f32 %f463, 0fFF800000, %f396, %p89;
|
|
selp.f32 %f464, 0fFF800000, %f403, %p88;
|
|
selp.f32 %f465, 0fFF800000, %f402, %p87;
|
|
selp.f32 %f466, 0fFF800000, %f401, %p86;
|
|
selp.f32 %f467, 0fFF800000, %f400, %p85;
|
|
selp.f32 %f468, 0fFF800000, %f407, %p84;
|
|
selp.f32 %f469, 0fFF800000, %f406, %p83;
|
|
selp.f32 %f470, 0fFF800000, %f405, %p82;
|
|
selp.f32 %f471, 0fFF800000, %f404, %p81;
|
|
selp.f32 %f472, 0fFF800000, %f411, %p80;
|
|
selp.f32 %f473, 0fFF800000, %f410, %p79;
|
|
selp.f32 %f474, 0fFF800000, %f409, %p78;
|
|
selp.f32 %f475, 0fFF800000, %f408, %p77;
|
|
selp.f32 %f476, 0fFF800000, %f415, %p76;
|
|
selp.f32 %f477, 0fFF800000, %f414, %p75;
|
|
selp.f32 %f478, 0fFF800000, %f413, %p74;
|
|
selp.f32 %f479, 0fFF800000, %f412, %p73;
|
|
selp.f32 %f480, 0fFF800000, %f419, %p72;
|
|
selp.f32 %f481, 0fFF800000, %f418, %p71;
|
|
selp.f32 %f482, 0fFF800000, %f417, %p70;
|
|
selp.f32 %f483, 0fFF800000, %f416, %p69;
|
|
selp.f32 %f484, 0fFF800000, %f423, %p68;
|
|
selp.f32 %f485, 0fFF800000, %f422, %p67;
|
|
selp.f32 %f486, 0fFF800000, %f421, %p66;
|
|
selp.f32 %f487, 0fFF800000, %f420, %p65;
|
|
selp.f32 %f488, 0fFF800000, %f427, %p64;
|
|
selp.f32 %f489, 0fFF800000, %f426, %p63;
|
|
selp.f32 %f490, 0fFF800000, %f425, %p62;
|
|
selp.f32 %f491, 0fFF800000, %f424, %p61;
|
|
selp.f32 %f492, 0fFF800000, %f431, %p60;
|
|
selp.f32 %f493, 0fFF800000, %f430, %p59;
|
|
selp.f32 %f494, 0fFF800000, %f429, %p58;
|
|
selp.f32 %f495, 0fFF800000, %f428, %p57;
|
|
selp.f32 %f496, 0fFF800000, %f435, %p56;
|
|
selp.f32 %f497, 0fFF800000, %f434, %p55;
|
|
selp.f32 %f498, 0fFF800000, %f433, %p54;
|
|
selp.f32 %f499, 0fFF800000, %f432, %p53;
|
|
selp.f32 %f500, 0fFF800000, %f439, %p52;
|
|
selp.f32 %f501, 0fFF800000, %f438, %p51;
|
|
selp.f32 %f502, 0fFF800000, %f437, %p50;
|
|
selp.f32 %f503, 0fFF800000, %f436, %p49;
|
|
selp.f32 %f504, 0fFF800000, %f443, %p48;
|
|
selp.f32 %f505, 0fFF800000, %f442, %p47;
|
|
selp.f32 %f506, 0fFF800000, %f441, %p46;
|
|
selp.f32 %f507, 0fFF800000, %f440, %p45;
|
|
selp.f32 %f508, 0fFF800000, %f447, %p44;
|
|
selp.f32 %f509, 0fFF800000, %f446, %p43;
|
|
selp.f32 %f510, 0fFF800000, %f445, %p42;
|
|
selp.f32 %f511, 0fFF800000, %f444, %p41;
|
|
selp.f32 %f512, 0fFF800000, %f451, %p40;
|
|
selp.f32 %f513, 0fFF800000, %f450, %p39;
|
|
selp.f32 %f514, 0fFF800000, %f449, %p38;
|
|
selp.f32 %f515, 0fFF800000, %f448, %p37;
|
|
mul.wide.s32 %rd112, %r6102, 4;
|
|
add.s64 %rd90, %rd8, %rd112;
|
|
@%p102 ld.global.b32 { %r1776 }, [ %rd90 + 0 ];
|
|
st.shared.u32 [%r72], %r1776;
|
|
bar.sync 0;
|
|
ld.shared.f32 %f516, [%r73];
|
|
ld.shared.f32 %f517, [%r74+32];
|
|
neg.f32 %f518, %f516;
|
|
fma.rn.f32 %f519, %f452, %f195, %f518;
|
|
fma.rn.f32 %f520, %f453, %f195, %f518;
|
|
neg.f32 %f521, %f517;
|
|
fma.rn.f32 %f522, %f454, %f195, %f521;
|
|
fma.rn.f32 %f523, %f455, %f195, %f521;
|
|
fma.rn.f32 %f524, %f456, %f195, %f518;
|
|
fma.rn.f32 %f525, %f457, %f195, %f518;
|
|
fma.rn.f32 %f526, %f458, %f195, %f521;
|
|
fma.rn.f32 %f527, %f459, %f195, %f521;
|
|
fma.rn.f32 %f528, %f460, %f195, %f518;
|
|
fma.rn.f32 %f529, %f461, %f195, %f518;
|
|
fma.rn.f32 %f530, %f462, %f195, %f521;
|
|
fma.rn.f32 %f531, %f463, %f195, %f521;
|
|
fma.rn.f32 %f532, %f464, %f195, %f518;
|
|
fma.rn.f32 %f533, %f465, %f195, %f518;
|
|
fma.rn.f32 %f534, %f466, %f195, %f521;
|
|
fma.rn.f32 %f535, %f467, %f195, %f521;
|
|
fma.rn.f32 %f536, %f468, %f195, %f518;
|
|
fma.rn.f32 %f537, %f469, %f195, %f518;
|
|
fma.rn.f32 %f538, %f470, %f195, %f521;
|
|
fma.rn.f32 %f539, %f471, %f195, %f521;
|
|
fma.rn.f32 %f540, %f472, %f195, %f518;
|
|
fma.rn.f32 %f541, %f473, %f195, %f518;
|
|
fma.rn.f32 %f542, %f474, %f195, %f521;
|
|
fma.rn.f32 %f543, %f475, %f195, %f521;
|
|
fma.rn.f32 %f544, %f476, %f195, %f518;
|
|
fma.rn.f32 %f545, %f477, %f195, %f518;
|
|
fma.rn.f32 %f546, %f478, %f195, %f521;
|
|
fma.rn.f32 %f547, %f479, %f195, %f521;
|
|
fma.rn.f32 %f548, %f480, %f195, %f518;
|
|
fma.rn.f32 %f549, %f481, %f195, %f518;
|
|
fma.rn.f32 %f550, %f482, %f195, %f521;
|
|
fma.rn.f32 %f551, %f483, %f195, %f521;
|
|
fma.rn.f32 %f552, %f484, %f195, %f518;
|
|
fma.rn.f32 %f553, %f485, %f195, %f518;
|
|
fma.rn.f32 %f554, %f486, %f195, %f521;
|
|
fma.rn.f32 %f555, %f487, %f195, %f521;
|
|
fma.rn.f32 %f556, %f488, %f195, %f518;
|
|
fma.rn.f32 %f557, %f489, %f195, %f518;
|
|
fma.rn.f32 %f558, %f490, %f195, %f521;
|
|
fma.rn.f32 %f559, %f491, %f195, %f521;
|
|
fma.rn.f32 %f560, %f492, %f195, %f518;
|
|
fma.rn.f32 %f561, %f493, %f195, %f518;
|
|
fma.rn.f32 %f562, %f494, %f195, %f521;
|
|
fma.rn.f32 %f563, %f495, %f195, %f521;
|
|
fma.rn.f32 %f564, %f496, %f195, %f518;
|
|
fma.rn.f32 %f565, %f497, %f195, %f518;
|
|
fma.rn.f32 %f566, %f498, %f195, %f521;
|
|
fma.rn.f32 %f567, %f499, %f195, %f521;
|
|
fma.rn.f32 %f568, %f500, %f195, %f518;
|
|
fma.rn.f32 %f569, %f501, %f195, %f518;
|
|
fma.rn.f32 %f570, %f502, %f195, %f521;
|
|
fma.rn.f32 %f571, %f503, %f195, %f521;
|
|
fma.rn.f32 %f572, %f504, %f195, %f518;
|
|
fma.rn.f32 %f573, %f505, %f195, %f518;
|
|
fma.rn.f32 %f574, %f506, %f195, %f521;
|
|
fma.rn.f32 %f575, %f507, %f195, %f521;
|
|
fma.rn.f32 %f576, %f508, %f195, %f518;
|
|
fma.rn.f32 %f577, %f509, %f195, %f518;
|
|
fma.rn.f32 %f578, %f510, %f195, %f521;
|
|
fma.rn.f32 %f579, %f511, %f195, %f521;
|
|
fma.rn.f32 %f580, %f512, %f195, %f518;
|
|
fma.rn.f32 %f581, %f513, %f195, %f518;
|
|
fma.rn.f32 %f582, %f514, %f195, %f521;
|
|
fma.rn.f32 %f583, %f515, %f195, %f521;
|
|
mul.f32 %f261, %f519, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f260, %f261;
|
|
mul.f32 %f263, %f520, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f262, %f263;
|
|
mul.f32 %f265, %f522, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f264, %f265;
|
|
mul.f32 %f267, %f523, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f266, %f267;
|
|
mul.f32 %f269, %f524, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f268, %f269;
|
|
mul.f32 %f271, %f525, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f270, %f271;
|
|
mul.f32 %f273, %f526, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f272, %f273;
|
|
mul.f32 %f275, %f527, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f274, %f275;
|
|
mul.f32 %f277, %f528, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f276, %f277;
|
|
mul.f32 %f279, %f529, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f278, %f279;
|
|
mul.f32 %f281, %f530, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f280, %f281;
|
|
mul.f32 %f283, %f531, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f282, %f283;
|
|
mul.f32 %f285, %f532, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f284, %f285;
|
|
mul.f32 %f287, %f533, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f286, %f287;
|
|
mul.f32 %f289, %f534, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f288, %f289;
|
|
mul.f32 %f291, %f535, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f290, %f291;
|
|
mul.f32 %f293, %f536, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f292, %f293;
|
|
mul.f32 %f295, %f537, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f294, %f295;
|
|
mul.f32 %f297, %f538, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f296, %f297;
|
|
mul.f32 %f299, %f539, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f298, %f299;
|
|
mul.f32 %f301, %f540, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f300, %f301;
|
|
mul.f32 %f303, %f541, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f302, %f303;
|
|
mul.f32 %f305, %f542, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f304, %f305;
|
|
mul.f32 %f307, %f543, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f306, %f307;
|
|
mul.f32 %f309, %f544, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f308, %f309;
|
|
mul.f32 %f311, %f545, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f310, %f311;
|
|
mul.f32 %f313, %f546, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f312, %f313;
|
|
mul.f32 %f315, %f547, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f314, %f315;
|
|
mul.f32 %f317, %f548, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f316, %f317;
|
|
mul.f32 %f319, %f549, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f318, %f319;
|
|
mul.f32 %f321, %f550, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f320, %f321;
|
|
mul.f32 %f323, %f551, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f322, %f323;
|
|
mul.f32 %f325, %f552, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f324, %f325;
|
|
mul.f32 %f327, %f553, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f326, %f327;
|
|
mul.f32 %f329, %f554, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f328, %f329;
|
|
mul.f32 %f331, %f555, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f330, %f331;
|
|
mul.f32 %f333, %f556, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f332, %f333;
|
|
mul.f32 %f335, %f557, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f334, %f335;
|
|
mul.f32 %f337, %f558, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f336, %f337;
|
|
mul.f32 %f339, %f559, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f338, %f339;
|
|
mul.f32 %f341, %f560, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f340, %f341;
|
|
mul.f32 %f343, %f561, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f342, %f343;
|
|
mul.f32 %f345, %f562, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f344, %f345;
|
|
mul.f32 %f347, %f563, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f346, %f347;
|
|
mul.f32 %f349, %f564, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f348, %f349;
|
|
mul.f32 %f351, %f565, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f350, %f351;
|
|
mul.f32 %f353, %f566, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f352, %f353;
|
|
mul.f32 %f355, %f567, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f354, %f355;
|
|
mul.f32 %f357, %f568, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f356, %f357;
|
|
mul.f32 %f359, %f569, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f358, %f359;
|
|
mul.f32 %f361, %f570, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f360, %f361;
|
|
mul.f32 %f363, %f571, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f362, %f363;
|
|
mul.f32 %f365, %f572, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f364, %f365;
|
|
mul.f32 %f367, %f573, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f366, %f367;
|
|
mul.f32 %f369, %f574, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f368, %f369;
|
|
mul.f32 %f371, %f575, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f370, %f371;
|
|
mul.f32 %f373, %f576, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f372, %f373;
|
|
mul.f32 %f375, %f577, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f374, %f375;
|
|
mul.f32 %f377, %f578, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f376, %f377;
|
|
mul.f32 %f379, %f579, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f378, %f379;
|
|
mul.f32 %f381, %f580, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f380, %f381;
|
|
mul.f32 %f383, %f581, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f382, %f383;
|
|
mul.f32 %f385, %f582, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f384, %f385;
|
|
mul.f32 %f387, %f583, 0f3FB8AA3B;
|
|
ex2.approx.f32 %f386, %f387;
|
|
@%p102 ld.global.v4.b32 { %r6121, %r6122, %r6123, %r6124 }, [ %rd91 + 0 ];
|
|
mov.b32 %hh49, %r6121;
|
|
mov.b32 %hh50, %r6122;
|
|
mov.b32 %hh51, %r6123;
|
|
mov.b32 %hh52, %r6124;
|
|
@%p102 ld.global.v4.b32 { %r6125, %r6126, %r6127, %r6128 }, [ %rd92 + 0 ];
|
|
mov.b32 %hh53, %r6125;
|
|
mov.b32 %hh54, %r6126;
|
|
mov.b32 %hh55, %r6127;
|
|
mov.b32 %hh56, %r6128;
|
|
@%p102 ld.global.v4.b32 { %r6129, %r6130, %r6131, %r6132 }, [ %rd93 + 0 ];
|
|
mov.b32 %hh57, %r6129;
|
|
mov.b32 %hh58, %r6130;
|
|
mov.b32 %hh59, %r6131;
|
|
mov.b32 %hh60, %r6132;
|
|
@%p102 ld.global.v4.b32 { %r6133, %r6134, %r6135, %r6136 }, [ %rd94 + 0 ];
|
|
mov.b32 %hh61, %r6133;
|
|
mov.b32 %hh62, %r6134;
|
|
mov.b32 %hh63, %r6135;
|
|
mov.b32 %hh64, %r6136;
|
|
bar.sync 0;
|
|
st.shared.v4.b32 [%r75], {%r6121, %r6122, %r6123, %r6124};
|
|
st.shared.v4.b32 [%r76], {%r6125, %r6126, %r6127, %r6128};
|
|
st.shared.v4.b32 [%r77], {%r6129, %r6130, %r6131, %r6132};
|
|
st.shared.v4.b32 [%r78], {%r6133, %r6134, %r6135, %r6136};
|
|
cvt.rn.f16.f32 %h1, %f262;
|
|
cvt.rn.f16.f32 %h2, %f260;
|
|
cvt.rn.f16.f32 %h3, %f266;
|
|
cvt.rn.f16.f32 %h4, %f264;
|
|
cvt.rn.f16.f32 %h5, %f270;
|
|
cvt.rn.f16.f32 %h6, %f268;
|
|
cvt.rn.f16.f32 %h7, %f274;
|
|
cvt.rn.f16.f32 %h8, %f272;
|
|
cvt.rn.f16.f32 %h9, %f278;
|
|
cvt.rn.f16.f32 %h10, %f276;
|
|
cvt.rn.f16.f32 %h11, %f282;
|
|
cvt.rn.f16.f32 %h12, %f280;
|
|
cvt.rn.f16.f32 %h13, %f286;
|
|
cvt.rn.f16.f32 %h14, %f284;
|
|
cvt.rn.f16.f32 %h15, %f290;
|
|
cvt.rn.f16.f32 %h16, %f288;
|
|
cvt.rn.f16.f32 %h17, %f294;
|
|
cvt.rn.f16.f32 %h18, %f292;
|
|
cvt.rn.f16.f32 %h19, %f298;
|
|
cvt.rn.f16.f32 %h20, %f296;
|
|
cvt.rn.f16.f32 %h21, %f302;
|
|
cvt.rn.f16.f32 %h22, %f300;
|
|
cvt.rn.f16.f32 %h23, %f306;
|
|
cvt.rn.f16.f32 %h24, %f304;
|
|
cvt.rn.f16.f32 %h25, %f310;
|
|
cvt.rn.f16.f32 %h26, %f308;
|
|
cvt.rn.f16.f32 %h27, %f314;
|
|
cvt.rn.f16.f32 %h28, %f312;
|
|
cvt.rn.f16.f32 %h29, %f318;
|
|
cvt.rn.f16.f32 %h30, %f316;
|
|
cvt.rn.f16.f32 %h31, %f322;
|
|
cvt.rn.f16.f32 %h32, %f320;
|
|
cvt.rn.f16.f32 %h33, %f326;
|
|
cvt.rn.f16.f32 %h34, %f324;
|
|
cvt.rn.f16.f32 %h35, %f330;
|
|
cvt.rn.f16.f32 %h36, %f328;
|
|
cvt.rn.f16.f32 %h37, %f334;
|
|
cvt.rn.f16.f32 %h38, %f332;
|
|
cvt.rn.f16.f32 %h39, %f338;
|
|
cvt.rn.f16.f32 %h40, %f336;
|
|
cvt.rn.f16.f32 %h41, %f342;
|
|
cvt.rn.f16.f32 %h42, %f340;
|
|
cvt.rn.f16.f32 %h43, %f346;
|
|
cvt.rn.f16.f32 %h44, %f344;
|
|
cvt.rn.f16.f32 %h45, %f350;
|
|
cvt.rn.f16.f32 %h46, %f348;
|
|
cvt.rn.f16.f32 %h47, %f354;
|
|
cvt.rn.f16.f32 %h48, %f352;
|
|
cvt.rn.f16.f32 %h49, %f358;
|
|
cvt.rn.f16.f32 %h50, %f356;
|
|
cvt.rn.f16.f32 %h51, %f362;
|
|
cvt.rn.f16.f32 %h52, %f360;
|
|
cvt.rn.f16.f32 %h53, %f366;
|
|
cvt.rn.f16.f32 %h54, %f364;
|
|
cvt.rn.f16.f32 %h55, %f370;
|
|
cvt.rn.f16.f32 %h56, %f368;
|
|
cvt.rn.f16.f32 %h57, %f374;
|
|
cvt.rn.f16.f32 %h58, %f372;
|
|
cvt.rn.f16.f32 %h59, %f378;
|
|
cvt.rn.f16.f32 %h60, %f376;
|
|
cvt.rn.f16.f32 %h61, %f382;
|
|
cvt.rn.f16.f32 %h62, %f380;
|
|
cvt.rn.f16.f32 %h63, %f386;
|
|
cvt.rn.f16.f32 %h64, %f384;
|
|
st.shared.v2.b16 [%r79], {%h2, %h1};
|
|
st.shared.v2.b16 [%r80], {%h4, %h3};
|
|
st.shared.v2.b16 [%r81], {%h6, %h5};
|
|
st.shared.v2.b16 [%r82], {%h8, %h7};
|
|
st.shared.v2.b16 [%r83], {%h10, %h9};
|
|
st.shared.v2.b16 [%r84], {%h12, %h11};
|
|
st.shared.v2.b16 [%r85], {%h14, %h13};
|
|
st.shared.v2.b16 [%r86], {%h16, %h15};
|
|
st.shared.v2.b16 [%r87], {%h18, %h17};
|
|
st.shared.v2.b16 [%r88], {%h20, %h19};
|
|
st.shared.v2.b16 [%r89], {%h22, %h21};
|
|
st.shared.v2.b16 [%r90], {%h24, %h23};
|
|
st.shared.v2.b16 [%r91], {%h26, %h25};
|
|
st.shared.v2.b16 [%r92], {%h28, %h27};
|
|
st.shared.v2.b16 [%r93], {%h30, %h29};
|
|
st.shared.v2.b16 [%r94], {%h32, %h31};
|
|
st.shared.v2.b16 [%r79+128], {%h34, %h33};
|
|
st.shared.v2.b16 [%r80+128], {%h36, %h35};
|
|
st.shared.v2.b16 [%r97], {%h38, %h37};
|
|
st.shared.v2.b16 [%r98], {%h40, %h39};
|
|
st.shared.v2.b16 [%r99], {%h42, %h41};
|
|
st.shared.v2.b16 [%r100], {%h44, %h43};
|
|
st.shared.v2.b16 [%r101], {%h46, %h45};
|
|
st.shared.v2.b16 [%r102], {%h48, %h47};
|
|
st.shared.v2.b16 [%r103], {%h50, %h49};
|
|
st.shared.v2.b16 [%r104], {%h52, %h51};
|
|
st.shared.v2.b16 [%r105], {%h54, %h53};
|
|
st.shared.v2.b16 [%r106], {%h56, %h55};
|
|
st.shared.v2.b16 [%r107], {%h58, %h57};
|
|
st.shared.v2.b16 [%r108], {%h60, %h59};
|
|
st.shared.v2.b16 [%r109], {%h62, %h61};
|
|
st.shared.v2.b16 [%r110], {%h64, %h63};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1957, %r1958, %r1959, %r1960 }, [ %r1797 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2069, %r2070, %r2071, %r2072 }, [ %r1802 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2181, %r2182, %r2183, %r2184 }, [ %r1807 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2293, %r2294, %r2295, %r2296 }, [ %r1812 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2405, %r2406, %r2407, %r2408 }, [ %r1817 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2517, %r2518, %r2519, %r2520 }, [ %r1822 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2629, %r2630, %r2631, %r2632 }, [ %r1827 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2741, %r2742, %r2743, %r2744 }, [ %r1832 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2013, %r2014, %r2015, %r2016 }, [ %r1837 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2125, %r2126, %r2127, %r2128 }, [ %r1842 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2237, %r2238, %r2239, %r2240 }, [ %r1847 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2349, %r2350, %r2351, %r2352 }, [ %r1852 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2461, %r2462, %r2463, %r2464 }, [ %r1857 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2573, %r2574, %r2575, %r2576 }, [ %r1862 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2685, %r2686, %r2687, %r2688 }, [ %r1867 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2797, %r2798, %r2799, %r2800 }, [ %r1872 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1961, %r1962, %r1975, %r1976 }, [ %r1877 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2073, %r2074, %r2087, %r2088 }, [ %r1882 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2185, %r2186, %r2199, %r2200 }, [ %r1887 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2297, %r2298, %r2311, %r2312 }, [ %r1892 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2409, %r2410, %r2423, %r2424 }, [ %r1897 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2521, %r2522, %r2535, %r2536 }, [ %r1902 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2633, %r2634, %r2647, %r2648 }, [ %r1907 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2745, %r2746, %r2759, %r2760 }, [ %r1912 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r1989, %r1990, %r2003, %r2004 }, [ %r1917 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2101, %r2102, %r2115, %r2116 }, [ %r1922 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2213, %r2214, %r2227, %r2228 }, [ %r1927 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2325, %r2326, %r2339, %r2340 }, [ %r1932 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2437, %r2438, %r2451, %r2452 }, [ %r1937 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2549, %r2550, %r2563, %r2564 }, [ %r1942 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2661, %r2662, %r2675, %r2676 }, [ %r1947 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r2773, %r2774, %r2787, %r2788 }, [ %r1952 + 0 ];
|
|
mov.b32 %r2065, %f909;
|
|
mov.b32 %r2066, %f910;
|
|
mov.b32 %r2067, %f911;
|
|
mov.b32 %r2068, %f912;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1961, %r1962 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mov.b32 %r2079, %f913;
|
|
mov.b32 %r2080, %f914;
|
|
mov.b32 %r2081, %f915;
|
|
mov.b32 %r2082, %f916;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1975, %r1976 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mov.b32 %r2093, %f917;
|
|
mov.b32 %r2094, %f918;
|
|
mov.b32 %r2095, %f919;
|
|
mov.b32 %r2096, %f920;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r1957, %r1958, %r1959, %r1960 }, { %r1989, %r1990 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mov.b32 %r2107, %f921;
|
|
mov.b32 %r2108, %f922;
|
|
mov.b32 %r2109, %f923;
|
|
mov.b32 %r2110, %f924;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r1957, %r1958, %r1959, %r1960 }, { %r2003, %r2004 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mov.b32 %r2121, %f925;
|
|
mov.b32 %r2122, %f926;
|
|
mov.b32 %r2123, %f927;
|
|
mov.b32 %r2124, %f928;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1961, %r1962 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mov.b32 %r2135, %f929;
|
|
mov.b32 %r2136, %f930;
|
|
mov.b32 %r2137, %f931;
|
|
mov.b32 %r2138, %f932;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1975, %r1976 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mov.b32 %r2149, %f933;
|
|
mov.b32 %r2150, %f934;
|
|
mov.b32 %r2151, %f935;
|
|
mov.b32 %r2152, %f936;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2013, %r2014, %r2015, %r2016 }, { %r1989, %r1990 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mov.b32 %r2163, %f937;
|
|
mov.b32 %r2164, %f938;
|
|
mov.b32 %r2165, %f939;
|
|
mov.b32 %r2166, %f940;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2013, %r2014, %r2015, %r2016 }, { %r2003, %r2004 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2073, %r2074 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2087, %r2088 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2101, %r2102 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2069, %r2070, %r2071, %r2072 }, { %r2115, %r2116 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2073, %r2074 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2087, %r2088 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2101, %r2102 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2125, %r2126, %r2127, %r2128 }, { %r2115, %r2116 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2185, %r2186 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2199, %r2200 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2213, %r2214 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2181, %r2182, %r2183, %r2184 }, { %r2227, %r2228 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2185, %r2186 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2199, %r2200 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2213, %r2214 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2237, %r2238, %r2239, %r2240 }, { %r2227, %r2228 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2297, %r2298 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2311, %r2312 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2325, %r2326 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2293, %r2294, %r2295, %r2296 }, { %r2339, %r2340 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2297, %r2298 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2311, %r2312 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2325, %r2326 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2349, %r2350, %r2351, %r2352 }, { %r2339, %r2340 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2409, %r2410 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2423, %r2424 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2437, %r2438 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2405, %r2406, %r2407, %r2408 }, { %r2451, %r2452 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2409, %r2410 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2423, %r2424 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2437, %r2438 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2461, %r2462, %r2463, %r2464 }, { %r2451, %r2452 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2521, %r2522 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2535, %r2536 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2549, %r2550 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2517, %r2518, %r2519, %r2520 }, { %r2563, %r2564 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2521, %r2522 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2535, %r2536 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2549, %r2550 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2573, %r2574, %r2575, %r2576 }, { %r2563, %r2564 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2633, %r2634 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2647, %r2648 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2661, %r2662 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2629, %r2630, %r2631, %r2632 }, { %r2675, %r2676 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2633, %r2634 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2647, %r2648 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2661, %r2662 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2685, %r2686, %r2687, %r2688 }, { %r2675, %r2676 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2065, %r2066, %r2067, %r2068 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2745, %r2746 }, { %r2065, %r2066, %r2067, %r2068 };
|
|
mov.b32 %f912, %r2068;
|
|
mov.b32 %f911, %r2067;
|
|
mov.b32 %f910, %r2066;
|
|
mov.b32 %f909, %r2065;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2079, %r2080, %r2081, %r2082 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2759, %r2760 }, { %r2079, %r2080, %r2081, %r2082 };
|
|
mov.b32 %f916, %r2082;
|
|
mov.b32 %f915, %r2081;
|
|
mov.b32 %f914, %r2080;
|
|
mov.b32 %f913, %r2079;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2093, %r2094, %r2095, %r2096 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2773, %r2774 }, { %r2093, %r2094, %r2095, %r2096 };
|
|
mov.b32 %f920, %r2096;
|
|
mov.b32 %f919, %r2095;
|
|
mov.b32 %f918, %r2094;
|
|
mov.b32 %f917, %r2093;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2107, %r2108, %r2109, %r2110 }, { %r2741, %r2742, %r2743, %r2744 }, { %r2787, %r2788 }, { %r2107, %r2108, %r2109, %r2110 };
|
|
mov.b32 %f924, %r2110;
|
|
mov.b32 %f923, %r2109;
|
|
mov.b32 %f922, %r2108;
|
|
mov.b32 %f921, %r2107;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2121, %r2122, %r2123, %r2124 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2745, %r2746 }, { %r2121, %r2122, %r2123, %r2124 };
|
|
mov.b32 %f928, %r2124;
|
|
mov.b32 %f927, %r2123;
|
|
mov.b32 %f926, %r2122;
|
|
mov.b32 %f925, %r2121;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2135, %r2136, %r2137, %r2138 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2759, %r2760 }, { %r2135, %r2136, %r2137, %r2138 };
|
|
mov.b32 %f932, %r2138;
|
|
mov.b32 %f931, %r2137;
|
|
mov.b32 %f930, %r2136;
|
|
mov.b32 %f929, %r2135;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2149, %r2150, %r2151, %r2152 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2773, %r2774 }, { %r2149, %r2150, %r2151, %r2152 };
|
|
mov.b32 %f936, %r2152;
|
|
mov.b32 %f935, %r2151;
|
|
mov.b32 %f934, %r2150;
|
|
mov.b32 %f933, %r2149;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r2163, %r2164, %r2165, %r2166 }, { %r2797, %r2798, %r2799, %r2800 }, { %r2787, %r2788 }, { %r2163, %r2164, %r2165, %r2166 };
|
|
mov.b32 %f940, %r2166;
|
|
mov.b32 %f939, %r2165;
|
|
mov.b32 %f938, %r2164;
|
|
mov.b32 %f937, %r2163;
|
|
add.s64 %rd95, %rd7, %rd112;
|
|
@%p102 ld.global.b32 { %r2849 }, [ %rd95 + 0 ];
|
|
bar.sync 0;
|
|
st.shared.u32 [%r143], %r2849;
|
|
bar.sync 0;
|
|
ld.shared.f32 %f584, [%r144];
|
|
ld.shared.f32 %f585, [%r145+32];
|
|
sub.f32 %f587, %f259, %f584;
|
|
sub.f32 %f588, %f259, %f585;
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3034, %r3035, %r3036, %r3037 }, [ %r2854 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3258, %r3259, %r3260, %r3261 }, [ %r2859 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3482, %r3483, %r3484, %r3485 }, [ %r2864 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3706, %r3707, %r3708, %r3709 }, [ %r2869 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2870, %r2871, %r2872, %r2873 }, [ %r2874 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2875, %r2876, %r2877, %r2878 }, [ %r2879 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2880, %r2881, %r2882, %r2883 }, [ %r2884 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2885, %r2886, %r2887, %r2888 }, [ %r2889 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2890, %r2891, %r2892, %r2893 }, [ %r2894 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2895, %r2896, %r2897, %r2898 }, [ %r2899 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2900, %r2901, %r2902, %r2903 }, [ %r2904 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2905, %r2906, %r2907, %r2908 }, [ %r2909 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2910, %r2911, %r2912, %r2913 }, [ %r2914 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2915, %r2916, %r2917, %r2918 }, [ %r2919 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2920, %r2921, %r2922, %r2923 }, [ %r2924 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2925, %r2926, %r2927, %r2928 }, [ %r2929 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2930, %r2931, %r2932, %r2933 }, [ %r2934 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2935, %r2936, %r2937, %r2938 }, [ %r2939 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2940, %r2941, %r2942, %r2943 }, [ %r2944 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2945, %r2946, %r2947, %r2948 }, [ %r2949 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2950, %r2951, %r2952, %r2953 }, [ %r2954 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2955, %r2956, %r2957, %r2958 }, [ %r2959 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2960, %r2961, %r2962, %r2963 }, [ %r2964 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2965, %r2966, %r2967, %r2968 }, [ %r2969 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2970, %r2971, %r2972, %r2973 }, [ %r2974 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2975, %r2976, %r2977, %r2978 }, [ %r2979 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2980, %r2981, %r2982, %r2983 }, [ %r2984 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2985, %r2986, %r2987, %r2988 }, [ %r2989 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2990, %r2991, %r2992, %r2993 }, [ %r2994 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r2995, %r2996, %r2997, %r2998 }, [ %r2999 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3000, %r3001, %r3002, %r3003 }, [ %r3004 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3005, %r3006, %r3007, %r3008 }, [ %r3009 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3010, %r3011, %r3012, %r3013 }, [ %r3014 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3015, %r3016, %r3017, %r3018 }, [ %r3019 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3020, %r3021, %r3022, %r3023 }, [ %r3024 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r3025, %r3026, %r3027, %r3028 }, [ %r3029 + 0 ];
|
|
mov.b32 %r3465, %f587;
|
|
mov.b32 %r3467, %f588;
|
|
mov.u32 %r3254, %r3465;
|
|
mov.u32 %r3255, %r3465;
|
|
mov.u32 %r3256, %r3467;
|
|
mov.u32 %r3257, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2870, %r2871 }, { %r3254, %r3255, %r3256, %r3257 };
|
|
mov.u32 %r3268, %r3465;
|
|
mov.u32 %r3269, %r3465;
|
|
mov.u32 %r3270, %r3467;
|
|
mov.u32 %r3271, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2872, %r2873 }, { %r3268, %r3269, %r3270, %r3271 };
|
|
mov.u32 %r3282, %r3465;
|
|
mov.u32 %r3283, %r3465;
|
|
mov.u32 %r3284, %r3467;
|
|
mov.u32 %r3285, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2890, %r2891 }, { %r3282, %r3283, %r3284, %r3285 };
|
|
mov.u32 %r3296, %r3465;
|
|
mov.u32 %r3297, %r3465;
|
|
mov.u32 %r3298, %r3467;
|
|
mov.u32 %r3299, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2892, %r2893 }, { %r3296, %r3297, %r3298, %r3299 };
|
|
mov.u32 %r3310, %r3465;
|
|
mov.u32 %r3311, %r3465;
|
|
mov.u32 %r3312, %r3467;
|
|
mov.u32 %r3313, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2910, %r2911 }, { %r3310, %r3311, %r3312, %r3313 };
|
|
mov.u32 %r3324, %r3465;
|
|
mov.u32 %r3325, %r3465;
|
|
mov.u32 %r3326, %r3467;
|
|
mov.u32 %r3327, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2912, %r2913 }, { %r3324, %r3325, %r3326, %r3327 };
|
|
mov.u32 %r3338, %r3465;
|
|
mov.u32 %r3339, %r3465;
|
|
mov.u32 %r3340, %r3467;
|
|
mov.u32 %r3341, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2930, %r2931 }, { %r3338, %r3339, %r3340, %r3341 };
|
|
mov.u32 %r3352, %r3465;
|
|
mov.u32 %r3353, %r3465;
|
|
mov.u32 %r3354, %r3467;
|
|
mov.u32 %r3355, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2932, %r2933 }, { %r3352, %r3353, %r3354, %r3355 };
|
|
mov.u32 %r3366, %r3465;
|
|
mov.u32 %r3367, %r3465;
|
|
mov.u32 %r3368, %r3467;
|
|
mov.u32 %r3369, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2950, %r2951 }, { %r3366, %r3367, %r3368, %r3369 };
|
|
mov.u32 %r3380, %r3465;
|
|
mov.u32 %r3381, %r3465;
|
|
mov.u32 %r3382, %r3467;
|
|
mov.u32 %r3383, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2952, %r2953 }, { %r3380, %r3381, %r3382, %r3383 };
|
|
mov.u32 %r3394, %r3465;
|
|
mov.u32 %r3395, %r3465;
|
|
mov.u32 %r3396, %r3467;
|
|
mov.u32 %r3397, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2970, %r2971 }, { %r3394, %r3395, %r3396, %r3397 };
|
|
mov.u32 %r3408, %r3465;
|
|
mov.u32 %r3409, %r3465;
|
|
mov.u32 %r3410, %r3467;
|
|
mov.u32 %r3411, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2972, %r2973 }, { %r3408, %r3409, %r3410, %r3411 };
|
|
mov.u32 %r3422, %r3465;
|
|
mov.u32 %r3423, %r3465;
|
|
mov.u32 %r3424, %r3467;
|
|
mov.u32 %r3425, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2990, %r2991 }, { %r3422, %r3423, %r3424, %r3425 };
|
|
mov.u32 %r3436, %r3465;
|
|
mov.u32 %r3437, %r3465;
|
|
mov.u32 %r3438, %r3467;
|
|
mov.u32 %r3439, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3034, %r3035, %r3036, %r3037 }, { %r2992, %r2993 }, { %r3436, %r3437, %r3438, %r3439 };
|
|
mov.u32 %r3450, %r3465;
|
|
mov.u32 %r3451, %r3465;
|
|
mov.u32 %r3452, %r3467;
|
|
mov.u32 %r3453, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3010, %r3011 }, { %r3450, %r3451, %r3452, %r3453 };
|
|
mov.u32 %r3464, %r3465;
|
|
mov.u32 %r3466, %r3467;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3034, %r3035, %r3036, %r3037 }, { %r3012, %r3013 }, { %r3464, %r3465, %r3466, %r3467 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2875, %r2876 }, { %r3254, %r3255, %r3256, %r3257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2877, %r2878 }, { %r3268, %r3269, %r3270, %r3271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2895, %r2896 }, { %r3282, %r3283, %r3284, %r3285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2897, %r2898 }, { %r3296, %r3297, %r3298, %r3299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2915, %r2916 }, { %r3310, %r3311, %r3312, %r3313 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2917, %r2918 }, { %r3324, %r3325, %r3326, %r3327 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2935, %r2936 }, { %r3338, %r3339, %r3340, %r3341 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2937, %r2938 }, { %r3352, %r3353, %r3354, %r3355 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2955, %r2956 }, { %r3366, %r3367, %r3368, %r3369 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2957, %r2958 }, { %r3380, %r3381, %r3382, %r3383 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2975, %r2976 }, { %r3394, %r3395, %r3396, %r3397 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2977, %r2978 }, { %r3408, %r3409, %r3410, %r3411 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2995, %r2996 }, { %r3422, %r3423, %r3424, %r3425 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3258, %r3259, %r3260, %r3261 }, { %r2997, %r2998 }, { %r3436, %r3437, %r3438, %r3439 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3015, %r3016 }, { %r3450, %r3451, %r3452, %r3453 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3258, %r3259, %r3260, %r3261 }, { %r3017, %r3018 }, { %r3464, %r3465, %r3466, %r3467 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2880, %r2881 }, { %r3254, %r3255, %r3256, %r3257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2882, %r2883 }, { %r3268, %r3269, %r3270, %r3271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2900, %r2901 }, { %r3282, %r3283, %r3284, %r3285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2902, %r2903 }, { %r3296, %r3297, %r3298, %r3299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2920, %r2921 }, { %r3310, %r3311, %r3312, %r3313 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2922, %r2923 }, { %r3324, %r3325, %r3326, %r3327 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2940, %r2941 }, { %r3338, %r3339, %r3340, %r3341 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2942, %r2943 }, { %r3352, %r3353, %r3354, %r3355 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2960, %r2961 }, { %r3366, %r3367, %r3368, %r3369 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2962, %r2963 }, { %r3380, %r3381, %r3382, %r3383 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2980, %r2981 }, { %r3394, %r3395, %r3396, %r3397 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3482, %r3483, %r3484, %r3485 }, { %r2982, %r2983 }, { %r3408, %r3409, %r3410, %r3411 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3000, %r3001 }, { %r3422, %r3423, %r3424, %r3425 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3002, %r3003 }, { %r3436, %r3437, %r3438, %r3439 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3020, %r3021 }, { %r3450, %r3451, %r3452, %r3453 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3482, %r3483, %r3484, %r3485 }, { %r3022, %r3023 }, { %r3464, %r3465, %r3466, %r3467 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3254, %r3255, %r3256, %r3257 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2885, %r2886 }, { %r3254, %r3255, %r3256, %r3257 };
|
|
mov.b32 %f589, %r3256;
|
|
mov.b32 %f590, %r3257;
|
|
mov.b32 %f591, %r3254;
|
|
mov.b32 %f592, %r3255;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3268, %r3269, %r3270, %r3271 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2887, %r2888 }, { %r3268, %r3269, %r3270, %r3271 };
|
|
mov.b32 %f593, %r3270;
|
|
mov.b32 %f594, %r3271;
|
|
mov.b32 %f595, %r3268;
|
|
mov.b32 %f596, %r3269;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3282, %r3283, %r3284, %r3285 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2905, %r2906 }, { %r3282, %r3283, %r3284, %r3285 };
|
|
mov.b32 %f597, %r3284;
|
|
mov.b32 %f598, %r3285;
|
|
mov.b32 %f599, %r3282;
|
|
mov.b32 %f600, %r3283;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3296, %r3297, %r3298, %r3299 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2907, %r2908 }, { %r3296, %r3297, %r3298, %r3299 };
|
|
mov.b32 %f601, %r3298;
|
|
mov.b32 %f602, %r3299;
|
|
mov.b32 %f603, %r3296;
|
|
mov.b32 %f604, %r3297;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3310, %r3311, %r3312, %r3313 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2925, %r2926 }, { %r3310, %r3311, %r3312, %r3313 };
|
|
mov.b32 %f605, %r3312;
|
|
mov.b32 %f606, %r3313;
|
|
mov.b32 %f607, %r3310;
|
|
mov.b32 %f608, %r3311;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3324, %r3325, %r3326, %r3327 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2927, %r2928 }, { %r3324, %r3325, %r3326, %r3327 };
|
|
mov.b32 %f609, %r3326;
|
|
mov.b32 %f610, %r3327;
|
|
mov.b32 %f611, %r3324;
|
|
mov.b32 %f612, %r3325;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3338, %r3339, %r3340, %r3341 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2945, %r2946 }, { %r3338, %r3339, %r3340, %r3341 };
|
|
mov.b32 %f613, %r3340;
|
|
mov.b32 %f614, %r3341;
|
|
mov.b32 %f615, %r3338;
|
|
mov.b32 %f616, %r3339;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3352, %r3353, %r3354, %r3355 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2947, %r2948 }, { %r3352, %r3353, %r3354, %r3355 };
|
|
mov.b32 %f617, %r3354;
|
|
mov.b32 %f618, %r3355;
|
|
mov.b32 %f619, %r3352;
|
|
mov.b32 %f620, %r3353;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3366, %r3367, %r3368, %r3369 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2965, %r2966 }, { %r3366, %r3367, %r3368, %r3369 };
|
|
mov.b32 %f621, %r3368;
|
|
mov.b32 %f622, %r3369;
|
|
mov.b32 %f623, %r3366;
|
|
mov.b32 %f624, %r3367;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3380, %r3381, %r3382, %r3383 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2967, %r2968 }, { %r3380, %r3381, %r3382, %r3383 };
|
|
mov.b32 %f625, %r3382;
|
|
mov.b32 %f626, %r3383;
|
|
mov.b32 %f627, %r3380;
|
|
mov.b32 %f628, %r3381;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3394, %r3395, %r3396, %r3397 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2985, %r2986 }, { %r3394, %r3395, %r3396, %r3397 };
|
|
mov.b32 %f629, %r3396;
|
|
mov.b32 %f630, %r3397;
|
|
mov.b32 %f631, %r3394;
|
|
mov.b32 %f632, %r3395;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3408, %r3409, %r3410, %r3411 }, { %r3706, %r3707, %r3708, %r3709 }, { %r2987, %r2988 }, { %r3408, %r3409, %r3410, %r3411 };
|
|
mov.b32 %f633, %r3410;
|
|
mov.b32 %f634, %r3411;
|
|
mov.b32 %f635, %r3408;
|
|
mov.b32 %f636, %r3409;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3422, %r3423, %r3424, %r3425 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3005, %r3006 }, { %r3422, %r3423, %r3424, %r3425 };
|
|
mov.b32 %f637, %r3424;
|
|
mov.b32 %f638, %r3425;
|
|
mov.b32 %f639, %r3422;
|
|
mov.b32 %f640, %r3423;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3436, %r3437, %r3438, %r3439 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3007, %r3008 }, { %r3436, %r3437, %r3438, %r3439 };
|
|
mov.b32 %f641, %r3438;
|
|
mov.b32 %f642, %r3439;
|
|
mov.b32 %f643, %r3436;
|
|
mov.b32 %f644, %r3437;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3450, %r3451, %r3452, %r3453 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3025, %r3026 }, { %r3450, %r3451, %r3452, %r3453 };
|
|
mov.b32 %f645, %r3452;
|
|
mov.b32 %f646, %r3453;
|
|
mov.b32 %f647, %r3450;
|
|
mov.b32 %f648, %r3451;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r3464, %r3465, %r3466, %r3467 }, { %r3706, %r3707, %r3708, %r3709 }, { %r3027, %r3028 }, { %r3464, %r3465, %r3466, %r3467 };
|
|
mov.b32 %f649, %r3466;
|
|
mov.b32 %f650, %r3467;
|
|
mov.b32 %f651, %r3464;
|
|
mov.b32 %f652, %r3465;
|
|
mul.f32 %f653, %f262, %f592;
|
|
mul.f32 %f654, %f260, %f591;
|
|
mul.f32 %f655, %f266, %f590;
|
|
mul.f32 %f656, %f264, %f589;
|
|
mul.f32 %f657, %f270, %f596;
|
|
mul.f32 %f658, %f268, %f595;
|
|
mul.f32 %f659, %f274, %f594;
|
|
mul.f32 %f660, %f272, %f593;
|
|
mul.f32 %f661, %f278, %f600;
|
|
mul.f32 %f662, %f276, %f599;
|
|
mul.f32 %f663, %f282, %f598;
|
|
mul.f32 %f664, %f280, %f597;
|
|
mul.f32 %f665, %f286, %f604;
|
|
mul.f32 %f666, %f284, %f603;
|
|
mul.f32 %f667, %f290, %f602;
|
|
mul.f32 %f668, %f288, %f601;
|
|
mul.f32 %f669, %f294, %f608;
|
|
mul.f32 %f670, %f292, %f607;
|
|
mul.f32 %f671, %f298, %f606;
|
|
mul.f32 %f672, %f296, %f605;
|
|
mul.f32 %f673, %f302, %f612;
|
|
mul.f32 %f674, %f300, %f611;
|
|
mul.f32 %f675, %f306, %f610;
|
|
mul.f32 %f676, %f304, %f609;
|
|
mul.f32 %f677, %f310, %f616;
|
|
mul.f32 %f678, %f308, %f615;
|
|
mul.f32 %f679, %f314, %f614;
|
|
mul.f32 %f680, %f312, %f613;
|
|
mul.f32 %f681, %f318, %f620;
|
|
mul.f32 %f682, %f316, %f619;
|
|
mul.f32 %f683, %f322, %f618;
|
|
mul.f32 %f684, %f320, %f617;
|
|
mul.f32 %f685, %f326, %f624;
|
|
mul.f32 %f686, %f324, %f623;
|
|
mul.f32 %f687, %f330, %f622;
|
|
mul.f32 %f688, %f328, %f621;
|
|
mul.f32 %f689, %f334, %f628;
|
|
mul.f32 %f690, %f332, %f627;
|
|
mul.f32 %f691, %f338, %f626;
|
|
mul.f32 %f692, %f336, %f625;
|
|
mul.f32 %f693, %f342, %f632;
|
|
mul.f32 %f694, %f340, %f631;
|
|
mul.f32 %f695, %f346, %f630;
|
|
mul.f32 %f696, %f344, %f629;
|
|
mul.f32 %f697, %f350, %f636;
|
|
mul.f32 %f698, %f348, %f635;
|
|
mul.f32 %f699, %f354, %f634;
|
|
mul.f32 %f700, %f352, %f633;
|
|
mul.f32 %f701, %f358, %f640;
|
|
mul.f32 %f702, %f356, %f639;
|
|
mul.f32 %f703, %f362, %f638;
|
|
mul.f32 %f704, %f360, %f637;
|
|
mul.f32 %f705, %f366, %f644;
|
|
mul.f32 %f706, %f364, %f643;
|
|
mul.f32 %f707, %f370, %f642;
|
|
mul.f32 %f708, %f368, %f641;
|
|
mul.f32 %f709, %f374, %f648;
|
|
mul.f32 %f710, %f372, %f647;
|
|
mul.f32 %f711, %f378, %f646;
|
|
mul.f32 %f712, %f376, %f645;
|
|
mul.f32 %f713, %f382, %f652;
|
|
mul.f32 %f714, %f380, %f651;
|
|
mul.f32 %f715, %f386, %f650;
|
|
mul.f32 %f716, %f384, %f649;
|
|
mul.f32 %f717, %f654, %f195;
|
|
mul.f32 %f718, %f653, %f195;
|
|
mul.f32 %f719, %f656, %f195;
|
|
mul.f32 %f720, %f655, %f195;
|
|
mul.f32 %f721, %f658, %f195;
|
|
mul.f32 %f722, %f657, %f195;
|
|
mul.f32 %f723, %f660, %f195;
|
|
mul.f32 %f724, %f659, %f195;
|
|
mul.f32 %f725, %f662, %f195;
|
|
mul.f32 %f726, %f661, %f195;
|
|
mul.f32 %f727, %f664, %f195;
|
|
mul.f32 %f728, %f663, %f195;
|
|
mul.f32 %f729, %f666, %f195;
|
|
mul.f32 %f730, %f665, %f195;
|
|
mul.f32 %f731, %f668, %f195;
|
|
mul.f32 %f732, %f667, %f195;
|
|
mul.f32 %f733, %f670, %f195;
|
|
mul.f32 %f734, %f669, %f195;
|
|
mul.f32 %f735, %f672, %f195;
|
|
mul.f32 %f736, %f671, %f195;
|
|
mul.f32 %f737, %f674, %f195;
|
|
mul.f32 %f738, %f673, %f195;
|
|
mul.f32 %f739, %f676, %f195;
|
|
mul.f32 %f740, %f675, %f195;
|
|
mul.f32 %f741, %f678, %f195;
|
|
mul.f32 %f742, %f677, %f195;
|
|
mul.f32 %f743, %f680, %f195;
|
|
mul.f32 %f744, %f679, %f195;
|
|
mul.f32 %f745, %f682, %f195;
|
|
mul.f32 %f746, %f681, %f195;
|
|
mul.f32 %f747, %f684, %f195;
|
|
mul.f32 %f748, %f683, %f195;
|
|
mul.f32 %f749, %f686, %f195;
|
|
mul.f32 %f750, %f685, %f195;
|
|
mul.f32 %f751, %f688, %f195;
|
|
mul.f32 %f752, %f687, %f195;
|
|
mul.f32 %f753, %f690, %f195;
|
|
mul.f32 %f754, %f689, %f195;
|
|
mul.f32 %f755, %f692, %f195;
|
|
mul.f32 %f756, %f691, %f195;
|
|
mul.f32 %f757, %f694, %f195;
|
|
mul.f32 %f758, %f693, %f195;
|
|
mul.f32 %f759, %f696, %f195;
|
|
mul.f32 %f760, %f695, %f195;
|
|
mul.f32 %f761, %f698, %f195;
|
|
mul.f32 %f762, %f697, %f195;
|
|
mul.f32 %f763, %f700, %f195;
|
|
mul.f32 %f764, %f699, %f195;
|
|
mul.f32 %f765, %f702, %f195;
|
|
mul.f32 %f766, %f701, %f195;
|
|
mul.f32 %f767, %f704, %f195;
|
|
mul.f32 %f768, %f703, %f195;
|
|
mul.f32 %f769, %f706, %f195;
|
|
mul.f32 %f770, %f705, %f195;
|
|
mul.f32 %f771, %f708, %f195;
|
|
mul.f32 %f772, %f707, %f195;
|
|
mul.f32 %f773, %f710, %f195;
|
|
mul.f32 %f774, %f709, %f195;
|
|
mul.f32 %f775, %f712, %f195;
|
|
mul.f32 %f776, %f711, %f195;
|
|
mul.f32 %f777, %f714, %f195;
|
|
mul.f32 %f778, %f713, %f195;
|
|
mul.f32 %f779, %f716, %f195;
|
|
mul.f32 %f780, %f715, %f195;
|
|
cvt.rn.f16.f32 %h65, %f718;
|
|
cvt.rn.f16.f32 %h66, %f717;
|
|
cvt.rn.f16.f32 %h67, %f720;
|
|
cvt.rn.f16.f32 %h68, %f719;
|
|
cvt.rn.f16.f32 %h69, %f722;
|
|
cvt.rn.f16.f32 %h70, %f721;
|
|
cvt.rn.f16.f32 %h71, %f724;
|
|
cvt.rn.f16.f32 %h72, %f723;
|
|
cvt.rn.f16.f32 %h73, %f726;
|
|
cvt.rn.f16.f32 %h74, %f725;
|
|
cvt.rn.f16.f32 %h75, %f728;
|
|
cvt.rn.f16.f32 %h76, %f727;
|
|
cvt.rn.f16.f32 %h77, %f730;
|
|
cvt.rn.f16.f32 %h78, %f729;
|
|
cvt.rn.f16.f32 %h79, %f732;
|
|
cvt.rn.f16.f32 %h80, %f731;
|
|
cvt.rn.f16.f32 %h81, %f734;
|
|
cvt.rn.f16.f32 %h82, %f733;
|
|
cvt.rn.f16.f32 %h83, %f736;
|
|
cvt.rn.f16.f32 %h84, %f735;
|
|
cvt.rn.f16.f32 %h85, %f738;
|
|
cvt.rn.f16.f32 %h86, %f737;
|
|
cvt.rn.f16.f32 %h87, %f740;
|
|
cvt.rn.f16.f32 %h88, %f739;
|
|
cvt.rn.f16.f32 %h89, %f742;
|
|
cvt.rn.f16.f32 %h90, %f741;
|
|
cvt.rn.f16.f32 %h91, %f744;
|
|
cvt.rn.f16.f32 %h92, %f743;
|
|
cvt.rn.f16.f32 %h93, %f746;
|
|
cvt.rn.f16.f32 %h94, %f745;
|
|
cvt.rn.f16.f32 %h95, %f748;
|
|
cvt.rn.f16.f32 %h96, %f747;
|
|
cvt.rn.f16.f32 %h97, %f750;
|
|
cvt.rn.f16.f32 %h98, %f749;
|
|
cvt.rn.f16.f32 %h99, %f752;
|
|
cvt.rn.f16.f32 %h100, %f751;
|
|
cvt.rn.f16.f32 %h101, %f754;
|
|
cvt.rn.f16.f32 %h102, %f753;
|
|
cvt.rn.f16.f32 %h103, %f756;
|
|
cvt.rn.f16.f32 %h104, %f755;
|
|
cvt.rn.f16.f32 %h105, %f758;
|
|
cvt.rn.f16.f32 %h106, %f757;
|
|
cvt.rn.f16.f32 %h107, %f760;
|
|
cvt.rn.f16.f32 %h108, %f759;
|
|
cvt.rn.f16.f32 %h109, %f762;
|
|
cvt.rn.f16.f32 %h110, %f761;
|
|
cvt.rn.f16.f32 %h111, %f764;
|
|
cvt.rn.f16.f32 %h112, %f763;
|
|
cvt.rn.f16.f32 %h113, %f766;
|
|
cvt.rn.f16.f32 %h114, %f765;
|
|
cvt.rn.f16.f32 %h115, %f768;
|
|
cvt.rn.f16.f32 %h116, %f767;
|
|
cvt.rn.f16.f32 %h117, %f770;
|
|
cvt.rn.f16.f32 %h118, %f769;
|
|
cvt.rn.f16.f32 %h119, %f772;
|
|
cvt.rn.f16.f32 %h120, %f771;
|
|
cvt.rn.f16.f32 %h121, %f774;
|
|
cvt.rn.f16.f32 %h122, %f773;
|
|
cvt.rn.f16.f32 %h123, %f776;
|
|
cvt.rn.f16.f32 %h124, %f775;
|
|
cvt.rn.f16.f32 %h125, %f778;
|
|
cvt.rn.f16.f32 %h126, %f777;
|
|
cvt.rn.f16.f32 %h127, %f780;
|
|
cvt.rn.f16.f32 %h128, %f779;
|
|
bar.sync 0;
|
|
st.shared.v2.b16 [%r182], {%h66, %h65};
|
|
st.shared.v2.b16 [%r183], {%h68, %h67};
|
|
st.shared.v2.b16 [%r184], {%h70, %h69};
|
|
st.shared.v2.b16 [%r185], {%h72, %h71};
|
|
st.shared.v2.b16 [%r186], {%h74, %h73};
|
|
st.shared.v2.b16 [%r187], {%h76, %h75};
|
|
st.shared.v2.b16 [%r188], {%h78, %h77};
|
|
st.shared.v2.b16 [%r189], {%h80, %h79};
|
|
st.shared.v2.b16 [%r190], {%h82, %h81};
|
|
st.shared.v2.b16 [%r191], {%h84, %h83};
|
|
st.shared.v2.b16 [%r192], {%h86, %h85};
|
|
st.shared.v2.b16 [%r193], {%h88, %h87};
|
|
st.shared.v2.b16 [%r194], {%h90, %h89};
|
|
st.shared.v2.b16 [%r195], {%h92, %h91};
|
|
st.shared.v2.b16 [%r196], {%h94, %h93};
|
|
st.shared.v2.b16 [%r197], {%h96, %h95};
|
|
st.shared.v2.b16 [%r182+128], {%h98, %h97};
|
|
st.shared.v2.b16 [%r183+128], {%h100, %h99};
|
|
st.shared.v2.b16 [%r200], {%h102, %h101};
|
|
st.shared.v2.b16 [%r201], {%h104, %h103};
|
|
st.shared.v2.b16 [%r202], {%h106, %h105};
|
|
st.shared.v2.b16 [%r203], {%h108, %h107};
|
|
st.shared.v2.b16 [%r204], {%h110, %h109};
|
|
st.shared.v2.b16 [%r205], {%h112, %h111};
|
|
st.shared.v2.b16 [%r206], {%h114, %h113};
|
|
st.shared.v2.b16 [%r207], {%h116, %h115};
|
|
st.shared.v2.b16 [%r208], {%h118, %h117};
|
|
st.shared.v2.b16 [%r209], {%h120, %h119};
|
|
st.shared.v2.b16 [%r210], {%h122, %h121};
|
|
st.shared.v2.b16 [%r211], {%h124, %h123};
|
|
st.shared.v2.b16 [%r212], {%h126, %h125};
|
|
st.shared.v2.b16 [%r213], {%h128, %h127};
|
|
bar.sync 0;
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4090, %r4091, %r4092, %r4093 }, [ %r3930 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4202, %r4203, %r4204, %r4205 }, [ %r3935 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4314, %r4315, %r4316, %r4317 }, [ %r3940 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4426, %r4427, %r4428, %r4429 }, [ %r3945 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4538, %r4539, %r4540, %r4541 }, [ %r3950 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4650, %r4651, %r4652, %r4653 }, [ %r3955 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4762, %r4763, %r4764, %r4765 }, [ %r3960 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4874, %r4875, %r4876, %r4877 }, [ %r3965 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4146, %r4147, %r4148, %r4149 }, [ %r3970 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4258, %r4259, %r4260, %r4261 }, [ %r3975 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4370, %r4371, %r4372, %r4373 }, [ %r3980 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4482, %r4483, %r4484, %r4485 }, [ %r3985 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4594, %r4595, %r4596, %r4597 }, [ %r3990 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4706, %r4707, %r4708, %r4709 }, [ %r3995 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4818, %r4819, %r4820, %r4821 }, [ %r4000 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4930, %r4931, %r4932, %r4933 }, [ %r4005 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4094, %r4095, %r4108, %r4109 }, [ %r4010 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4206, %r4207, %r4220, %r4221 }, [ %r4015 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4318, %r4319, %r4332, %r4333 }, [ %r4020 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4430, %r4431, %r4444, %r4445 }, [ %r4025 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4542, %r4543, %r4556, %r4557 }, [ %r4030 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4654, %r4655, %r4668, %r4669 }, [ %r4035 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4766, %r4767, %r4780, %r4781 }, [ %r4040 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4878, %r4879, %r4892, %r4893 }, [ %r4045 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4122, %r4123, %r4136, %r4137 }, [ %r4050 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4234, %r4235, %r4248, %r4249 }, [ %r4055 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4346, %r4347, %r4360, %r4361 }, [ %r4060 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4458, %r4459, %r4472, %r4473 }, [ %r4065 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4570, %r4571, %r4584, %r4585 }, [ %r4070 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4682, %r4683, %r4696, %r4697 }, [ %r4075 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4794, %r4795, %r4808, %r4809 }, [ %r4080 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r4906, %r4907, %r4920, %r4921 }, [ %r4085 + 0 ];
|
|
mov.b32 %r4198, %f941;
|
|
mov.b32 %r4199, %f942;
|
|
mov.b32 %r4200, %f943;
|
|
mov.b32 %r4201, %f944;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4094, %r4095 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mov.b32 %r4212, %f945;
|
|
mov.b32 %r4213, %f946;
|
|
mov.b32 %r4214, %f947;
|
|
mov.b32 %r4215, %f948;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4108, %r4109 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mov.b32 %r4226, %f949;
|
|
mov.b32 %r4227, %f950;
|
|
mov.b32 %r4228, %f951;
|
|
mov.b32 %r4229, %f952;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4122, %r4123 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mov.b32 %r4240, %f953;
|
|
mov.b32 %r4241, %f954;
|
|
mov.b32 %r4242, %f955;
|
|
mov.b32 %r4243, %f956;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4090, %r4091, %r4092, %r4093 }, { %r4136, %r4137 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mov.b32 %r4254, %f957;
|
|
mov.b32 %r4255, %f958;
|
|
mov.b32 %r4256, %f959;
|
|
mov.b32 %r4257, %f960;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4094, %r4095 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mov.b32 %r4268, %f961;
|
|
mov.b32 %r4269, %f962;
|
|
mov.b32 %r4270, %f963;
|
|
mov.b32 %r4271, %f964;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4108, %r4109 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mov.b32 %r4282, %f965;
|
|
mov.b32 %r4283, %f966;
|
|
mov.b32 %r4284, %f967;
|
|
mov.b32 %r4285, %f968;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4122, %r4123 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mov.b32 %r4296, %f969;
|
|
mov.b32 %r4297, %f970;
|
|
mov.b32 %r4298, %f971;
|
|
mov.b32 %r4299, %f972;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4146, %r4147, %r4148, %r4149 }, { %r4136, %r4137 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4206, %r4207 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4220, %r4221 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4234, %r4235 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4202, %r4203, %r4204, %r4205 }, { %r4248, %r4249 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4206, %r4207 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4220, %r4221 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4234, %r4235 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4258, %r4259, %r4260, %r4261 }, { %r4248, %r4249 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4318, %r4319 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4332, %r4333 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4346, %r4347 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4314, %r4315, %r4316, %r4317 }, { %r4360, %r4361 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4318, %r4319 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4332, %r4333 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4346, %r4347 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4370, %r4371, %r4372, %r4373 }, { %r4360, %r4361 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4430, %r4431 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4444, %r4445 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4458, %r4459 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4426, %r4427, %r4428, %r4429 }, { %r4472, %r4473 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4430, %r4431 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4444, %r4445 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4458, %r4459 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4482, %r4483, %r4484, %r4485 }, { %r4472, %r4473 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4542, %r4543 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4556, %r4557 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4570, %r4571 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4538, %r4539, %r4540, %r4541 }, { %r4584, %r4585 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4542, %r4543 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4556, %r4557 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4570, %r4571 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4594, %r4595, %r4596, %r4597 }, { %r4584, %r4585 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4654, %r4655 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4668, %r4669 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4682, %r4683 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4650, %r4651, %r4652, %r4653 }, { %r4696, %r4697 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4654, %r4655 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4668, %r4669 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4682, %r4683 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4706, %r4707, %r4708, %r4709 }, { %r4696, %r4697 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4766, %r4767 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4780, %r4781 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4794, %r4795 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4762, %r4763, %r4764, %r4765 }, { %r4808, %r4809 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4766, %r4767 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4780, %r4781 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4794, %r4795 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4818, %r4819, %r4820, %r4821 }, { %r4808, %r4809 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4198, %r4199, %r4200, %r4201 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4878, %r4879 }, { %r4198, %r4199, %r4200, %r4201 };
|
|
mov.b32 %f944, %r4201;
|
|
mov.b32 %f943, %r4200;
|
|
mov.b32 %f942, %r4199;
|
|
mov.b32 %f941, %r4198;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4212, %r4213, %r4214, %r4215 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4892, %r4893 }, { %r4212, %r4213, %r4214, %r4215 };
|
|
mov.b32 %f948, %r4215;
|
|
mov.b32 %f947, %r4214;
|
|
mov.b32 %f946, %r4213;
|
|
mov.b32 %f945, %r4212;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4226, %r4227, %r4228, %r4229 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4906, %r4907 }, { %r4226, %r4227, %r4228, %r4229 };
|
|
mov.b32 %f952, %r4229;
|
|
mov.b32 %f951, %r4228;
|
|
mov.b32 %f950, %r4227;
|
|
mov.b32 %f949, %r4226;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4240, %r4241, %r4242, %r4243 }, { %r4874, %r4875, %r4876, %r4877 }, { %r4920, %r4921 }, { %r4240, %r4241, %r4242, %r4243 };
|
|
mov.b32 %f956, %r4243;
|
|
mov.b32 %f955, %r4242;
|
|
mov.b32 %f954, %r4241;
|
|
mov.b32 %f953, %r4240;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4254, %r4255, %r4256, %r4257 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4878, %r4879 }, { %r4254, %r4255, %r4256, %r4257 };
|
|
mov.b32 %f960, %r4257;
|
|
mov.b32 %f959, %r4256;
|
|
mov.b32 %f958, %r4255;
|
|
mov.b32 %f957, %r4254;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4268, %r4269, %r4270, %r4271 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4892, %r4893 }, { %r4268, %r4269, %r4270, %r4271 };
|
|
mov.b32 %f964, %r4271;
|
|
mov.b32 %f963, %r4270;
|
|
mov.b32 %f962, %r4269;
|
|
mov.b32 %f961, %r4268;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4282, %r4283, %r4284, %r4285 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4906, %r4907 }, { %r4282, %r4283, %r4284, %r4285 };
|
|
mov.b32 %f968, %r4285;
|
|
mov.b32 %f967, %r4284;
|
|
mov.b32 %f966, %r4283;
|
|
mov.b32 %f965, %r4282;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r4296, %r4297, %r4298, %r4299 }, { %r4930, %r4931, %r4932, %r4933 }, { %r4920, %r4921 }, { %r4296, %r4297, %r4298, %r4299 };
|
|
mov.b32 %f972, %r4299;
|
|
mov.b32 %f971, %r4298;
|
|
mov.b32 %f970, %r4297;
|
|
mov.b32 %f969, %r4296;
|
|
@%p102 ld.global.v4.b32 { %r4982, %r4983, %r4984, %r4985 }, [ %rd96 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r4986, %r4987, %r4988, %r4989 }, [ %rd97 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r4990, %r4991, %r4992, %r4993 }, [ %rd98 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r4994, %r4995, %r4996, %r4997 }, [ %rd99 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r4998, %r4999, %r5000, %r5001 }, [ %rd100 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r5002, %r5003, %r5004, %r5005 }, [ %rd101 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r5006, %r5007, %r5008, %r5009 }, [ %rd102 + 0 ];
|
|
@%p102 ld.global.v4.b32 { %r5010, %r5011, %r5012, %r5013 }, [ %rd103 + 0 ];
|
|
bar.sync 0;
|
|
st.shared.v4.u32 [%r246], {%r4982, %r4983, %r4984, %r4985};
|
|
st.shared.v4.u32 [%r246+4352], {%r4986, %r4987, %r4988, %r4989};
|
|
st.shared.v4.u32 [%r246+8704], {%r4990, %r4991, %r4992, %r4993};
|
|
st.shared.v4.u32 [%r246+13056], {%r4994, %r4995, %r4996, %r4997};
|
|
bar.sync 0;
|
|
ld.shared.v2.f32 {%f781, %f782}, [%r247];
|
|
ld.shared.v2.f32 {%f783, %f784}, [%r248];
|
|
ld.shared.v2.f32 {%f785, %f786}, [%r247+64];
|
|
ld.shared.v2.f32 {%f787, %f788}, [%r248+64];
|
|
ld.shared.v2.f32 {%f789, %f790}, [%r247+128];
|
|
ld.shared.v2.f32 {%f791, %f792}, [%r248+128];
|
|
ld.shared.v2.f32 {%f793, %f794}, [%r247+192];
|
|
ld.shared.v2.f32 {%f795, %f796}, [%r248+192];
|
|
bar.sync 0;
|
|
st.shared.v4.u32 [%r246], {%r4998, %r4999, %r5000, %r5001};
|
|
st.shared.v4.u32 [%r246+4352], {%r5002, %r5003, %r5004, %r5005};
|
|
st.shared.v4.u32 [%r246+8704], {%r5006, %r5007, %r5008, %r5009};
|
|
st.shared.v4.u32 [%r246+13056], {%r5010, %r5011, %r5012, %r5013};
|
|
bar.sync 0;
|
|
ld.shared.v2.f32 {%f797, %f798}, [%r247];
|
|
ld.shared.v2.f32 {%f799, %f800}, [%r248];
|
|
ld.shared.v2.f32 {%f801, %f802}, [%r247+64];
|
|
ld.shared.v2.f32 {%f803, %f804}, [%r248+64];
|
|
ld.shared.v2.f32 {%f805, %f806}, [%r247+128];
|
|
ld.shared.v2.f32 {%f807, %f808}, [%r248+128];
|
|
ld.shared.v2.f32 {%f809, %f810}, [%r247+192];
|
|
ld.shared.v2.f32 {%f811, %f812}, [%r248+192];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5178, %r5179, %r5180, %r5181 }, [ %r5018 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5290, %r5291, %r5292, %r5293 }, [ %r5023 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5402, %r5403, %r5404, %r5405 }, [ %r5028 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5514, %r5515, %r5516, %r5517 }, [ %r5033 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5626, %r5627, %r5628, %r5629 }, [ %r5038 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5738, %r5739, %r5740, %r5741 }, [ %r5043 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5850, %r5851, %r5852, %r5853 }, [ %r5048 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5962, %r5963, %r5964, %r5965 }, [ %r5053 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5234, %r5235, %r5236, %r5237 }, [ %r5058 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5346, %r5347, %r5348, %r5349 }, [ %r5063 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5458, %r5459, %r5460, %r5461 }, [ %r5068 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5570, %r5571, %r5572, %r5573 }, [ %r5073 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5682, %r5683, %r5684, %r5685 }, [ %r5078 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5794, %r5795, %r5796, %r5797 }, [ %r5083 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r5906, %r5907, %r5908, %r5909 }, [ %r5088 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.shared.b16 { %r6018, %r6019, %r6020, %r6021 }, [ %r5093 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5182, %r5183, %r5196, %r5197 }, [ %r5098 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5294, %r5295, %r5308, %r5309 }, [ %r5103 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5406, %r5407, %r5420, %r5421 }, [ %r5108 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5518, %r5519, %r5532, %r5533 }, [ %r5113 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5630, %r5631, %r5644, %r5645 }, [ %r5118 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5742, %r5743, %r5756, %r5757 }, [ %r5123 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5854, %r5855, %r5868, %r5869 }, [ %r5128 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5966, %r5967, %r5980, %r5981 }, [ %r5133 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5210, %r5211, %r5224, %r5225 }, [ %r5138 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5322, %r5323, %r5336, %r5337 }, [ %r5143 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5434, %r5435, %r5448, %r5449 }, [ %r5148 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5546, %r5547, %r5560, %r5561 }, [ %r5153 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5658, %r5659, %r5672, %r5673 }, [ %r5158 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5770, %r5771, %r5784, %r5785 }, [ %r5163 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5882, %r5883, %r5896, %r5897 }, [ %r5168 + 0 ];
|
|
ldmatrix.sync.aligned.m8n8.x4.trans.shared.b16 { %r5994, %r5995, %r6008, %r6009 }, [ %r5173 + 0 ];
|
|
mov.b32 %r5286, %f781;
|
|
mov.b32 %r5287, %f782;
|
|
mov.b32 %r5288, %f783;
|
|
mov.b32 %r5289, %f784;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5182, %r5183 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mov.b32 %r5300, %f785;
|
|
mov.b32 %r5301, %f786;
|
|
mov.b32 %r5302, %f787;
|
|
mov.b32 %r5303, %f788;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5196, %r5197 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mov.b32 %r5314, %f789;
|
|
mov.b32 %r5315, %f790;
|
|
mov.b32 %r5316, %f791;
|
|
mov.b32 %r5317, %f792;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5210, %r5211 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mov.b32 %r5328, %f793;
|
|
mov.b32 %r5329, %f794;
|
|
mov.b32 %r5330, %f795;
|
|
mov.b32 %r5331, %f796;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5178, %r5179, %r5180, %r5181 }, { %r5224, %r5225 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mov.b32 %r5342, %f797;
|
|
mov.b32 %r5343, %f798;
|
|
mov.b32 %r5344, %f799;
|
|
mov.b32 %r5345, %f800;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5182, %r5183 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mov.b32 %r5356, %f801;
|
|
mov.b32 %r5357, %f802;
|
|
mov.b32 %r5358, %f803;
|
|
mov.b32 %r5359, %f804;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5196, %r5197 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mov.b32 %r5370, %f805;
|
|
mov.b32 %r5371, %f806;
|
|
mov.b32 %r5372, %f807;
|
|
mov.b32 %r5373, %f808;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5210, %r5211 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mov.b32 %r5384, %f809;
|
|
mov.b32 %r5385, %f810;
|
|
mov.b32 %r5386, %f811;
|
|
mov.b32 %r5387, %f812;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5234, %r5235, %r5236, %r5237 }, { %r5224, %r5225 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5294, %r5295 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5308, %r5309 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5322, %r5323 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5290, %r5291, %r5292, %r5293 }, { %r5336, %r5337 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5294, %r5295 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5308, %r5309 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5322, %r5323 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5346, %r5347, %r5348, %r5349 }, { %r5336, %r5337 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5406, %r5407 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5420, %r5421 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5434, %r5435 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5402, %r5403, %r5404, %r5405 }, { %r5448, %r5449 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5406, %r5407 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5420, %r5421 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5434, %r5435 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5458, %r5459, %r5460, %r5461 }, { %r5448, %r5449 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5518, %r5519 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5532, %r5533 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5546, %r5547 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5514, %r5515, %r5516, %r5517 }, { %r5560, %r5561 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5518, %r5519 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5532, %r5533 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5546, %r5547 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5570, %r5571, %r5572, %r5573 }, { %r5560, %r5561 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5630, %r5631 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5644, %r5645 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5658, %r5659 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5626, %r5627, %r5628, %r5629 }, { %r5672, %r5673 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5630, %r5631 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5644, %r5645 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5658, %r5659 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5682, %r5683, %r5684, %r5685 }, { %r5672, %r5673 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5742, %r5743 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5756, %r5757 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5770, %r5771 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5738, %r5739, %r5740, %r5741 }, { %r5784, %r5785 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5742, %r5743 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5756, %r5757 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5770, %r5771 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5794, %r5795, %r5796, %r5797 }, { %r5784, %r5785 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5854, %r5855 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5868, %r5869 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5882, %r5883 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5850, %r5851, %r5852, %r5853 }, { %r5896, %r5897 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5854, %r5855 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5868, %r5869 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5882, %r5883 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r5906, %r5907, %r5908, %r5909 }, { %r5896, %r5897 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5286, %r5287, %r5288, %r5289 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5966, %r5967 }, { %r5286, %r5287, %r5288, %r5289 };
|
|
mov.b32 %f813, %r5289;
|
|
mov.b32 %f814, %r5288;
|
|
mov.b32 %f815, %r5287;
|
|
mov.b32 %f816, %r5286;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5300, %r5301, %r5302, %r5303 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5980, %r5981 }, { %r5300, %r5301, %r5302, %r5303 };
|
|
mov.b32 %f817, %r5303;
|
|
mov.b32 %f818, %r5302;
|
|
mov.b32 %f819, %r5301;
|
|
mov.b32 %f820, %r5300;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5314, %r5315, %r5316, %r5317 }, { %r5962, %r5963, %r5964, %r5965 }, { %r5994, %r5995 }, { %r5314, %r5315, %r5316, %r5317 };
|
|
mov.b32 %f821, %r5317;
|
|
mov.b32 %f822, %r5316;
|
|
mov.b32 %f823, %r5315;
|
|
mov.b32 %f824, %r5314;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5328, %r5329, %r5330, %r5331 }, { %r5962, %r5963, %r5964, %r5965 }, { %r6008, %r6009 }, { %r5328, %r5329, %r5330, %r5331 };
|
|
mov.b32 %f825, %r5331;
|
|
mov.b32 %f826, %r5330;
|
|
mov.b32 %f827, %r5329;
|
|
mov.b32 %f828, %r5328;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5342, %r5343, %r5344, %r5345 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5966, %r5967 }, { %r5342, %r5343, %r5344, %r5345 };
|
|
mov.b32 %f829, %r5345;
|
|
mov.b32 %f830, %r5344;
|
|
mov.b32 %f831, %r5343;
|
|
mov.b32 %f832, %r5342;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5356, %r5357, %r5358, %r5359 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5980, %r5981 }, { %r5356, %r5357, %r5358, %r5359 };
|
|
mov.b32 %f833, %r5359;
|
|
mov.b32 %f834, %r5358;
|
|
mov.b32 %f835, %r5357;
|
|
mov.b32 %f836, %r5356;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5370, %r5371, %r5372, %r5373 }, { %r6018, %r6019, %r6020, %r6021 }, { %r5994, %r5995 }, { %r5370, %r5371, %r5372, %r5373 };
|
|
mov.b32 %f837, %r5373;
|
|
mov.b32 %f838, %r5372;
|
|
mov.b32 %f839, %r5371;
|
|
mov.b32 %f840, %r5370;
|
|
mma.sync.aligned.m16n8k16.row.col.f32.f16.f16.f32 { %r5384, %r5385, %r5386, %r5387 }, { %r6018, %r6019, %r6020, %r6021 }, { %r6008, %r6009 }, { %r5384, %r5385, %r5386, %r5387 };
|
|
mov.b32 %f841, %r5387;
|
|
mov.b32 %f842, %r5386;
|
|
mov.b32 %f843, %r5385;
|
|
mov.b32 %f844, %r5384;
|
|
bar.sync 0;
|
|
st.shared.v2.f32 [%r247], {%f816, %f815};
|
|
st.shared.v2.f32 [%r248], {%f814, %f813};
|
|
st.shared.v2.f32 [%r247+64], {%f820, %f819};
|
|
st.shared.v2.f32 [%r248+64], {%f818, %f817};
|
|
st.shared.v2.f32 [%r247+128], {%f824, %f823};
|
|
st.shared.v2.f32 [%r248+128], {%f822, %f821};
|
|
st.shared.v2.f32 [%r247+192], {%f828, %f827};
|
|
st.shared.v2.f32 [%r248+192], {%f826, %f825};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6070, %r6071, %r6072, %r6073}, [%r246];
|
|
ld.shared.v4.u32 {%r6074, %r6075, %r6076, %r6077}, [%r246+4352];
|
|
ld.shared.v4.u32 {%r6078, %r6079, %r6080, %r6081}, [%r246+8704];
|
|
ld.shared.v4.u32 {%r6082, %r6083, %r6084, %r6085}, [%r246+13056];
|
|
bar.sync 0;
|
|
st.shared.v2.f32 [%r247], {%f832, %f831};
|
|
st.shared.v2.f32 [%r248], {%f830, %f829};
|
|
st.shared.v2.f32 [%r247+64], {%f836, %f835};
|
|
st.shared.v2.f32 [%r248+64], {%f834, %f833};
|
|
st.shared.v2.f32 [%r247+128], {%f840, %f839};
|
|
st.shared.v2.f32 [%r248+128], {%f838, %f837};
|
|
st.shared.v2.f32 [%r247+192], {%f844, %f843};
|
|
st.shared.v2.f32 [%r248+192], {%f842, %f841};
|
|
bar.sync 0;
|
|
ld.shared.v4.u32 {%r6086, %r6087, %r6088, %r6089}, [%r246];
|
|
ld.shared.v4.u32 {%r6090, %r6091, %r6092, %r6093}, [%r246+4352];
|
|
ld.shared.v4.u32 {%r6094, %r6095, %r6096, %r6097}, [%r246+8704];
|
|
ld.shared.v4.u32 {%r6098, %r6099, %r6100, %r6101}, [%r246+13056];
|
|
@%p102 st.global.v4.b32 [ %rd96 + 0 ], { %r6070, %r6071, %r6072, %r6073 };
|
|
@%p102 st.global.v4.b32 [ %rd97 + 0 ], { %r6074, %r6075, %r6076, %r6077 };
|
|
@%p102 st.global.v4.b32 [ %rd98 + 0 ], { %r6078, %r6079, %r6080, %r6081 };
|
|
@%p102 st.global.v4.b32 [ %rd99 + 0 ], { %r6082, %r6083, %r6084, %r6085 };
|
|
@%p102 st.global.v4.b32 [ %rd100 + 0 ], { %r6086, %r6087, %r6088, %r6089 };
|
|
@%p102 st.global.v4.b32 [ %rd101 + 0 ], { %r6090, %r6091, %r6092, %r6093 };
|
|
@%p102 st.global.v4.b32 [ %rd102 + 0 ], { %r6094, %r6095, %r6096, %r6097 };
|
|
@%p102 st.global.v4.b32 [ %rd103 + 0 ], { %r6098, %r6099, %r6100, %r6101 };
|
|
add.s32 %r6176, %r6176, 128;
|
|
add.s64 %rd138, %rd138, %rd10;
|
|
add.s64 %rd137, %rd137, %rd10;
|
|
add.s64 %rd136, %rd136, %rd10;
|
|
add.s64 %rd135, %rd135, %rd10;
|
|
add.s64 %rd134, %rd134, %rd10;
|
|
add.s64 %rd133, %rd133, %rd10;
|
|
add.s64 %rd132, %rd132, %rd10;
|
|
add.s64 %rd131, %rd131, %rd10;
|
|
add.s64 %rd130, %rd130, %rd11;
|
|
setp.lt.s32 %p101, %r6176, %r22;
|
|
@%p101 bra LBB0_4;
|
|
bra.uni LBB0_5;
|
|
LBB0_6:
|
|
ret;
|
|
|
|
}
|