// Kernel _Z11mm2_kernel1PfS_S_Pd. The Itanium-mangled name decodes to
// mm2_kernel1(float*, float*, float*, double*).
//
// NOTE(review): the number leading each line looks like a line number from
// the original PTX listing; the gaps (13 -> 28, 43 -> 45, 109 -> 111, ...)
// suggest directives such as .reg declarations, braces and ret were dropped
// from this excerpt -- confirm before treating this as assemblable PTX.
//
// Per thread:
//   x = ntid.x * ctaid.x + tid.x      (%r1)
//   y = ntid.y * ctaid.y + tid.y      (%r2)
// If x < 256 and y < 256, the thread adds, for k = 0..255, the products
//   param_0[y*256 + k] * param_1[k*256 + x]      (f32 elements)
// onto the f32 at param_2[y*256 + x], writing the running sum back to that
// location after every fma. Threads outside the bounds branch straight to
// BB0_3.
13 _Z11mm2_kernel1PfS_S_Pd
// Load the first three pointer parameters (param_3 is only read inside the
// conditional path further down).
28 ld.param.u64 	%rd4, [_Z11mm2_kernel1PfS_S_Pd_param_0];
29 ld.param.u64 	%rd5, [_Z11mm2_kernel1PfS_S_Pd_param_1];
30 ld.param.u64 	%rd6, [_Z11mm2_kernel1PfS_S_Pd_param_2];
// %r1 = x index of this thread.
31 mov.u32 	%r6, %ntid.x;
32 mov.u32 	%r7, %ctaid.x;
33 mov.u32 	%r8, %tid.x;
34 mad.lo.s32 	%r1, %r6, %r7, %r8;
// %r2 = y index of this thread.
35 mov.u32 	%r9, %ntid.y;
36 mov.u32 	%r10, %ctaid.y;
37 mov.u32 	%r11, %tid.y;
38 mad.lo.s32 	%r2, %r9, %r10, %r11;
// Bounds guard: continue only when y < 256 AND x < 256 (signed compares).
39 setp.lt.s32	%p1, %r2, 256;
40 setp.lt.s32	%p2, %r1, 256;
41 and.pred  	%p3, %p1, %p2;
42 @!%p3 bra 	BB0_3;
43 bra.uni 	BB0_1;
45 BB0_1:
// Convert the three pointers to global-space addresses:
//   %rd2 <- param_0, %rd1 <- param_1, %rd7 <- param_2.
46 cvta.to.global.u64 	%rd1, %rd5;
47 cvta.to.global.u64 	%rd2, %rd4;
48 cvta.to.global.u64 	%rd7, %rd6;
// %r3 = y * 256 (shift by 8); %rd3 = &param_2[y*256 + x] (4-byte elements).
49 shl.b32 	%r3, %r2, 8;
50 add.s32 	%r13, %r3, %r1;
51 mul.wide.s32 	%rd8, %r13, 4;
52 add.s64 	%rd3, %rd7, %rd8;
// %f27 is the running accumulator, seeded from the current output value.
53 ld.global.f32 	%f27, [%rd3];
// %r17 = k, the loop counter; it advances by 8 per trip.
54 mov.u32 	%r17, 0;
56 BB0_2:
// %rd10 = &param_0[y*256 + k]
57 add.s32 	%r14, %r17, %r3;
58 mul.wide.s32 	%rd9, %r14, 4;
59 add.s64 	%rd10, %rd2, %rd9;
// %rd12 = &param_1[k*256 + x]
60 shl.b32 	%r15, %r17, 8;
61 add.s32 	%r16, %r15, %r1;
62 mul.wide.s32 	%rd11, %r16, 4;
63 add.s64 	%rd12, %rd1, %rd11;
// Eight unrolled steps follow. Step j (0..7) reads param_1 at byte offset
// j*1024 from %rd12 (256 f32 elements = one row further) and param_0 at
// byte offset j*4 from %rd10 (next element), does acc = a*b + acc with
// fma.rn.f32, and stores acc to [%rd3].
// step 0
64 ld.global.f32 	%f4, [%rd12];
65 ld.global.f32 	%f5, [%rd10];
66 fma.rn.f32 	%f6, %f5, %f4, %f27;
67 st.global.f32 	[%rd3], %f6;
// step 1
68 ld.global.f32 	%f7, [%rd12+1024];
69 ld.global.f32 	%f8, [%rd10+4];
70 fma.rn.f32 	%f9, %f8, %f7, %f6;
71 st.global.f32 	[%rd3], %f9;
// step 2
72 ld.global.f32 	%f10, [%rd12+2048];
73 ld.global.f32 	%f11, [%rd10+8];
74 fma.rn.f32 	%f12, %f11, %f10, %f9;
75 st.global.f32 	[%rd3], %f12;
// step 3
76 ld.global.f32 	%f13, [%rd12+3072];
77 ld.global.f32 	%f14, [%rd10+12];
78 fma.rn.f32 	%f15, %f14, %f13, %f12;
79 st.global.f32 	[%rd3], %f15;
// step 4
80 ld.global.f32 	%f16, [%rd12+4096];
81 ld.global.f32 	%f17, [%rd10+16];
82 fma.rn.f32 	%f18, %f17, %f16, %f15;
83 st.global.f32 	[%rd3], %f18;
// step 5
84 ld.global.f32 	%f19, [%rd12+5120];
85 ld.global.f32 	%f20, [%rd10+20];
86 fma.rn.f32 	%f21, %f20, %f19, %f18;
87 st.global.f32 	[%rd3], %f21;
// step 6
88 ld.global.f32 	%f22, [%rd12+6144];
89 ld.global.f32 	%f23, [%rd10+24];
90 fma.rn.f32 	%f24, %f23, %f22, %f21;
91 st.global.f32 	[%rd3], %f24;
// step 7: the param_1 operand is loaded first ...
92 ld.global.f32 	%f25, [%rd12+7168];
// ... then a conditional side path runs before it is consumed. The path is
// taken only when k (%r17) == 40, x (%r1) == 6 and y (%r2) == 219; every
// other thread/iteration jumps to BB0_100.
93 setp.eq.s32 %p5, %r17, 40;
94 setp.eq.s32 %p6, %r1, 6;
95 setp.eq.s32 %p7, %r2, 219;
96 and.pred    %p8, %p6, %p7;
97 and.pred    %p9, %p8, %p5;
98 @!%p9 bra BB0_100;
// Side path: store the unmodified %f25 at [param_3], flip bit 20 of %f25
// (xor with 0x100000) in place, then store the modified value at
// [param_3 + 8]. The modified %f25 feeds the final fma below.
// NOTE(review): this looks like deliberate single-bit fault injection or
// instrumentation rather than part of the matrix computation -- confirm.
// NOTE(review): %rd0 is used by st.global without a cvta.to.global, unlike
// params 0-2, and the mangled name says param_3 is double* while these are
// f32 stores 8 bytes apart -- verify this is intended.
99 ld.param.u64 %rd0, [_Z11mm2_kernel1PfS_S_Pd_param_3];
100 st.global.f32 [%rd0],%f25;
101 xor.b32    %f25, %f25, 0x100000;
102 st.global.f32 [%rd0+8],%f25;
103 BB0_100:
// step 7 (continued): %f27 becomes the accumulator for the next trip.
104 ld.global.f32 	%f26, [%rd10+28];
105 fma.rn.f32 	%f27, %f26, %f25, %f24;
106 st.global.f32 	[%rd3], %f27;
// k += 8; repeat until k == 256 (32 trips).
107 add.s32 	%r17, %r17, 8;
108 setp.ne.s32	%p4, %r17, 256;
109 @%p4 bra 	BB0_2;
// Common exit label: out-of-range threads arrive here directly.
111 BB0_3:
// Kernel _Z11mm2_kernel2PfS_S_Pd. The Itanium-mangled name decodes to
// mm2_kernel2(float*, float*, float*, double*).
//
// NOTE(review): the number leading each line looks like a line number from
// the original PTX listing; the gaps (115 -> 129, 144 -> 146, ...) suggest
// directives such as .reg declarations, braces and ret were dropped from
// this excerpt -- confirm before treating this as assemblable PTX.
//
// Per thread:
//   x = ntid.x * ctaid.x + tid.x      (%r1)
//   y = ntid.y * ctaid.y + tid.y      (%r2)
// If x < 256 and y < 256, the thread adds, for k = 0..255, the products
//   param_0[y*256 + k] * param_1[k*256 + x]      (f32 elements)
// onto the f32 at param_2[y*256 + x], writing the running sum back to that
// location after every fma. Threads outside the bounds branch straight to
// BB1_3. param_3 is not read anywhere in the visible lines.
115 _Z11mm2_kernel2PfS_S_Pd
// Load the first three pointer parameters.
129 ld.param.u64 	%rd4, [_Z11mm2_kernel2PfS_S_Pd_param_0];
130 ld.param.u64 	%rd5, [_Z11mm2_kernel2PfS_S_Pd_param_1];
131 ld.param.u64 	%rd6, [_Z11mm2_kernel2PfS_S_Pd_param_2];
// %r1 = x index of this thread.
132 mov.u32 	%r6, %ntid.x;
133 mov.u32 	%r7, %ctaid.x;
134 mov.u32 	%r8, %tid.x;
135 mad.lo.s32 	%r1, %r6, %r7, %r8;
// %r2 = y index of this thread.
136 mov.u32 	%r9, %ntid.y;
137 mov.u32 	%r10, %ctaid.y;
138 mov.u32 	%r11, %tid.y;
139 mad.lo.s32 	%r2, %r9, %r10, %r11;
// Bounds guard: continue only when y < 256 AND x < 256 (signed compares).
140 setp.lt.s32	%p1, %r2, 256;
141 setp.lt.s32	%p2, %r1, 256;
142 and.pred  	%p3, %p1, %p2;
143 @!%p3 bra 	BB1_3;
144 bra.uni 	BB1_1;
146 BB1_1:
// Convert the three pointers to global-space addresses:
//   %rd2 <- param_0, %rd1 <- param_1, %rd7 <- param_2.
147 cvta.to.global.u64 	%rd1, %rd5;
148 cvta.to.global.u64 	%rd2, %rd4;
149 cvta.to.global.u64 	%rd7, %rd6;
// %r3 = y * 256 (shift by 8); %rd3 = &param_2[y*256 + x] (4-byte elements).
150 shl.b32 	%r3, %r2, 8;
151 add.s32 	%r13, %r3, %r1;
152 mul.wide.s32 	%rd8, %r13, 4;
153 add.s64 	%rd3, %rd7, %rd8;
// %f27 is the running accumulator, seeded from the current output value.
154 ld.global.f32 	%f27, [%rd3];
// %r17 = k, the loop counter; it advances by 8 per trip.
155 mov.u32 	%r17, 0;
157 BB1_2:
// %rd10 = &param_0[y*256 + k]
158 add.s32 	%r14, %r17, %r3;
159 mul.wide.s32 	%rd9, %r14, 4;
160 add.s64 	%rd10, %rd2, %rd9;
// %rd12 = &param_1[k*256 + x]
161 shl.b32 	%r15, %r17, 8;
162 add.s32 	%r16, %r15, %r1;
163 mul.wide.s32 	%rd11, %r16, 4;
164 add.s64 	%rd12, %rd1, %rd11;
// Eight unrolled steps follow. Step j (0..7) reads param_1 at byte offset
// j*1024 from %rd12 (256 f32 elements = one row further) and param_0 at
// byte offset j*4 from %rd10 (next element), does acc = a*b + acc with
// fma.rn.f32, and stores acc to [%rd3].
// step 0
165 ld.global.f32 	%f4, [%rd12];
166 ld.global.f32 	%f5, [%rd10];
167 fma.rn.f32 	%f6, %f5, %f4, %f27;
168 st.global.f32 	[%rd3], %f6;
// step 1
169 ld.global.f32 	%f7, [%rd12+1024];
170 ld.global.f32 	%f8, [%rd10+4];
171 fma.rn.f32 	%f9, %f8, %f7, %f6;
172 st.global.f32 	[%rd3], %f9;
// step 2
173 ld.global.f32 	%f10, [%rd12+2048];
174 ld.global.f32 	%f11, [%rd10+8];
175 fma.rn.f32 	%f12, %f11, %f10, %f9;
176 st.global.f32 	[%rd3], %f12;
// step 3
177 ld.global.f32 	%f13, [%rd12+3072];
178 ld.global.f32 	%f14, [%rd10+12];
179 fma.rn.f32 	%f15, %f14, %f13, %f12;
180 st.global.f32 	[%rd3], %f15;
// step 4
181 ld.global.f32 	%f16, [%rd12+4096];
182 ld.global.f32 	%f17, [%rd10+16];
183 fma.rn.f32 	%f18, %f17, %f16, %f15;
184 st.global.f32 	[%rd3], %f18;
// step 5
185 ld.global.f32 	%f19, [%rd12+5120];
186 ld.global.f32 	%f20, [%rd10+20];
187 fma.rn.f32 	%f21, %f20, %f19, %f18;
188 st.global.f32 	[%rd3], %f21;
// step 6
189 ld.global.f32 	%f22, [%rd12+6144];
190 ld.global.f32 	%f23, [%rd10+24];
191 fma.rn.f32 	%f24, %f23, %f22, %f21;
192 st.global.f32 	[%rd3], %f24;
// step 7: %f27 becomes the accumulator for the next trip.
193 ld.global.f32 	%f25, [%rd12+7168];
194 ld.global.f32 	%f26, [%rd10+28];
195 fma.rn.f32 	%f27, %f26, %f25, %f24;
196 st.global.f32 	[%rd3], %f27;
// k += 8; repeat until k == 256 (32 trips).
197 add.s32 	%r17, %r17, 8;
198 setp.ne.s32	%p4, %r17, 256;
199 @%p4 bra 	BB1_2;
// Common exit label: out-of-range threads arrive here directly.
201 BB1_3:
