@@ -30,7 +30,283 @@ namespace lsp
 {
     namespace neon_d32
     {
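+        /* Compute dst[i] = dst[i] + (src[i] - dst[i]) * k[i] */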
+        void pmix_v1(float *dst, const float *src, const float *k, size_t count)
+        {
+            ARCH_ARM_ASM
+            (
+                // x16 blocks
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("blo         2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vldm        %[dst], {q0-q3}")           // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4-q7}")          // q4 = s
+                __ASM_EMIT("vldm        %[k]!, {q8-q11}")           // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vsub.f32    q6, q6, q2")
+                __ASM_EMIT("vsub.f32    q7, q7, q3")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q9")
+                __ASM_EMIT("vmla.f32    q2, q6, q10")
+                __ASM_EMIT("vmla.f32    q3, q7, q11")
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q3}")
+                __ASM_EMIT("bhs         1b")
+                // x8 block
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], #8")
+                __ASM_EMIT("blt         4f")
+                __ASM_EMIT("vldm        %[dst], {q0-q1}")           // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4-q5}")          // q4 = s
+                __ASM_EMIT("vldm        %[k]!, {q8-q9}")            // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q9")
+                __ASM_EMIT("sub         %[count], #8")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q1}")
+                // x4 block
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], #4")
+                __ASM_EMIT("blt         6f")
+                __ASM_EMIT("vldm        %[dst], {q0}")              // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4}")             // q4 = s
+                __ASM_EMIT("vldm        %[k]!, {q8}")               // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("sub         %[count], #4")
+                __ASM_EMIT("vstm        %[dst]!, {q0}")
+                // x1 blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], #3")
+                __ASM_EMIT("blt         8f")
+                __ASM_EMIT("7:")
+                __ASM_EMIT("vld1.32     {d0[], d1[]}, [%[dst]]")    // q0 = d
+                __ASM_EMIT("vld1.32     {d8[], d9[]}, [%[src]]!")   // q4 = s
+                __ASM_EMIT("vld1.32     {d16[], d17[]}, [%[k]]!")   // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("subs        %[count], #1")
+                __ASM_EMIT("vst1.32     {d0[0]}, [%[dst]]!")
+                __ASM_EMIT("bge         7b")
+                // end
+                __ASM_EMIT("8:")
 
+                : [dst] "+r" (dst), [src] "+r" (src), [k] "+r" (k),
+                  [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11"
+            );
+        }
+
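+        /* Compute dst[i] = src1[i] + (src2[i] - src1[i]) * k[i] */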
+        void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
+        {
+            ARCH_ARM_ASM
+            (
+                // x16 blocks
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("blo         2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vldm        %[src1]!, {q0-q3}")         // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4-q7}")         // q4 = s2
+                __ASM_EMIT("vldm        %[k]!, {q8-q11}")           // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vsub.f32    q6, q6, q2")
+                __ASM_EMIT("vsub.f32    q7, q7, q3")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q9")
+                __ASM_EMIT("vmla.f32    q2, q6, q10")
+                __ASM_EMIT("vmla.f32    q3, q7, q11")
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q3}")
+                __ASM_EMIT("bhs         1b")
+                // x8 block
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], #8")
+                __ASM_EMIT("blt         4f")
+                __ASM_EMIT("vldm        %[src1]!, {q0-q1}")         // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4-q5}")         // q4 = s2
+                __ASM_EMIT("vldm        %[k]!, {q8-q9}")            // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q9")
+                __ASM_EMIT("sub         %[count], #8")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q1}")
+                // x4 block
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], #4")
+                __ASM_EMIT("blt         6f")
+                __ASM_EMIT("vldm        %[src1]!, {q0}")            // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4}")            // q4 = s2
+                __ASM_EMIT("vldm        %[k]!, {q8}")               // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("sub         %[count], #4")
+                __ASM_EMIT("vstm        %[dst]!, {q0}")
+                // x1 blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], #3")
+                __ASM_EMIT("blt         8f")
+                __ASM_EMIT("7:")
+                __ASM_EMIT("vld1.32     {d0[], d1[]}, [%[src1]]!")  // q0 = s1
+                __ASM_EMIT("vld1.32     {d8[], d9[]}, [%[src2]]!")  // q4 = s2
+                __ASM_EMIT("vld1.32     {d16[], d17[]}, [%[k]]!")   // q8 = k
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("subs        %[count], #1")
+                __ASM_EMIT("vst1.32     {d0[0]}, [%[dst]]!")
+                __ASM_EMIT("bge         7b")
+                // end
+                __ASM_EMIT("8:")
+
+                : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2), [k] "+r" (k),
+                  [count] "+r" (count)
+                :
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8", "q9", "q10", "q11"
+            );
+        }
+
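+        /* Compute dst[i] = dst[i] + (src[i] - dst[i]) * k */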
+        void pmix_k1(float *dst, const float *src, float k, size_t count)
+        {
+            ARCH_ARM_ASM
+            (
+                // x16 blocks
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vld1.32     {d16[], d17[]}, [%[k]]")    // q8 = k
+                __ASM_EMIT("blo         2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vldm        %[dst], {q0-q3}")           // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4-q7}")          // q4 = s
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vsub.f32    q6, q6, q2")
+                __ASM_EMIT("vsub.f32    q7, q7, q3")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q8")
+                __ASM_EMIT("vmla.f32    q2, q6, q8")
+                __ASM_EMIT("vmla.f32    q3, q7, q8")
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q3}")
+                __ASM_EMIT("bhs         1b")
+                // x8 block
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], #8")
+                __ASM_EMIT("blt         4f")
+                __ASM_EMIT("vldm        %[dst], {q0-q1}")           // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4-q5}")          // q4 = s
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q8")
+                __ASM_EMIT("sub         %[count], #8")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q1}")
+                // x4 block
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], #4")
+                __ASM_EMIT("blt         6f")
+                __ASM_EMIT("vldm        %[dst], {q0}")              // q0 = d
+                __ASM_EMIT("vldm        %[src]!, {q4}")             // q4 = s
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("sub         %[count], #4")
+                __ASM_EMIT("vstm        %[dst]!, {q0}")
+                // x1 blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], #3")
+                __ASM_EMIT("blt         8f")
+                __ASM_EMIT("7:")
+                __ASM_EMIT("vld1.32     {d0[], d1[]}, [%[dst]]")    // q0 = d
+                __ASM_EMIT("vld1.32     {d8[], d9[]}, [%[src]]!")   // q4 = s
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s - d
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = d + (s-d)*k
+                __ASM_EMIT("subs        %[count], #1")
+                __ASM_EMIT("vst1.32     {d0[0]}, [%[dst]]!")
+                __ASM_EMIT("bge         7b")
+                // end
+                __ASM_EMIT("8:")
+
+                : [dst] "+r" (dst), [src] "+r" (src),
+                  [count] "+r" (count)
+                : [k] "r" (&k)
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8"
+            );
+        }
+
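+        /* Compute dst[i] = src1[i] + (src2[i] - src1[i]) * k */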
+        void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
+        {
+            ARCH_ARM_ASM
+            (
+                // x16 blocks
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vld1.32     {d16[], d17[]}, [%[k]]")    // q8 = k
+                __ASM_EMIT("blo         2f")
+                __ASM_EMIT("1:")
+                __ASM_EMIT("vldm        %[src1]!, {q0-q3}")         // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4-q7}")         // q4 = s2
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vsub.f32    q6, q6, q2")
+                __ASM_EMIT("vsub.f32    q7, q7, q3")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q8")
+                __ASM_EMIT("vmla.f32    q2, q6, q8")
+                __ASM_EMIT("vmla.f32    q3, q7, q8")
+                __ASM_EMIT("subs        %[count], #16")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q3}")
+                __ASM_EMIT("bhs         1b")
+                // x8 block
+                __ASM_EMIT("2:")
+                __ASM_EMIT("adds        %[count], #8")
+                __ASM_EMIT("blt         4f")
+                __ASM_EMIT("vldm        %[src1]!, {q0-q1}")         // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4-q5}")         // q4 = s2
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vsub.f32    q5, q5, q1")
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("vmla.f32    q1, q5, q8")
+                __ASM_EMIT("sub         %[count], #8")
+                __ASM_EMIT("vstm        %[dst]!, {q0-q1}")
+                // x4 block
+                __ASM_EMIT("4:")
+                __ASM_EMIT("adds        %[count], #4")
+                __ASM_EMIT("blt         6f")
+                __ASM_EMIT("vldm        %[src1]!, {q0}")            // q0 = s1
+                __ASM_EMIT("vldm        %[src2]!, {q4}")            // q4 = s2
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("sub         %[count], #4")
+                __ASM_EMIT("vstm        %[dst]!, {q0}")
+                // x1 blocks
+                __ASM_EMIT("6:")
+                __ASM_EMIT("adds        %[count], #3")
+                __ASM_EMIT("blt         8f")
+                __ASM_EMIT("7:")
+                __ASM_EMIT("vld1.32     {d0[], d1[]}, [%[src1]]!")  // q0 = s1
+                __ASM_EMIT("vld1.32     {d8[], d9[]}, [%[src2]]!")  // q4 = s2
+                __ASM_EMIT("vsub.f32    q4, q4, q0")                // q4 = s2 - s1
+                __ASM_EMIT("vmla.f32    q0, q4, q8")                // q0 = s1 + (s2-s1)*k
+                __ASM_EMIT("subs        %[count], #1")
+                __ASM_EMIT("vst1.32     {d0[0]}, [%[dst]]!")
+                __ASM_EMIT("bge         7b")
+                // end
+                __ASM_EMIT("8:")
+
+                : [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2),
+                  [count] "+r" (count)
+                : [k] "r" (&k)
+                : "cc", "memory",
+                  "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+                  "q8"
+            );
+        }
     } /* namespace neon_d32 */
 } /* namespace lsp */
 
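For reference, here is a scalar sketch of what the four NEON routines in this commit compute, inferred from the register comments above. The `generic` namespace is hypothetical and used only for illustration, and results may differ from the NEON path in the last bit of rounding, since `vmla.f32` rounds the product before the addition:

    #include <cstddef>

    namespace generic
    {
        // dst[i] = dst[i] + (src[i] - dst[i]) * k[i]: per-sample crossfade of src into dst
        void pmix_v1(float *dst, const float *src, const float *k, size_t count)
        {
            for (size_t i = 0; i < count; ++i)
                dst[i] += (src[i] - dst[i]) * k[i];
        }

        // dst[i] = src1[i] + (src2[i] - src1[i]) * k[i]: crossfade of two sources into dst
        void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
        {
            for (size_t i = 0; i < count; ++i)
                dst[i] = src1[i] + (src2[i] - src1[i]) * k[i];
        }

        // Same as pmix_v1, but with a single scalar coefficient k
        void pmix_k1(float *dst, const float *src, float k, size_t count)
        {
            for (size_t i = 0; i < count; ++i)
                dst[i] += (src[i] - dst[i]) * k;
        }

        // Same as pmix_v2, but with a single scalar coefficient k
        void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
        {
            for (size_t i = 0; i < count; ++i)
                dst[i] = src1[i] + (src2[i] - src1[i]) * k;
        }
    } /* namespace generic */

In all four variants, k = 0 leaves the first operand unchanged and k = 1 replaces it entirely with the second, so the per-sample k buffers of pmix_v1/pmix_v2 allow sample-accurate automation of a crossfade.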