Skip to content

Commit 099616a

Browse files
committed
Optimization of pmix functions for ARM-32
1 parent 9b4cdf1 commit 099616a

File tree

10 files changed

+289
-8
lines changed

10 files changed

+289
-8
lines changed

include/private/dsp/arch/arm/neon-d32/pmath/pmix.h

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,283 @@ namespace lsp
3030
{
3131
namespace neon_d32
3232
{
33+
/**
 * Mix dst towards src in place, element by element:
 *   dst[i] = dst[i] + (src[i] - dst[i]) * k[i]
 *
 * The work is split into a 16-float loop, an optional 8-float block,
 * an optional 4-float block and a one-float tail loop.
 *
 * @param dst buffer that is read and then overwritten with the result
 * @param src second input buffer
 * @param k per-element mixing coefficients
 * @param count number of float elements to process
 */
void pmix_v1(float *dst, const float *src, const float *k, size_t count)
34+
{
35+
ARCH_ARM_ASM
36+
(
37+
// x16 blocks: 16 floats (4 q-registers per operand) per iteration
38+
__ASM_EMIT("subs %[count], #16")
39+
__ASM_EMIT("blo 2f") // fewer than 16 elements: skip the x16 loop
40+
__ASM_EMIT("1:")
41+
__ASM_EMIT("vldm %[dst], {q0-q3}") // q0 = d (no writeback: dst is advanced by the store below)
42+
__ASM_EMIT("vldm %[src]!, {q4-q7}") // q4 = s
43+
__ASM_EMIT("vldm %[k]!, {q8-q11}") // q8 = k
44+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
45+
__ASM_EMIT("vsub.f32 q5, q5, q1")
46+
__ASM_EMIT("vsub.f32 q6, q6, q2")
47+
__ASM_EMIT("vsub.f32 q7, q7, q3")
48+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
49+
__ASM_EMIT("vmla.f32 q1, q5, q9")
50+
__ASM_EMIT("vmla.f32 q2, q6, q10")
51+
__ASM_EMIT("vmla.f32 q3, q7, q11")
52+
__ASM_EMIT("subs %[count], #16") // flags from this subs drive the bhs below
53+
__ASM_EMIT("vstm %[dst]!, {q0-q3}")
54+
__ASM_EMIT("bhs 1b")
55+
// x8 block: executed at most once
56+
__ASM_EMIT("2:")
57+
__ASM_EMIT("adds %[count], #8") // count is (remaining - 16) here; now (remaining - 8)
58+
__ASM_EMIT("blt 4f") // negative: fewer than 8 elements remain
59+
__ASM_EMIT("vldm %[dst], {q0-q1}") // q0 = d
60+
__ASM_EMIT("vldm %[src]!, {q4-q5}") // q4 = s
61+
__ASM_EMIT("vldm %[k]!, {q8-q9}") // q8 = k
62+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
63+
__ASM_EMIT("vsub.f32 q5, q5, q1")
64+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
65+
__ASM_EMIT("vmla.f32 q1, q5, q9")
66+
__ASM_EMIT("sub %[count], #8") // account for the 8 elements just processed
67+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
68+
// x4 block: executed at most once
69+
__ASM_EMIT("4:")
70+
__ASM_EMIT("adds %[count], #4") // count is (remaining - 8) here; now (remaining - 4)
71+
__ASM_EMIT("blt 6f") // negative: fewer than 4 elements remain
72+
__ASM_EMIT("vldm %[dst], {q0}") // q0 = d
73+
__ASM_EMIT("vldm %[src]!, {q4}") // q4 = s
74+
__ASM_EMIT("vldm %[k]!, {q8}") // q8 = k
75+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
76+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
77+
__ASM_EMIT("sub %[count], #4") // account for the 4 elements just processed
78+
__ASM_EMIT("vstm %[dst]!, {q0}")
79+
// x1 blocks: one float per iteration for the remaining 0..3 elements
80+
__ASM_EMIT("6:")
81+
__ASM_EMIT("adds %[count], #3") // count is (remaining - 4) here; now (remaining - 1)
82+
__ASM_EMIT("blt 8f") // negative: nothing left
83+
__ASM_EMIT("7:")
84+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[dst]]") // q0 = d (single float replicated to all lanes)
85+
__ASM_EMIT("vld1.32 {d8[], d9[]}, [%[src]]!") // q4 = s
86+
__ASM_EMIT("vld1.32 {d16[], d17[]}, [%[k]]!") // q8 = k
87+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
88+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
89+
__ASM_EMIT("subs %[count], #1")
90+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!") // store lane 0 only and advance dst by one float
91+
__ASM_EMIT("bge 7b")
92+
// end
93+
__ASM_EMIT("8:")
3394

95+
// All pointers and the counter are modified by the asm, hence "+r"
: [dst] "+r" (dst), [src] "+r" (src), [k] "+r" (k),
96+
[count] "+r" (count)
97+
:
98+
: "cc", "memory",
99+
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
100+
"q8", "q9", "q10", "q11"
101+
);
102+
}
103+
104+
/**
 * Mix two sources into a separate destination, element by element:
 *   dst[i] = src1[i] + (src2[i] - src1[i]) * k[i]
 *
 * The work is split into a 16-float loop, an optional 8-float block,
 * an optional 4-float block and a one-float tail loop.
 *
 * @param dst output buffer (only written by this routine)
 * @param src1 first input buffer
 * @param src2 second input buffer
 * @param k per-element mixing coefficients
 * @param count number of float elements to process
 */
void pmix_v2(float *dst, const float *src1, const float *src2, const float *k, size_t count)
105+
{
106+
ARCH_ARM_ASM
107+
(
108+
// x16 blocks: 16 floats (4 q-registers per operand) per iteration
109+
__ASM_EMIT("subs %[count], #16")
110+
__ASM_EMIT("blo 2f") // fewer than 16 elements: skip the x16 loop
111+
__ASM_EMIT("1:")
112+
__ASM_EMIT("vldm %[src1]!, {q0-q3}") // q0 = s1
113+
__ASM_EMIT("vldm %[src2]!, {q4-q7}") // q4 = s2
114+
__ASM_EMIT("vldm %[k]!, {q8-q11}") // q8 = k
115+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s2 - s1
116+
__ASM_EMIT("vsub.f32 q5, q5, q1")
117+
__ASM_EMIT("vsub.f32 q6, q6, q2")
118+
__ASM_EMIT("vsub.f32 q7, q7, q3")
119+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = s1 + (s2-s1)*k
120+
__ASM_EMIT("vmla.f32 q1, q5, q9")
121+
__ASM_EMIT("vmla.f32 q2, q6, q10")
122+
__ASM_EMIT("vmla.f32 q3, q7, q11")
123+
__ASM_EMIT("subs %[count], #16") // flags from this subs drive the bhs below
124+
__ASM_EMIT("vstm %[dst]!, {q0-q3}")
125+
__ASM_EMIT("bhs 1b")
126+
// x8 block: executed at most once
127+
__ASM_EMIT("2:")
128+
__ASM_EMIT("adds %[count], #8") // count is (remaining - 16) here; now (remaining - 8)
129+
__ASM_EMIT("blt 4f") // negative: fewer than 8 elements remain
130+
__ASM_EMIT("vldm %[src1]!, {q0-q1}") // q0 = s1
131+
__ASM_EMIT("vldm %[src2]!, {q4-q5}") // q4 = s2
132+
__ASM_EMIT("vldm %[k]!, {q8-q9}") // q8 = k
133+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s2 - s1
134+
__ASM_EMIT("vsub.f32 q5, q5, q1")
135+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = s1 + (s2-s1)*k
136+
__ASM_EMIT("vmla.f32 q1, q5, q9")
137+
__ASM_EMIT("sub %[count], #8") // account for the 8 elements just processed
138+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
139+
// x4 block: executed at most once
140+
__ASM_EMIT("4:")
141+
__ASM_EMIT("adds %[count], #4") // count is (remaining - 8) here; now (remaining - 4)
142+
__ASM_EMIT("blt 6f") // negative: fewer than 4 elements remain
143+
__ASM_EMIT("vldm %[src1]!, {q0}") // q0 = s1
144+
__ASM_EMIT("vldm %[src2]!, {q4}") // q4 = s2
145+
__ASM_EMIT("vldm %[k]!, {q8}") // q8 = k
146+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s2 - s1
147+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = s1 + (s2-s1)*k
148+
__ASM_EMIT("sub %[count], #4") // account for the 4 elements just processed
149+
__ASM_EMIT("vstm %[dst]!, {q0}")
150+
// x1 blocks: one float per iteration for the remaining 0..3 elements
151+
__ASM_EMIT("6:")
152+
__ASM_EMIT("adds %[count], #3") // count is (remaining - 4) here; now (remaining - 1)
153+
__ASM_EMIT("blt 8f") // negative: nothing left
154+
__ASM_EMIT("7:")
155+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[src1]]!") // q0 = s1 (single float replicated to all lanes)
156+
__ASM_EMIT("vld1.32 {d8[], d9[]}, [%[src2]]!") // q4 = s2
157+
__ASM_EMIT("vld1.32 {d16[], d17[]}, [%[k]]!") // q8 = k
158+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s2 - s1
159+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = s1 + (s2-s1)*k
160+
__ASM_EMIT("subs %[count], #1")
161+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!") // store lane 0 only and advance dst by one float
162+
__ASM_EMIT("bge 7b")
163+
// end
164+
__ASM_EMIT("8:")
165+
166+
// All pointers and the counter are modified by the asm, hence "+r"
: [dst] "+r" (dst), [src1] "+r" (src1), [src2] "+r" (src2), [k] "+r" (k),
167+
[count] "+r" (count)
168+
:
169+
: "cc", "memory",
170+
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
171+
"q8", "q9", "q10", "q11"
172+
);
173+
}
174+
175+
/**
 * Mix dst towards src in place with one constant coefficient:
 *   dst[i] = dst[i] + (src[i] - dst[i]) * k
 *
 * The work is split into a 16-float loop, an optional 8-float block,
 * an optional 4-float block and a one-float tail loop.
 *
 * @param dst buffer that is read and then overwritten with the result
 * @param src second input buffer
 * @param k mixing coefficient applied to every element
 * @param count number of float elements to process
 */
void pmix_k1(float *dst, const float *src, float k, size_t count)
176+
{
177+
ARCH_ARM_ASM
178+
(
179+
// x16 blocks: 16 floats (4 q-registers per operand) per iteration
180+
__ASM_EMIT("subs %[count], #16")
181+
__ASM_EMIT("vld1.32 {d16[], d17[]}, [%[k]]") // q8 = k (read through its address, replicated to all lanes)
182+
__ASM_EMIT("blo 2f") // branches on the flags of the subs above; fewer than 16 elements: skip the x16 loop
183+
__ASM_EMIT("1:")
184+
__ASM_EMIT("vldm %[dst], {q0-q3}") // q0 = d (no writeback: dst is advanced by the store below)
185+
__ASM_EMIT("vldm %[src]!, {q4-q7}") // q4 = s
186+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
187+
__ASM_EMIT("vsub.f32 q5, q5, q1")
188+
__ASM_EMIT("vsub.f32 q6, q6, q2")
189+
__ASM_EMIT("vsub.f32 q7, q7, q3")
190+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
191+
__ASM_EMIT("vmla.f32 q1, q5, q8")
192+
__ASM_EMIT("vmla.f32 q2, q6, q8")
193+
__ASM_EMIT("vmla.f32 q3, q7, q8")
194+
__ASM_EMIT("subs %[count], #16") // flags from this subs drive the bhs below
195+
__ASM_EMIT("vstm %[dst]!, {q0-q3}")
196+
__ASM_EMIT("bhs 1b")
197+
// x8 block: executed at most once
198+
__ASM_EMIT("2:")
199+
__ASM_EMIT("adds %[count], #8") // count is (remaining - 16) here; now (remaining - 8)
200+
__ASM_EMIT("blt 4f") // negative: fewer than 8 elements remain
201+
__ASM_EMIT("vldm %[dst], {q0-q1}") // q0 = d
202+
__ASM_EMIT("vldm %[src]!, {q4-q5}") // q4 = s
203+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
204+
__ASM_EMIT("vsub.f32 q5, q5, q1")
205+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
206+
__ASM_EMIT("vmla.f32 q1, q5, q8")
207+
__ASM_EMIT("sub %[count], #8") // account for the 8 elements just processed
208+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
209+
// x4 block: executed at most once
210+
__ASM_EMIT("4:")
211+
__ASM_EMIT("adds %[count], #4") // count is (remaining - 8) here; now (remaining - 4)
212+
__ASM_EMIT("blt 6f") // negative: fewer than 4 elements remain
213+
__ASM_EMIT("vldm %[dst], {q0}") // q0 = d
214+
__ASM_EMIT("vldm %[src]!, {q4}") // q4 = s
215+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
216+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
217+
__ASM_EMIT("sub %[count], #4") // account for the 4 elements just processed
218+
__ASM_EMIT("vstm %[dst]!, {q0}")
219+
// x1 blocks: one float per iteration for the remaining 0..3 elements
220+
__ASM_EMIT("6:")
221+
__ASM_EMIT("adds %[count], #3") // count is (remaining - 4) here; now (remaining - 1)
222+
__ASM_EMIT("blt 8f") // negative: nothing left
223+
__ASM_EMIT("7:")
224+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[dst]]") // q0 = d (single float replicated to all lanes)
225+
__ASM_EMIT("vld1.32 {d8[], d9[]}, [%[src]]!") // q4 = s
226+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
227+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
228+
__ASM_EMIT("subs %[count], #1")
229+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!") // store lane 0 only and advance dst by one float
230+
__ASM_EMIT("bge 7b")
231+
// end
232+
__ASM_EMIT("8:")
233+
234+
// dst, src and count are modified by the asm, hence "+r"
: [dst] "+r" (dst), [src] "+r" (src),
235+
[count] "+r" (count)
236+
// k is handed over as a pointer so that vld1 can load it from memory
: [k] "r" (&k)
237+
: "cc", "memory",
238+
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
239+
"q8"
240+
);
241+
}
242+
243+
void pmix_k2(float *dst, const float *src1, const float *src2, float k, size_t count)
244+
{
245+
ARCH_ARM_ASM
246+
(
247+
// x16 blocks
248+
__ASM_EMIT("subs %[count], #16")
249+
__ASM_EMIT("vld1.32 {d16[], d17[]}, [%[k]]") // q8 = k
250+
__ASM_EMIT("blo 2f")
251+
__ASM_EMIT("1:")
252+
__ASM_EMIT("vldm %[src1]!, {q0-q3}") // q0 = d
253+
__ASM_EMIT("vldm %[src2]!, {q4-q7}") // q4 = s
254+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
255+
__ASM_EMIT("vsub.f32 q5, q5, q1")
256+
__ASM_EMIT("vsub.f32 q6, q6, q2")
257+
__ASM_EMIT("vsub.f32 q7, q7, q3")
258+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
259+
__ASM_EMIT("vmla.f32 q1, q5, q8")
260+
__ASM_EMIT("vmla.f32 q2, q6, q8")
261+
__ASM_EMIT("vmla.f32 q3, q7, q8")
262+
__ASM_EMIT("subs %[count], #16")
263+
__ASM_EMIT("vstm %[dst]!, {q0-q3}")
264+
__ASM_EMIT("bhs 1b")
265+
// x8 block
266+
__ASM_EMIT("2:")
267+
__ASM_EMIT("adds %[count], #8")
268+
__ASM_EMIT("blt 4f")
269+
__ASM_EMIT("vldm %[src1]!, {q0-q1}") // q0 = d
270+
__ASM_EMIT("vldm %[src2]!, {q4-q5}") // q4 = s
271+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
272+
__ASM_EMIT("vsub.f32 q5, q5, q1")
273+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
274+
__ASM_EMIT("vmla.f32 q1, q5, q8")
275+
__ASM_EMIT("sub %[count], #8")
276+
__ASM_EMIT("vstm %[dst]!, {q0-q1}")
277+
// x4 blocks
278+
__ASM_EMIT("4:")
279+
__ASM_EMIT("adds %[count], #4")
280+
__ASM_EMIT("blt 6f")
281+
__ASM_EMIT("vldm %[src1]!, {q0}") // q0 = d
282+
__ASM_EMIT("vldm %[src2]!, {q4}") // q4 = s
283+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
284+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
285+
__ASM_EMIT("sub %[count], #4")
286+
__ASM_EMIT("vstm %[dst]!, {q0}")
287+
// x1 blocks
288+
__ASM_EMIT("6:")
289+
__ASM_EMIT("adds %[count], #3")
290+
__ASM_EMIT("blt 8f")
291+
__ASM_EMIT("7:")
292+
__ASM_EMIT("vld1.32 {d0[], d1[]}, [%[src1]]!") // q0 = d
293+
__ASM_EMIT("vld1.32 {d8[], d9[]}, [%[src2]]!") // q4 = s
294+
__ASM_EMIT("vsub.f32 q4, q4, q0") // q4 = s - d
295+
__ASM_EMIT("vmla.f32 q0, q4, q8") // q0 = d + (s-d)*k
296+
__ASM_EMIT("subs %[count], #1")
297+
__ASM_EMIT("vst1.32 {d0[0]}, [%[dst]]!")
298+
__ASM_EMIT("bge 7b")
299+
// end
300+
__ASM_EMIT("8:")
301+
302+
: [dst] "+r" (dst), [src] "+r" (src),
303+
[count] "+r" (count)
304+
: [k] "r" (&k)
305+
: "cc", "memory",
306+
"q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
307+
"q8"
308+
);
309+
}
34310
} /* namespace neon_d32 */
35311
} /* namespace lsp */
36312

src/main/arm/neon-d32.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,6 +441,11 @@
441441
EXPORT1(clamp_vv2);
442442
EXPORT1(clamp_kk1);
443443
EXPORT1(clamp_kk2);
444+
445+
EXPORT1(pmix_v1);
446+
EXPORT1(pmix_v2);
447+
EXPORT1(pmix_k1);
448+
EXPORT1(pmix_k2);
444449
}
445450
} /* namespace neon_d32 */
446451
} /* namespace lsp */

src/test/ptest/pmath/pmix_k1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ PTEST_BEGIN("dsp.pmath", pmix_k1, 5, 1000)
113113
IF_ARCH_X86(CALL(avx::pmix_k1));
114114
IF_ARCH_X86(CALL(avx::pmix_k1_fma3));
115115
IF_ARCH_X86(CALL(avx512::pmix_k1));
116-
// IF_ARCH_ARM(CALL(neon_d32::pmix_k1));
116+
IF_ARCH_ARM(CALL(neon_d32::pmix_k1));
117117
// IF_ARCH_AARCH64(CALL(asimd::pmix_k1));
118118
PTEST_SEPARATOR;
119119
}

src/test/ptest/pmath/pmix_k2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ PTEST_BEGIN("dsp.pmath", pmix_k2, 5, 1000)
113113
IF_ARCH_X86(CALL(avx::pmix_k2));
114114
IF_ARCH_X86(CALL(avx::pmix_k2_fma3));
115115
IF_ARCH_X86(CALL(avx512::pmix_k2));
116-
// IF_ARCH_ARM(CALL(neon_d32::pmix_k2));
116+
IF_ARCH_ARM(CALL(neon_d32::pmix_k2));
117117
// IF_ARCH_AARCH64(CALL(asimd::pmix_k2));
118118
PTEST_SEPARATOR;
119119
}

src/test/ptest/pmath/pmix_v1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ PTEST_BEGIN("dsp.pmath", pmix_v1, 5, 1000)
117117
IF_ARCH_X86(CALL(avx::pmix_v1));
118118
IF_ARCH_X86(CALL(avx::pmix_v1_fma3));
119119
IF_ARCH_X86(CALL(avx512::pmix_v1));
120-
// IF_ARCH_ARM(CALL(neon_d32::pmix_v1));
120+
IF_ARCH_ARM(CALL(neon_d32::pmix_v1));
121121
// IF_ARCH_AARCH64(CALL(asimd::pmix_v1));
122122
PTEST_SEPARATOR;
123123
}

src/test/ptest/pmath/pmix_v2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ PTEST_BEGIN("dsp.pmath", pmix_v2, 5, 1000)
115115
IF_ARCH_X86(CALL(avx::pmix_v2));
116116
IF_ARCH_X86(CALL(avx::pmix_v2_fma3));
117117
IF_ARCH_X86(CALL(avx512::pmix_v2));
118-
// IF_ARCH_ARM(CALL(neon_d32::pmix_v2));
118+
IF_ARCH_ARM(CALL(neon_d32::pmix_v2));
119119
// IF_ARCH_AARCH64(CALL(asimd::pmix_v2));
120120
PTEST_SEPARATOR;
121121
}

src/test/utest/pmath/pmix_k1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ UTEST_BEGIN("dsp.pmath", pmix_k1)
123123
IF_ARCH_X86(CALL(generic::pmix_k1, avx::pmix_k1, 32));
124124
IF_ARCH_X86(CALL(generic::pmix_k1, avx::pmix_k1_fma3, 32));
125125
IF_ARCH_X86(CALL(generic::pmix_k1, avx512::pmix_k1, 64));
126-
// IF_ARCH_ARM(CALL(generic::pmix_k1, neon_d32::pmix_k1, 16));
126+
IF_ARCH_ARM(CALL(generic::pmix_k1, neon_d32::pmix_k1, 16));
127127
// IF_ARCH_AARCH64(CALL(generic::pmix_k1, asimd::pmix_k1, 16));
128128
}
129129
UTEST_END

src/test/utest/pmath/pmix_k2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ UTEST_BEGIN("dsp.pmath", pmix_k2)
123123
IF_ARCH_X86(CALL(generic::pmix_k2, avx::pmix_k2, 32));
124124
IF_ARCH_X86(CALL(generic::pmix_k2, avx::pmix_k2_fma3, 32));
125125
IF_ARCH_X86(CALL(generic::pmix_k2, avx512::pmix_k2, 64));
126-
// IF_ARCH_ARM(CALL(generic::pmix_k2, neon_d32::pmix_k2, 16));
126+
IF_ARCH_ARM(CALL(generic::pmix_k2, neon_d32::pmix_k2, 16));
127127
// IF_ARCH_AARCH64(CALL(generic::pmix_k2, asimd::pmix_k2, 16));
128128
}
129129
UTEST_END

src/test/utest/pmath/pmix_v1.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ UTEST_BEGIN("dsp.pmath", pmix_v1)
126126
IF_ARCH_X86(CALL(generic::pmix_v1, avx::pmix_v1, 32));
127127
IF_ARCH_X86(CALL(generic::pmix_v1, avx::pmix_v1_fma3, 32));
128128
IF_ARCH_X86(CALL(generic::pmix_v1, avx512::pmix_v1, 64));
129-
// IF_ARCH_ARM(CALL(generic::pmix_v1, neon_d32::pmix_v1, 16));
129+
IF_ARCH_ARM(CALL(generic::pmix_v1, neon_d32::pmix_v1, 16));
130130
// IF_ARCH_AARCH64(CALL(generic::pmix_v1, asimd::pmix_v1, 16));
131131
}
132132
UTEST_END

src/test/utest/pmath/pmix_v2.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ UTEST_BEGIN("dsp.pmath", pmix_v2)
126126
IF_ARCH_X86(CALL(generic::pmix_v2, avx::pmix_v2, 32));
127127
IF_ARCH_X86(CALL(generic::pmix_v2, avx::pmix_v2_fma3, 32));
128128
IF_ARCH_X86(CALL(generic::pmix_v2, avx512::pmix_v2, 64));
129-
// IF_ARCH_ARM(CALL(generic::pmix_v2, neon_d32::pmix_v2, 16));
129+
IF_ARCH_ARM(CALL(generic::pmix_v2, neon_d32::pmix_v2, 16));
130130
// IF_ARCH_AARCH64(CALL(generic::pmix_v2, asimd::pmix_v2, 16));
131131
}
132132
UTEST_END

0 commit comments

Comments
 (0)