#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.neonintrinsic.h"
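/*
 * NEON (intrinsics / AArch64 inline assembly) butterflies for the float32
 * real-to-complex (R2C) and complex-to-real (C2R) FFT. The first stage uses
 * radix-8 or radix-4 kernels without twiddles, the remaining stages use
 * radix-4 kernels with twiddles, and a dedicated last stage (R2C) / first
 * stage (C2R) handles the real-input symmetry.
 *
 * Illustrative usage sketch only, assuming the usual public NE10 r2c API
 * (ne10_fft_alloc_r2c_float32 / ne10_fft_r2c_1d_float32_neon /
 * ne10_fft_destroy_r2c_float32); callers do not invoke the kernels below
 * directly:
 *
 *   ne10_fft_r2c_cfg_float32_t cfg = ne10_fft_alloc_r2c_float32 (nfft);
 *   ne10_fft_r2c_1d_float32_neon (freq, time, cfg);   // forward, real -> complex
 *   ne10_fft_c2r_1d_float32_neon (time, freq, cfg);   // inverse, complex -> real
 *   ne10_fft_destroy_r2c_float32 (cfg);
 */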
NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8 (float32x4_t, q_in);
    NE10_DECLARE_8 (float32x4_t, q_out);

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        // Load 8 float32x4_t from Fin_neon into q_in0 ~ q_in7, stepping by fstride.
        NE10_RADIX8x4_R2C_NEON_LOAD (Fin_neon, q_in, fstride);

        // Radix-8 real-to-complex butterfly on four columns at once.
        NE10_RADIX8x4_R2C_NEON_KERNEL (q_out, q_in);

        // Store the results contiguously.
        NE10_RADIX8x4_R2C_NEON_STORE (Fout_neon, q_out, 1);

        Fin_neon = Fin_neon - fstride * 8 + 1;
    }
}
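/*
 * The C2R kernels below optionally scale their output when
 * NE10_DSP_RFFT_SCALING is defined: every output vector is multiplied by
 * one_by_N = 0.25 / nfft. A scalar sketch of that per-lane scaling
 * (illustrative only, not part of the kernel):
 *
 *   for (i = 0; i < 4; i++)
 *       out[i] *= 0.25f / nfft;
 */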
NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8 (float32x4_t, q_in);
    NE10_DECLARE_8 (float32x4_t, q_out);

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32 (one_by_N);

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count > 0; f_count--)
    {
        // Load 8 float32x4_t from Fin_neon into q_in0 ~ q_in7, stepping by 1.
        NE10_RADIX8x4_R2C_NEON_LOAD (Fin_neon, q_in, 1);

        // Radix-8 complex-to-real butterfly on four columns at once.
        NE10_RADIX8x4_C2R_NEON_KERNEL (q_out, q_in);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32 (q_out0, one_by_N_neon);
        q_out1 = vmulq_f32 (q_out1, one_by_N_neon);
        q_out2 = vmulq_f32 (q_out2, one_by_N_neon);
        q_out3 = vmulq_f32 (q_out3, one_by_N_neon);
        q_out4 = vmulq_f32 (q_out4, one_by_N_neon);
        q_out5 = vmulq_f32 (q_out5, one_by_N_neon);
        q_out6 = vmulq_f32 (q_out6, one_by_N_neon);
        q_out7 = vmulq_f32 (q_out7, one_by_N_neon);
#endif

        // Store the results, stepping by fstride.
        NE10_RADIX8x4_R2C_NEON_STORE (Fout_neon, q_out, fstride);
    }
}
NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    for (f_count = 0; f_count < fstride; f_count++)
    {
        NE10_DECLARE_4 (float32x4_t, q_in);
        NE10_DECLARE_4 (float32x4_t, q_out);

        // Load 4 float32x4_t from Fin_neon into q_in0 ~ q_in3, stepping by fstride.
        NE10_RADIX4x4_R2C_NEON_LOAD (Fin_neon, q_in, fstride);

        // Radix-4 real-to-complex butterfly on four columns at once.
        NE10_RADIX4x4_R2C_NEON_KERNEL (q_out, q_in);

        NE10_RADIX4x4_R2C_NEON_STORE (Fout_neon, q_out, 1);

        Fin_neon = Fin_neon - 4 * fstride + 1;
    }
}
NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32 (one_by_N);

    for (f_count = 0; f_count < fstride; f_count++)
    {
        NE10_DECLARE_4 (float32x4_t, q_in);
        NE10_DECLARE_4 (float32x4_t, q_out);

        // Load 4 float32x4_t from Fin_neon into q_in0 ~ q_in3, stepping by 1.
        NE10_RADIX4x4_R2C_NEON_LOAD (Fin_neon, q_in, 1);

        // Radix-4 complex-to-real butterfly on four columns at once.
        NE10_RADIX4x4_C2R_NEON_KERNEL (q_out, q_in);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32 (q_out0, one_by_N_neon);
        q_out1 = vmulq_f32 (q_out1, one_by_N_neon);
        q_out2 = vmulq_f32 (q_out2, one_by_N_neon);
        q_out3 = vmulq_f32 (q_out3, one_by_N_neon);
#endif

        NE10_RADIX4x4_R2C_NEON_STORE (Fout_neon, q_out, fstride);
    }
}
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4 (float32x4_t, q_in);
    NE10_DECLARE_4 (float32x4_t, q_out);

    // Load 4 float32x4_t from Fin_neon into q_in0 ~ q_in3, stepping by in_step.
    NE10_RADIX4x4_R2C_NEON_LOAD (Fin_neon, q_in, in_step);

    NE10_RADIX4x4_R2C_NEON_KERNEL (q_out, q_in);

    // Store to the interleaved positions used by the first butterfly.
    vst1q_f32 ((ne10_float32_t*) (Fout_neon                          ), q_out0);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon +     (out_step << 1) - 1), q_out1);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon +     (out_step << 1)    ), q_out2);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
}
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4 (float32x4_t, q_in);
    NE10_DECLARE_4 (float32x4_t, q_out);

    // Load from the interleaved positions written by the forward first butterfly.
    q_in0 = vld1q_f32 ((ne10_float32_t*) (Fin_neon                          ));
    q_in1 = vld1q_f32 ((ne10_float32_t*) (Fin_neon +     (out_step << 1) - 1));
    q_in2 = vld1q_f32 ((ne10_float32_t*) (Fin_neon +     (out_step << 1)    ));
    q_in3 = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1));

    NE10_RADIX4x4_C2R_NEON_KERNEL (q_out, q_in);

    NE10_RADIX4x4_R2C_NEON_STORE (Fout_neon, q_out, in_step);
}
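/*
 * The "other" butterflies below multiply three of the four loaded columns by
 * per-column twiddle factors as part of the radix-4 kernel. In scalar form,
 * the forward twiddle multiply performed by NE10_RADIX4x4_R2C_TW_MUL_NEON
 * (and by the fmul/fmls/fmla sequence in the inline-assembly path) is
 * essentially:
 *
 *   out.r = in.r * tw.r - in.i * tw.i;
 *   out.i = in.r * tw.i + in.i * tw.r;
 */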
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step >> 1) - 1;
    float32x4_t *Fout_b = Fout_neon + (((out_step << 1) - 1) << 1) - 2; // reversed (mirrored) half

    NE10_DECLARE_3 (float32x4x2_t, q2_tw);
    NE10_DECLARE_4 (float32x4x2_t, q2_in);
    NE10_DECLARE_4 (float32x4x2_t, q2_out);

    for (m_count = loop_count; m_count > 0; m_count--)
    {
#ifndef NE10_INLINE_ASM_OPT
        // Load the real/imaginary halves of the four input columns.
        q2_in0.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 0 * in_step    ));
        q2_in0.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 0 * in_step + 1));

        q2_in1.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 1 * in_step    ));
        q2_in1.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 1 * in_step + 1));

        q2_in2.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 2 * in_step    ));
        q2_in2.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 2 * in_step + 1));

        q2_in3.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 3 * in_step    ));
        q2_in3.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + 3 * in_step + 1));

        // Broadcast the three twiddle factors for this column.
        q2_tw0.val[0] = vdupq_n_f32 (twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32 (twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32 (twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32 (twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32 (twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32 (twiddles[2].i);

        // tw * in
        NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
        const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
        asm volatile (
            "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
            "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
            "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"

            "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t"
            "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t"
            "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t"
            "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t"

            "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t"
            "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t"
            "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t"
            "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t"

            "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t"
            "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t"
            "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t"
            "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t"
            : [q2_out0r] "+w" (q2_out0.val[0]),
              [q2_out0i] "+w" (q2_out0.val[1]),
              [q2_out1r] "+w" (q2_out1.val[0]),
              [q2_out1i] "+w" (q2_out1.val[1]),
              [q2_out2r] "+w" (q2_out2.val[0]),
              [q2_out2i] "+w" (q2_out2.val[1]),
              [q2_out3r] "+w" (q2_out3.val[0]),
              [q2_out3i] "+w" (q2_out3.val[1]),
              [ptr_inr] "+r" (ptr_inr),
              [ptr_ini] "+r" (ptr_ini)
            : [offset_in] "r" (in_step * 16),
              [ptr_tw] "r" (twiddles)
            : "memory", "v0", "v1", "v2",
              "v10", "v11", "v12", "v13", "v14", "v15"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);

        // Store the four outputs; columns 2 and 3 go to the mirrored half.
        vst1q_f32 ((ne10_float32_t*) (Fout_neon                      ), q2_out0.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon                   + 1), q2_out0.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q2_out1.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q2_out1.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_b                         ), q2_out2.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_b                      + 1), q2_out2.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_b - (out_step << 1)       ), q2_out3.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_b - (out_step << 1)    + 1), q2_out3.val[1]);
    }
}
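/*
 * Note on the two paths above: with NE10_INLINE_ASM_OPT defined, the loads
 * and the twiddle multiply are expressed as AArch64 inline assembly (building
 * for any other architecture with that option triggers the #error); otherwise
 * the equivalent NEON intrinsics are used. Both variants feed the same
 * NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1/_S2 macros.
 */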
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step >> 1) - 1;
    const float32x4_t *Fin_b = Fin_neon + (((out_step << 1) - 1) << 1) - 2; // reversed (mirrored) half

    NE10_DECLARE_3 (float32x4x2_t, q2_tw);
    NE10_DECLARE_4 (float32x4x2_t, q2_in);
    NE10_DECLARE_4 (float32x4x2_t, q2_out);

    for (m_count = loop_count; m_count > 0; m_count--)
    {
        // Load two columns from the front of the spectrum and two from the mirrored half.
        q2_in0.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon                      ));
        q2_in0.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon                   + 1));

        q2_in1.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + (out_step << 1)    ));
        q2_in1.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_neon + (out_step << 1) + 1));

        q2_in2.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_b                         ));
        q2_in2.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_b                      + 1));

        q2_in3.val[0] = vld1q_f32 ((ne10_float32_t*) (Fin_b - (out_step << 1)       ));
        q2_in3.val[1] = vld1q_f32 ((ne10_float32_t*) (Fin_b - (out_step << 1)    + 1));

        // Broadcast the three twiddle factors for this column.
        q2_tw0.val[0] = vdupq_n_f32 (twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32 (twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32 (twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32 (twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32 (twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32 (twiddles[2].i);

        // Inverse (C2R) radix-4 kernel with twiddles.
        NE10_RADIX4x4_C2R_TW_NEON_KERNEL (q2_out, q2_in, q2_tw);

        // Store the real/imaginary halves of the four output columns.
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 0 * in_step    ), q2_out0.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 0 * in_step + 1), q2_out0.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 1 * in_step    ), q2_out1.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 1 * in_step + 1), q2_out1.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 2 * in_step    ), q2_out2.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 2 * in_step + 1), q2_out2.val[1]);

        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 3 * in_step    ), q2_out3.val[0]);
        vst1q_f32 ((ne10_float32_t*) (Fout_neon + 3 * in_step + 1), q2_out3.val[1]);
    }
}
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4 (float32x4_t, q_in);
    NE10_DECLARE_4 (float32x4_t, q_out);

    // Load 4 float32x4_t from Fin_neon into q_in0 ~ q_in3, stepping by in_step.
    NE10_RADIX4x4_R2C_NEON_LOAD (Fin_neon, q_in, in_step);

    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST (q_out, q_in);

    vst1q_f32 ((ne10_float32_t*) (Fout_neon                      ), q_out0);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon                   + 1), q_out1);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon + (out_step << 1)    ), q_out2);
    vst1q_f32 ((ne10_float32_t*) (Fout_neon + (out_step << 1) + 1), q_out3);
}
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4 (float32x4_t, q_in);
    NE10_DECLARE_4 (float32x4_t, q_out);

    q_in0 = vld1q_f32 ((ne10_float32_t*) (Fin_neon                      ));
    q_in1 = vld1q_f32 ((ne10_float32_t*) (Fin_neon                   + 1));
    q_in2 = vld1q_f32 ((ne10_float32_t*) (Fin_neon + (out_step << 1)    ));
    q_in3 = vld1q_f32 ((ne10_float32_t*) (Fin_neon + (out_step << 1) + 1));

    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST (q_out, q_in);

    NE10_RADIX4x4_R2C_NEON_STORE (Fout_neon, q_out, in_step);
}
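/*
 * Stage drivers: for each of the fstride blocks, one stage processes
 * out_step (= mstride) columns as a "first" butterfly, (out_step / 2 - 1)
 * "other" butterflies that consume three twiddle factors per column, and a
 * "last" butterfly, before stepping on to the next block.
 */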
NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count; f_count--)
    {
        // first butterfly
        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        // other butterflies
        ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_neon, Fout_neon
        Fin_neon  += 2 * ((out_step >> 1) - 1);
        Fout_neon += 2 * ((out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fout_neon = Fout_neon + 3 * out_step;
    }
}
NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon  = (float32x4_t*) Fin;
    float32x4_t       *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count; f_count--)
    {
        // first butterfly
        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        // other butterflies
        ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_neon, Fout_neon
        Fin_neon  += 2 * ((out_step >> 1) - 1);
        Fout_neon += 2 * ((out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fin_neon = Fin_neon + 3 * out_step;
    }
}
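/*
 * The mixed-radix drivers below read their plan from the factors array
 * produced at plan time: factors[0] is the stage count, factors[1] the
 * initial fstride, factors[2 * stage_count - 1] the mstride of the first
 * stage and factors[2 * stage_count] its radix (8 or 4), so that
 * nfft == radix * fstride for that stage.
 */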
NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // init fstride, mstride, radix and nfft from the factor table
    stage_count = factors[0];
    fstride     = factors[1];
    mstride     = factors[ (stage_count << 1) - 1 ];
    radix       = factors[ stage_count << 1 ];
    nfft        = radix * fstride;

    // swap the working buffers so that the final stage writes to Fout
    if (stage_count % 2 == 1)
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // first stage: radix-8 or radix-4 butterflies without twiddles
    switch (radix)
    {
        case 8:
            ne10_radix8x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix4x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
            break;
    }

    // remaining stages: radix-4 butterflies with twiddles
    ne10_swap_ptr (buffer, Fout);

    ne10_radix4x4_r2c_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
    twiddles += 3 * mstride;
}
NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t *factors,
        const ne10_fft_cpx_float32_t *twiddles,
        ne10_fft_cpx_float32_t *buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // init fstride, mstride, radix and nfft from the factor table
    stage_count = factors[0];
    fstride     = factors[1];

    mstride = factors[ (stage_count << 1) - 1 ];
    radix   = factors[ stage_count << 1 ];
    nfft    = radix * fstride;

    // swap the working buffers so that the final stage writes to Fout
    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr (Fout, buffer);
    }

    // undo the twiddle stages in reverse order
    for (; stage_count > 1; stage_count--)
    {
        twiddles -= 3 * mstride;

        ne10_radix4x4_c2r_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);

        ne10_swap_ptr (buffer, Fout);
    }

    // last C2R stage: radix-8 or radix-4 butterflies without twiddles
    switch (radix)
    {
        case 8:
            ne10_radix8x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
            break;
        case 4:
            ne10_radix4x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
            break;
    }
}
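/*
 * The scalar and NEON "last stage" (R2C) / "first stage" (C2R) routines below
 * exploit the symmetry of a real-input transform, X[nfft - k] == conj(X[k]),
 * so that only nfft/2 + 1 unique bins need to be produced: outputs for the
 * mirrored bins are written through a second, reversed cursor (Fout_b / fin_b).
 */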
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // RCR butterfly (real-only bins)
    {
        ne10_float32_t q_4r_out[4];
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;

        NE10_FFT_R2C_4R_RCR (q_4r_out, p_src_r);

        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[3];
        dst[0].r = q_4r_out[1];
        dst[0].i = q_4r_out[2];
    }

    // CC butterfly
    {
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) (src);

        ne10_float32_t q_4r_out[4];

        NE10_FFT_R2C_4R_CC (q_4r_out, p_src_r);

        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[1];
        dst[0].r = q_4r_out[2];
        dst[0].i = q_4r_out[3];
    }

    // CC butterfly with twiddles
    {
        ne10_fft_cpx_float32_t cc_out[4];
        ne10_fft_cpx_float32_t cc_in[4];
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;

        cc_out[0].r = *(p_src_r++);
        cc_out[1].r = *(p_src_r++);
        cc_out[2].r = *(p_src_r++);
        cc_out[3].r = *(p_src_r++);

        cc_out[0].i = *(p_src_r++);
        cc_out[1].i = *(p_src_r++);
        cc_out[2].i = *(p_src_r++);
        cc_out[3].i = *(p_src_r++);

        NE10_PRINT_Q2_VECTOR (cc_out);

        // twiddle multiply: cc_in = tw * cc_out
        cc_in[0] = cc_out[0];

        NE10_CPX_MUL_F32 (cc_in[1], cc_out[1], twiddles[0]);
        NE10_CPX_MUL_F32 (cc_in[2], cc_out[2], twiddles[0]);
        NE10_CPX_MUL_F32 (cc_in[3], cc_out[3], twiddles[0]);

        NE10_FFT_R2C_CC_CC (cc_out, cc_in);
    }
}
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // RCR butterfly (real-only bins)
    {
        ne10_float32_t q_4r_in[4];
        ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;

        q_4r_in[0] = src[0].r;
        q_4r_in[3] = src[0].i;
        q_4r_in[1] = src[0].r;
        q_4r_in[2] = src[0].i;

        NE10_FFT_C2R_RCR_4R (p_dst_r, q_4r_in);
    }

    // CC butterfly
    {
        ne10_float32_t *p_dst_r = (ne10_float32_t*) (dst);

        ne10_float32_t q_4r_in[4];

        q_4r_in[0] = src[0].r;
        q_4r_in[1] = src[0].i;
        q_4r_in[2] = src[0].r;
        q_4r_in[3] = src[0].i;

        NE10_FFT_C2R_CC_4R (p_dst_r, q_4r_in);
    }

    // CC butterfly with twiddles
    {
        ne10_fft_cpx_float32_t cc_in[4];
        ne10_fft_cpx_float32_t cc_out[4];
        ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;

        NE10_FFT_C2R_CC_CC (cc_in, cc_out);

        // multiply by the conjugated twiddles
        cc_out[0] = cc_in[0];

        NE10_CPX_CONJ_MUL_F32 (cc_out[1], cc_in[1], twiddles[0]);
        NE10_CPX_CONJ_MUL_F32 (cc_out[2], cc_in[2], twiddles[0]);
        NE10_CPX_CONJ_MUL_F32 (cc_out[3], cc_in[3], twiddles[0]);

        *(p_dst_r++) = cc_out[0].r;
        *(p_dst_r++) = cc_out[1].r;
        *(p_dst_r++) = cc_out[2].r;
        *(p_dst_r++) = cc_out[3].r;

        *(p_dst_r++) = cc_out[0].i;
        *(p_dst_r++) = cc_out[1].i;
        *(p_dst_r++) = cc_out[2].i;
        *(p_dst_r++) = cc_out[3].i;
    }
}
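/*
 * Scalar sketch of the conjugate twiddle multiply used in the inverse path,
 * assuming NE10_CPX_CONJ_MUL_F32(d, a, b) computes d = a * conj(b):
 *
 *   d.r = a.r * b.r + a.i * b.i;
 *   d.i = a.i * b.r - a.r * b.i;
 */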
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 12;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    // load the input values
    q_in0[0] = *(fin_r++);
    q_in0[1] = *(fin_r++);
    q_in0[2] = *(fin_r++);
    q_in0[3] = *(fin_r++);
    q_in1[0] = *(fin_r++);
    q_in1[1] = *(fin_r++);
    q_in1[2] = *(fin_r++);
    q_in1[3] = *(fin_r++);
    q_in2[0] = *(fin_r++);
    q_in2[1] = *(fin_r++);
    q_in2[2] = *(fin_r++);
    q_in2[3] = *(fin_r++);
    q_in3[0] = *(fin_r++);
    q_in3[1] = *(fin_r++);
    q_in3[2] = *(fin_r++);
    q_in3[3] = *(fin_r++);

    // load the twiddle factors: [0] holds real parts, [1] imaginary parts
    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0 + 8];
    q2_tw1[0][1] = tw[2 + 8];
    q2_tw1[0][2] = tw[4 + 8];
    q2_tw1[0][3] = tw[6 + 8];
    q2_tw1[1][0] = tw[1 + 8];
    q2_tw1[1][1] = tw[3 + 8];
    q2_tw1[1][2] = tw[5 + 8];
    q2_tw1[1][3] = tw[7 + 8];

    // twiddle multiply; lane 0 of each vector passes through untouched
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
    q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];

    q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
    q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];

    q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
    q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];

    q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
    q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];

    q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
    q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];

    q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
    q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];

    // radix-4 butterflies, first stage
    q_in0[0] = q_out0[0] + q_out0[2];
    q_in1[0] = q_out1[0] + q_out1[2];

    q_in0[1] = q_out0[0] - q_out0[2];
    q_in1[1] = q_out1[0] - q_out1[2];

    q_in0[2] = q_out0[1] + q_out0[3];
    q_in1[2] = q_out1[1] + q_out1[3];

    q_in0[3] = q_out0[1] - q_out0[3];
    q_in1[3] = q_out1[1] - q_out1[3];

    q_in2[0] = q_out2[0] + q_out2[2];
    q_in3[0] = q_out3[0] + q_out3[2];

    q_in2[1] = q_out2[0] - q_out2[2];
    q_in3[1] = q_out3[0] - q_out3[2];

    q_in2[2] = q_out2[1] + q_out2[3];
    q_in3[2] = q_out3[1] + q_out3[3];

    q_in2[3] = q_out2[1] - q_out2[3];
    q_in3[3] = q_out3[1] - q_out3[3];

    // radix-4 butterflies, second stage
    q_out0[0] = q_in0[0] + q_in0[2];
    q_out0[1] = q_in1[0] + q_in1[2];

    q_out2[2] =  q_in0[0] - q_in0[2];
    q_out2[3] = -q_in1[0] + q_in1[2];

    q_out3[2] =  q_in0[1] - q_in1[3];
    q_out3[3] = -q_in1[1] - q_in0[3];

    q_out1[0] = q_in0[1] + q_in1[3];
    q_out1[1] = q_in1[1] - q_in0[3];

    q_out0[2] = q_in2[0] + q_in2[2];
    q_out0[3] = q_in3[0] + q_in3[2];

    q_out2[0] =  q_in2[0] - q_in2[2];
    q_out2[1] = -q_in3[0] + q_in3[2];

    q_out3[0] =  q_in2[1] - q_in3[3];
    q_out3[1] = -q_in3[1] - q_in2[3];

    q_out1[2] = q_in2[1] + q_in3[3];
    q_out1[3] = q_in3[1] - q_in2[3];

    // store, the front and mirrored halves being nfft/2 floats apart
    fout_r[0] = q_out0[0];
    fout_r[1] = q_out0[1];
    fout_r[2] = q_out0[2];
    fout_r[3] = q_out0[3];

    fout_r += (nfft >> 1);
    fout_r[0] = q_out1[0];
    fout_r[1] = q_out1[1];
    fout_r[2] = q_out1[2];
    fout_r[3] = q_out1[3];

    fout_r[0] = q_out3[0];
    fout_r[1] = q_out3[1];
    fout_r[2] = q_out3[2];
    fout_r[3] = q_out3[3];

    fout_r += (nfft >> 1);
    fout_r[0] = q_out2[0];
    fout_r[1] = q_out2[1];
    fout_r[2] = q_out2[2];
    fout_r[3] = q_out2[3];
}
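/*
 * The C2R "second butterfly" below mirrors the R2C one above: it gathers the
 * corresponding groups of bins, runs the butterflies in reverse order, then
 * applies the conjugated twiddles (note the swapped +/- signs compared with
 * the forward multiply), negating part of the result as it is written back.
 */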
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 12;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    // load the input values
    q_in0[0] = fin_r[0];
    q_in0[1] = fin_r[1];
    q_in0[2] = fin_r[2];
    q_in0[3] = fin_r[3];

    q_in1[0] = fin_r[0];
    q_in1[1] = fin_r[1];
    q_in1[2] = fin_r[2];
    q_in1[3] = fin_r[3];

    q_in3[0] = fin_r[0];
    q_in3[1] = fin_r[1];
    q_in3[2] = fin_r[2];
    q_in3[3] = fin_r[3];

    q_in2[0] = fin_r[0];
    q_in2[1] = fin_r[1];
    q_in2[2] = fin_r[2];
    q_in2[3] = fin_r[3];

    // inverse butterflies, first pass
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {    \
    q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] );   \
    q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] );   \
} while (0);

    NE10_INV_BUTTERFLY_TMP ( 0,0, 0,2,
    NE10_INV_BUTTERFLY_TMP ( 1,2, 1,0,
    NE10_INV_BUTTERFLY_TMP ( 0,1, 1,3,
    NE10_INV_BUTTERFLY_TMP ( 1,1, 0,3,
    NE10_INV_BUTTERFLY_TMP ( 2,0, 2,2,
    NE10_INV_BUTTERFLY_TMP ( 3,2, 3,0,
    NE10_INV_BUTTERFLY_TMP ( 2,1, 3,3,
    NE10_INV_BUTTERFLY_TMP ( 3,1, 2,3,
#undef NE10_INV_BUTTERFLY_TMP

    // inverse butterflies, second pass
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do {     \
    q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] );   \
    q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] );   \
} while (0);

    NE10_INV_BUTTERFLY_TMP (0,0, 0,2,
    NE10_INV_BUTTERFLY_TMP (1,0, 1,2,
    NE10_INV_BUTTERFLY_TMP (0,1, 0,3,
    NE10_INV_BUTTERFLY_TMP (1,1, 1,3,
    NE10_INV_BUTTERFLY_TMP (2,0, 2,2,
    NE10_INV_BUTTERFLY_TMP (3,0, 3,2,
    NE10_INV_BUTTERFLY_TMP (2,1, 2,3,
    NE10_INV_BUTTERFLY_TMP (3,1, 3,3,
#undef NE10_INV_BUTTERFLY_TMP

    // load the twiddle factors: [0] holds real parts, [1] imaginary parts
    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0 + 8];
    q2_tw1[0][1] = tw[2 + 8];
    q2_tw1[0][2] = tw[4 + 8];
    q2_tw1[0][3] = tw[6 + 8];
    q2_tw1[1][0] = tw[1 + 8];
    q2_tw1[1][1] = tw[3 + 8];
    q2_tw1[1][2] = tw[5 + 8];
    q2_tw1[1][3] = tw[7 + 8];

    // conjugate twiddle multiply; lane 0 of each vector passes through untouched
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
    q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];

    q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
    q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];

    q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
    q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];

    q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
    q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];

    q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
    q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];

    q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
    q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];

    // store
    *(fout_r++) =  q_out0[0];
    *(fout_r++) =  q_out0[1];
    *(fout_r++) =  q_out0[2];
    *(fout_r++) =  q_out0[3];
    *(fout_r++) =  q_out1[0];
    *(fout_r++) = -q_out1[1];
    *(fout_r++) = -q_out1[2];
    *(fout_r++) = -q_out1[3];
    *(fout_r++) =  q_out2[0];
    *(fout_r++) =  q_out2[1];
    *(fout_r++) =  q_out2[2];
    *(fout_r++) =  q_out2[3];
    *(fout_r++) =  q_out3[0];
    *(fout_r++) = -q_out3[1];
    *(fout_r++) = -q_out3[2];
    *(fout_r++) = -q_out3[3];
}
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = ((const ne10_float32_t*) src) + 12 + 16;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 8;
    ne10_float32_t *fout_b = (ne10_float32_t*) dst - 14; // reversed (mirrored) half
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;

    // Each iteration handles eight bins from the front and eight from the mirrored half.
    ne10_int32_t loop_count = ((nfft >> 2) - 8) >> 3;

    for (; loop_count > 0; loop_count--)
    {
        NE10_DECLARE_4 (float32x4x2_t, q2_in);
        NE10_DECLARE_3 (float32x4x2_t, q2_tw);
        NE10_DECLARE_4 (float32x4x2_t, q2_out);

#ifndef NE10_INLINE_ASM_OPT
        // Load 8 x float32x4_t and transpose them into four (re, im) column pairs.
        q2_out0.val[0] = vld1q_f32 (fin_r);
        q2_out0.val[1] = vld1q_f32 (fin_r);
        q2_out1.val[0] = vld1q_f32 (fin_r);
        q2_out1.val[1] = vld1q_f32 (fin_r);
        q2_out2.val[0] = vld1q_f32 (fin_r);
        q2_out2.val[1] = vld1q_f32 (fin_r);
        q2_out3.val[0] = vld1q_f32 (fin_r);
        q2_out3.val[1] = vld1q_f32 (fin_r);

        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in, q2_out);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "ld1 {v0.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v4.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v1.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v5.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v2.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v6.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v3.4s}, [%[fin_r]], 16 \n\t"
            "ld1 {v7.4s}, [%[fin_r]], 16 \n\t"

            "trn1 v8.4s, v0.4s, v1.4s \n\t"
            "trn2 v9.4s, v0.4s, v1.4s \n\t"
            "trn1 v10.4s, v2.4s, v3.4s \n\t"
            "trn2 v11.4s, v2.4s, v3.4s \n\t"

            "trn1 %[q2_in0r].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1r].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2r].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3r].2d, v9.2d, v11.2d \n\t"

            "trn1 v8.4s, v4.4s, v5.4s \n\t"
            "trn2 v9.4s, v4.4s, v5.4s \n\t"
            "trn1 v10.4s, v6.4s, v7.4s \n\t"
            "trn2 v11.4s, v6.4s, v7.4s \n\t"

            "trn1 %[q2_in0i].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1i].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2i].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3i].2d, v9.2d, v11.2d \n\t"
            : [q2_in0r] "+w" (q2_in0.val[0]),
              [q2_in0i] "+w" (q2_in0.val[1]),
              [q2_in1r] "+w" (q2_in1.val[0]),
              [q2_in1i] "+w" (q2_in1.val[1]),
              [q2_in2r] "+w" (q2_in2.val[0]),
              [q2_in2i] "+w" (q2_in2.val[1]),
              [q2_in3r] "+w" (q2_in3.val[0]),
              [q2_in3i] "+w" (q2_in3.val[1]),
              [fin_r] "+r" (fin_r)
            :
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
              "v8", "v9", "v10", "v11"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
#ifndef NE10_INLINE_ASM_OPT
        // Load the three twiddle vectors and multiply columns 1..3 by them.
        q2_tw0 = vld2q_f32 (tw);
        q2_tw1 = vld2q_f32 (tw);
        q2_tw2 = vld2q_f32 (tw);

        NE10_CPX_MUL_NEON_F32 (q2_out1, q2_in1, q2_tw0);
        NE10_CPX_MUL_NEON_F32 (q2_out2, q2_in2, q2_tw1);
        NE10_CPX_MUL_NEON_F32 (q2_out3, q2_in3, q2_tw2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "ld2 {v0.4s, v1.4s}, [%[tw0]] \n\t"
            "ld2 {v2.4s, v3.4s}, [%[tw1]] \n\t"
            "ld2 {v4.4s, v5.4s}, [%[tw2]] \n\t"

            "fmul %[q2_out1r].4s, v0.4s, %[q2_in1r].4s \n\t"
            "fmul %[q2_out1i].4s, v0.4s, %[q2_in1i].4s \n\t"
            "fmls %[q2_out1r].4s, v1.4s, %[q2_in1i].4s \n\t"
            "fmla %[q2_out1i].4s, v1.4s, %[q2_in1r].4s \n\t"

            "fmul %[q2_out2r].4s, v2.4s, %[q2_in2r].4s \n\t"
            "fmul %[q2_out2i].4s, v2.4s, %[q2_in2i].4s \n\t"
            "fmls %[q2_out2r].4s, v3.4s, %[q2_in2i].4s \n\t"
            "fmla %[q2_out2i].4s, v3.4s, %[q2_in2r].4s \n\t"

            "fmul %[q2_out3r].4s, v4.4s, %[q2_in3r].4s \n\t"
            "fmul %[q2_out3i].4s, v4.4s, %[q2_in3i].4s \n\t"
            "fmls %[q2_out3r].4s, v5.4s, %[q2_in3i].4s \n\t"
            "fmla %[q2_out3i].4s, v5.4s, %[q2_in3r].4s \n\t"
            : [q2_out1r] "+w" (q2_out1.val[0]),
              [q2_out1i] "+w" (q2_out1.val[1]),
              [q2_out2r] "+w" (q2_out2.val[0]),
              [q2_out2i] "+w" (q2_out2.val[1]),
              [q2_out3r] "+w" (q2_out3.val[0]),
              [q2_out3i] "+w" (q2_out3.val[1])
            : [q2_in1r] "w" (q2_in1.val[0]),
              [q2_in1i] "w" (q2_in1.val[1]),
              [q2_in2r] "w" (q2_in2.val[0]),
              [q2_in2i] "w" (q2_in2.val[1]),
              [q2_in3r] "w" (q2_in3.val[0]),
              [q2_in3i] "w" (q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
        // radix-4 butterflies, first stage
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
        q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);

        // second stage; outputs for the mirrored half are conjugated
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);

        q2_out3.val[1] = vnegq_f32 (q2_out3.val[1]);
        q2_out2.val[1] = vnegq_f32 (q2_out2.val[1]);

#ifndef NE10_INLINE_ASM_OPT
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);

        // reverse the mirrored-half vectors before storing them backwards
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[1]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[1]);

        vst2q_f32 (fout_r,               q2_out0);
        vst2q_f32 (fout_r + (nfft >> 1), q2_out1);
        vst2q_f32 (fout_b + (nfft >> 1), q2_out3);
        vst2q_f32 (fout_b + nfft,        q2_out2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "fadd v0.4s, %[q2_in0r].4s, %[q2_in2r].4s \n\t"
            "fadd v1.4s, %[q2_in0i].4s, %[q2_in2i].4s \n\t"
            "fadd v2.4s, %[q2_in1r].4s, %[q2_in3i].4s \n\t"
            "fsub v3.4s, %[q2_in1i].4s, %[q2_in3r].4s \n\t"

            "rev64 %[q2_in2r].4s, %[q2_out2r].4s \n\t"
            "rev64 %[q2_in2i].4s, %[q2_out2i].4s \n\t"
            "rev64 %[q2_in3r].4s, %[q2_out3r].4s \n\t"
            "rev64 %[q2_in3i].4s, %[q2_out3i].4s \n\t"
            "ext v4.16b, %[q2_in2r].16b, %[q2_in2r].16b, #8 \n\t"
            "ext v5.16b, %[q2_in2i].16b, %[q2_in2i].16b, #8 \n\t"
            "ext v6.16b, %[q2_in3r].16b, %[q2_in3r].16b, #8 \n\t"
            "ext v7.16b, %[q2_in3i].16b, %[q2_in3i].16b, #8 \n\t"

            "st2 {v0.4s, v1.4s}, [%[fout0]] \n\t"
            "st2 {v2.4s, v3.4s}, [%[fout1]] \n\t"
            "st2 {v4.4s, v5.4s}, [%[fout2]] \n\t"
            "st2 {v6.4s, v7.4s}, [%[fout3]] \n\t"
            :
            : [fout0] "r" (fout_r),
              [fout1] "r" (fout_r + (nfft >> 1)),
              [fout2] "r" (fout_b + nfft),
              [fout3] "r" (fout_b + (nfft >> 1)),
              [q2_out2r] "w" (q2_out2.val[0]),
              [q2_out2i] "w" (q2_out2.val[1]),
              [q2_out3r] "w" (q2_out3.val[0]),
              [q2_out3i] "w" (q2_out3.val[1]),
              [q2_in0r] "w" (q2_in0.val[0]),
              [q2_in0i] "w" (q2_in0.val[1]),
              [q2_in1r] "w" (q2_in1.val[0]),
              [q2_in1i] "w" (q2_in1.val[1]),
              [q2_in2r] "w" (q2_in2.val[0]),
              [q2_in2i] "w" (q2_in2.val[1]),
              [q2_in3r] "w" (q2_in3.val[0]),
              [q2_in3i] "w" (q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT
    }
}
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_float32_t *fout_r = ((ne10_float32_t*) dst) + 12 + 16;
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 8;
    const ne10_float32_t *fin_b = (const ne10_float32_t*) src - 14; // reversed (mirrored) half
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;
    ne10_int32_t loop_count = ((nfft >> 2) - 8) >> 3;

    for (; loop_count > 0; loop_count--)
    {
        NE10_DECLARE_4 (float32x4x2_t, q2_in);
        NE10_DECLARE_3 (float32x4x2_t, q2_tw);
        NE10_DECLARE_4 (float32x4x2_t, q2_out);

        // Load two column pairs from the front of the spectrum and two from the mirrored half.
        q2_in0 = vld2q_f32 (fin_r);
        q2_in1 = vld2q_f32 (fin_r + (nfft >> 1));

        q2_in3 = vld2q_f32 (fin_b + (nfft >> 1));
        q2_in2 = vld2q_f32 (fin_b + nfft);

        q2_tw0 = vld2q_f32 (tw);
        q2_tw1 = vld2q_f32 (tw);
        q2_tw2 = vld2q_f32 (tw);

        // Reverse and conjugate the mirrored inputs.
        NE10_REVERSE_FLOAT32X4 (q2_in3.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_in3.val[1]);
        NE10_REVERSE_FLOAT32X4 (q2_in2.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_in2.val[1]);

        q2_in2.val[1] = vnegq_f32 (q2_in2.val[1]);
        q2_in3.val[1] = vnegq_f32 (q2_in3.val[1]);

        // radix-4 butterflies, first stage
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);

        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);

        q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);

        // second stage
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
        q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);

        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
        q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);

        q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
        q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);

        q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
        q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);

        // Conjugate (inverse) twiddle multiply on columns 1..3.
        NE10_CPX_MUL_INV_NEON_F32 (q2_out1, q2_in1, q2_tw0);
        NE10_CPX_MUL_INV_NEON_F32 (q2_out2, q2_in2, q2_tw1);
        NE10_CPX_MUL_INV_NEON_F32 (q2_out3, q2_in3, q2_tw2);

        // Transpose back to row order and store.
        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in, q2_out);

        vst1q_f32 (fout_r, q2_in0.val[0]);
        vst1q_f32 (fout_r, q2_in0.val[1]);
        vst1q_f32 (fout_r, q2_in1.val[0]);
        vst1q_f32 (fout_r, q2_in1.val[1]);
        vst1q_f32 (fout_r, q2_in2.val[0]);
        vst1q_f32 (fout_r, q2_in2.val[1]);
        vst1q_f32 (fout_r, q2_in3.val[0]);
        vst1q_f32 (fout_r, q2_in3.val[1]);
    }
}
NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (dst, src, twiddles, nfft);

    ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (dst, src, twiddles, nfft);

    ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (dst, src, twiddles, nfft);
}
NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (dst, src, twiddles, nfft);

    ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (dst, src, twiddles, nfft);

    ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (dst, src, twiddles, nfft);
}
void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
        ne10_float32_t *fin,
        ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t *tmpbuf = cfg->buffer;
    ne10_float32_t *fout_r = (ne10_float32_t*) fout;

    switch (cfg->nfft)
    {
        case 8:
            ne10_radix8_r2c_c ((CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
            fout[0].r = fout[0].i;
            break;
        default:
            ne10_mixed_radix_r2c_butterfly_float32_neon (fout, (CPLX*) fin, cfg->r_factors_neon, cfg->r_twiddles_neon, tmpbuf);
            ne10_radix4_r2c_with_twiddles_last_stage (fout, tmpbuf, cfg->r_super_twiddles_neon, cfg->nfft);
            fout[cfg->nfft / 2].r = fout[0].i;
            break;
    }
    fout[0].i = fout[cfg->nfft / 2].i = 0.0f;
}
void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
        ne10_fft_cpx_float32_t *fin,
        ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t *tmpbuf = cfg->buffer;
    CPLX *fout_c;
    ne10_int32_t stage_count;
    ne10_int32_t radix;

    switch (cfg->nfft)
    {
        case 8:
            fin[0].i = fin[0].r;
            ne10_radix8_c2r_c ((CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
            fin[0].r = fin[0].i;
            break;
        default:
            stage_count = cfg->r_factors_neon[0];
            radix = cfg->r_factors_neon[ stage_count << 1 ];

            fin[0].i = fin[cfg->nfft >> 1].r;
            fout_c = (stage_count % 2 == 1) ? tmpbuf : (CPLX*) fout;
            ne10_radix4_c2r_with_twiddles_first_stage ((CPLX*) fout_c, fin, cfg->r_super_twiddles_neon, cfg->nfft);
            ne10_mixed_radix_c2r_butterfly_float32_neon ((CPLX*) fout, (CPLX*) NULL, cfg->r_factors_neon, cfg->r_twiddles_neon_backward, tmpbuf);
            break;
    }
}
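/*
 * Packing convention visible in the entry points above: the DC and Nyquist
 * bins of a real-input spectrum are purely real, so the forward transform
 * ends with fout[0].i = fout[nfft/2].i = 0.0f (the Nyquist real part having
 * been recovered from fout[0].i), and the inverse entry point re-packs it
 * with fin[0].i = fin[nfft/2].r before running the C2R stages.
 */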