#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
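/*
 * Q15 fixed-point FFT kernels on ne10_fft_cpx_int16_t samples: twiddle
 * products are widened to NE10_F2I16_SAMPPROD and shifted back down by
 * NE10_F2I16_SHIFT. The length-4 kernels compute one radix-4 butterfly;
 * in the forward direction, with xk = Fin[k]:
 *
 *   Fout[0] = (x0 + x2) + (x1 + x3)
 *   Fout[2] = (x0 + x2) - (x1 + x3)
 *   Fout[1] = (x0 - x2) - j (x1 - x3)
 *   Fout[3] = (x0 - x2) + j (x1 - x3)
 *
 * The function signatures were elided from this listing; they are
 * reconstructed from the dispatch code at the bottom of the file
 * (ne10_fft_c2c_1d_int16_neon and the r2c/c2r wrappers).
 */
static void ne10_fft4_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{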
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;
    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
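}

/* Backward (inverse) length-4 kernel: the same butterfly with the
 * -j/+j factors on the odd outputs swapped (conjugated twiddles). */
static void ne10_fft4_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{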
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = Fin[0].r - Fin[2].r;
    s2_i = Fin[0].i - Fin[2].i;

    tmp_r = Fin[0].r + Fin[2].r;
    tmp_i = Fin[0].i + Fin[2].i;

    s0_r = Fin[1].r + Fin[3].r;
    s0_i = Fin[1].i + Fin[3].i;

    s1_r = Fin[1].r - Fin[3].r;
    s1_i = Fin[1].i - Fin[3].i;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
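}

/* Scaled variants pre-shift the inputs right by 2 (a divide by the
 * transform length 4), so the result is the DFT scaled by 1/4 and the
 * 16-bit additions cannot overflow. */
static void ne10_fft4_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{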
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r + s1_i;
    Fout[1].i = s2_i - s1_r;
    Fout[3].r = s2_r - s1_i;
    Fout[3].i = s2_i + s1_r;
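}

/* Scaled backward length-4 kernel: 1/4 pre-scaling with the conjugated
 * twiddle signs of the inverse transform. */
static void ne10_fft4_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{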
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i;
    ne10_int16_t tmp_r, tmp_i;

    s2_r = (Fin[0].r - Fin[2].r) >> 2;
    s2_i = (Fin[0].i - Fin[2].i) >> 2;
    tmp_r = (Fin[0].r + Fin[2].r) >> 2;
    tmp_i = (Fin[0].i + Fin[2].i) >> 2;

    s0_r = (Fin[1].r + Fin[3].r) >> 2;
    s0_i = (Fin[1].i + Fin[3].i) >> 2;
    s1_r = (Fin[1].r - Fin[3].r) >> 2;
    s1_i = (Fin[1].i - Fin[3].i) >> 2;

    Fout[2].r = tmp_r - s0_r;
    Fout[2].i = tmp_i - s0_i;
    Fout[0].r = tmp_r + s0_r;
    Fout[0].i = tmp_i + s0_i;

    Fout[1].r = s2_r - s1_i;
    Fout[1].i = s2_i + s1_r;
    Fout[3].r = s2_r + s1_i;
    Fout[3].i = s2_i - s1_r;
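}

/*
 * Length-8 kernels. TW_81 = 23169 is cos(pi/4) in Q15 (0.70711 * 2^15),
 * the twiddle magnitude that first appears at N = 8. Each kernel forms
 * pairwise sums (even-frequency half) and differences (odd-frequency
 * half) of the inputs and finishes each half with a radix-4 butterfly.
 * The t0..t3 intermediate assignments were missing from this listing;
 * they are reconstructed below from the surrounding output expressions.
 */
static void ne10_fft8_forward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{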
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    /* even half: radix-4 butterfly over the pairwise sums */
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    /* t4 = e^(-j*pi/4) * s3, t5 = -e^(-j*3*pi/4) * s7 */
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    /* odd half: radix-4 butterfly over the rotated differences */
    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
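}

/* Backward length-8 kernel: conjugated twiddles, so every j factor and
 * the pi/4 rotations change sign relative to the forward kernel. */
static void ne10_fft8_backward_int16_unscaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{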
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = Fin[0].r + Fin[4].r;
    s0_i = Fin[0].i + Fin[4].i;
    s1_r = Fin[0].r - Fin[4].r;
    s1_i = Fin[0].i - Fin[4].i;
    s2_r = Fin[1].r + Fin[5].r;
    s2_i = Fin[1].i + Fin[5].i;
    s3_r = Fin[1].r - Fin[5].r;
    s3_i = Fin[1].i - Fin[5].i;
    s4_r = Fin[2].r + Fin[6].r;
    s4_i = Fin[2].i + Fin[6].i;
    s5_r = Fin[2].r - Fin[6].r;
    s5_i = Fin[2].i - Fin[6].i;
    s6_r = Fin[3].r + Fin[7].r;
    s6_i = Fin[3].i + Fin[7].i;
    s7_r = Fin[3].r - Fin[7].r;
    s7_i = Fin[3].i - Fin[7].i;

    /* even half: radix-4 butterfly over the pairwise sums */
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    /* t4 = e^(j*pi/4) * s3, t5 = -e^(j*3*pi/4) * s7 */
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    /* odd half: radix-4 butterfly over the rotated differences */
    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
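}

/* Scaled length-8 kernels: inputs are pre-shifted right by 3 (a divide
 * by the transform length 8), so every stage stays in int16 range. */
static void ne10_fft8_forward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{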
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    /* even half: radix-4 butterfly over the pairwise sums */
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r + t3_i;
    Fout[2].i = t0_i - t3_r;
    Fout[6].r = t0_r - t3_i;
    Fout[6].i = t0_i + t3_r;

    /* t4 = e^(-j*pi/4) * s3, t5 = -e^(-j*3*pi/4) * s7 */
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    /* odd half: radix-4 butterfly over the rotated differences */
    t0_r = s1_r - s5_i;
    t0_i = s1_i + s5_r;
    t1_r = s1_r + s5_i;
    t1_i = s1_i - s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r + t3_i;
    Fout[3].i = t0_i - t3_r;
    Fout[7].r = t0_r - t3_i;
    Fout[7].i = t0_i + t3_r;
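}

/* Scaled backward length-8 kernel: 1/8 pre-scaling, conjugated twiddles. */
static void ne10_fft8_backward_int16_scaled (ne10_fft_cpx_int16_t * Fout,
        ne10_fft_cpx_int16_t * Fin)
{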
    ne10_int16_t s0_r, s0_i, s1_r, s1_i, s2_r, s2_i, s3_r, s3_i, s4_r, s4_i, s5_r, s5_i, s6_r, s6_i, s7_r, s7_i;
    ne10_int16_t t0_r, t0_i, t1_r, t1_i, t2_r, t2_i, t3_r, t3_i, t4_r, t4_i, t5_r, t5_i;
    const ne10_int16_t TW_81 = 23169;

    s0_r = (Fin[0].r + Fin[4].r) >> 3;
    s0_i = (Fin[0].i + Fin[4].i) >> 3;
    s1_r = (Fin[0].r - Fin[4].r) >> 3;
    s1_i = (Fin[0].i - Fin[4].i) >> 3;
    s2_r = (Fin[1].r + Fin[5].r) >> 3;
    s2_i = (Fin[1].i + Fin[5].i) >> 3;
    s3_r = (Fin[1].r - Fin[5].r) >> 3;
    s3_i = (Fin[1].i - Fin[5].i) >> 3;
    s4_r = (Fin[2].r + Fin[6].r) >> 3;
    s4_i = (Fin[2].i + Fin[6].i) >> 3;
    s5_r = (Fin[2].r - Fin[6].r) >> 3;
    s5_i = (Fin[2].i - Fin[6].i) >> 3;
    s6_r = (Fin[3].r + Fin[7].r) >> 3;
    s6_i = (Fin[3].i + Fin[7].i) >> 3;
    s7_r = (Fin[3].r - Fin[7].r) >> 3;
    s7_i = (Fin[3].i - Fin[7].i) >> 3;

    /* even half: radix-4 butterfly over the pairwise sums */
    t0_r = s0_r - s4_r;
    t0_i = s0_i - s4_i;
    t1_r = s0_r + s4_r;
    t1_i = s0_i + s4_i;
    t2_r = s2_r + s6_r;
    t2_i = s2_i + s6_i;
    t3_r = s2_r - s6_r;
    t3_i = s2_i - s6_i;
    Fout[0].r = t1_r + t2_r;
    Fout[0].i = t1_i + t2_i;
    Fout[4].r = t1_r - t2_r;
    Fout[4].i = t1_i - t2_i;
    Fout[2].r = t0_r - t3_i;
    Fout[2].i = t0_i + t3_r;
    Fout[6].r = t0_r + t3_i;
    Fout[6].i = t0_i - t3_r;

    /* t4 = e^(j*pi/4) * s3, t5 = -e^(j*3*pi/4) * s7 */
    t4_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r - s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t4_i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s3_r + s3_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r + s7_i) * TW_81) >> NE10_F2I16_SHIFT);
    t5_i = - (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) (s7_r - s7_i) * TW_81) >> NE10_F2I16_SHIFT);

    /* odd half: radix-4 butterfly over the rotated differences */
    t0_r = s1_r + s5_i;
    t0_i = s1_i - s5_r;
    t1_r = s1_r - s5_i;
    t1_i = s1_i + s5_r;
    t2_r = t4_r - t5_r;
    t2_i = t4_i - t5_i;
    t3_r = t4_r + t5_r;
    t3_i = t4_i + t5_i;
    Fout[1].r = t1_r + t2_r;
    Fout[1].i = t1_i + t2_i;
    Fout[5].r = t1_r - t2_r;
    Fout[5].i = t1_i - t2_i;
    Fout[3].r = t0_r - t3_i;
    Fout[3].i = t0_i + t3_r;
    Fout[7].r = t0_r + t3_i;
    Fout[7].i = t0_i - t3_r;
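}

/*
 * Split step of the real-to-complex FFT: combines the ncfft-point
 * complex FFT Z of the even/odd packed input into the half spectrum X
 * of the 2*ncfft-point real sequence, using the "super twiddles" tw:
 *
 *   F1[k] = Z[k] + conj(Z[ncfft-k])
 *   F2[k] = Z[k] - conj(Z[ncfft-k])
 *   X[k]  = (F1[k] + tw[k] * F2[k]) / 2
 *
 * The signature, local declarations and branch structure below are
 * reconstructed; only the loop bodies survive in this listing.
 */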
static void ne10_fft_split_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fpnk, fpk, f1k, f2k, tw, tdc;

    int16x8x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fpnk_r, q_fpnk_i;
    int16x8_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    int16x8_t q_tw_r, q_tw_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (tdc, 2);

    /* DC and Nyquist bins of a real FFT are purely real. */
    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;
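    /* NEON path, 8 bins per iteration (the count >= 8 guard is a
     * reconstruction): Z[ncfft-k] is loaded via p_src2 and reversed
     * in-register (vrev32q/vrev64q/vcombine) so it runs backwards to
     * pair with Z[k]. vqdmulhq_s16 is the Q15 multiply; in the scaled
     * branch the halving vhaddq/vhsubq intrinsics fold in the 1/2
     * factors that the scalar code applies with NE10_F2I16_FIXDIV and
     * the final >> 1. */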
    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);

                /* reverse the Z[ncfft-k] block and conjugate it */
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                /* F1/2 and F2/2 via halving add/sub */
                q_f1k_r = vhaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vhaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vhsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vhsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                /* Q15 complex multiply F2 * tw */
                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);

                /* restore ascending order for the X[ncfft-k] block */
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
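        /* Unscaled branch: F1 and F2 keep full amplitude (vaddq/vsubq);
         * only the final recombination halves, matching (f1k + tw) >> 1. */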
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fpk = vld2q_s16 (p_src);
                q2_fpnk = vld2q_s16 (p_src2);

                q2_tw = vld2q_s16 (p_twiddles);

                /* reverse the Z[ncfft-k] block and conjugate it */
                q2_fpnk.val[0] = vrev32q_s16 (q2_fpnk.val[0]);
                q2_fpnk.val[1] = vrev32q_s16 (q2_fpnk.val[1]);
                q2_fpnk.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[0])));
                q2_fpnk.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fpnk.val[1])));
                q_fpnk_r = vcombine_s16 (vget_high_s16 (q2_fpnk.val[0]), vget_low_s16 (q2_fpnk.val[0]));
                q_fpnk_i = vcombine_s16 (vget_high_s16 (q2_fpnk.val[1]), vget_low_s16 (q2_fpnk.val[1]));
                q_fpnk_i = vnegq_s16 (q_fpnk_i);

                q_f1k_r = vaddq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f1k_i = vaddq_s16 (q2_fpk.val[1], q_fpnk_i);

                q_f2k_r = vsubq_s16 (q2_fpk.val[0], q_fpnk_r);
                q_f2k_i = vsubq_s16 (q2_fpk.val[1], q_fpnk_i);

                /* Q15 complex multiply F2 * tw */
                q_tmp0 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[0]);
                q_tmp1 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[1]);
                q_tmp2 = vqdmulhq_s16 (q_f2k_r, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_f2k_i, q2_tw.val[0]);
                q_tw_r = vsubq_s16 (q_tmp0, q_tmp1);
                q_tw_i = vaddq_s16 (q_tmp2, q_tmp3);

                q_dst2_r = vhsubq_s16 (q_f1k_r, q_tw_r);
                q_dst2_i = vhsubq_s16 (q_tw_i, q_f1k_i);
                q2_dst.val[0] = vhaddq_s16 (q_f1k_r, q_tw_r);
                q2_dst.val[1] = vhaddq_s16 (q_f1k_i, q_tw_i);

                /* restore ascending order for the X[ncfft-k] block */
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
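    /* Scalar fallback for transforms too short for the NEON loops. */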
    else
    {
        for (k = 1; k <= ncfft / 2 ; ++k)
        {
            fpk    = src[k];
            fpnk.r =   src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;

            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fpk, 2);
                NE10_F2I16_FIXDIV (fpnk, 2);
            }

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).r
                                      - (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            tw.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) f2k.r * (twiddles[k - 1]).i
                                      + (NE10_F2I16_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> NE10_F2I16_SHIFT);

            dst[k].r = (f1k.r + tw.r) >> 1;
            dst[k].i = (f1k.i + tw.i) >> 1;
            dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
            dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
        }
    }
}
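/*
 * Inverse split step (complex-to-real): rebuilds the ncfft-point
 * complex spectrum whose inverse c2c FFT yields the packed real
 * samples. With conj() the complex conjugate:
 *
 *   Fe[k] = X[k] + conj(X[ncfft-k])
 *   Fo[k] = (X[k] - conj(X[ncfft-k])) * conj(tw[k])
 *   Z[k]  = Fe[k] + Fo[k],   Z[ncfft-k] = conj(Fe[k] - Fo[k])
 *
 * Signature, declarations and branch structure reconstructed as above.
 */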
static void ne10_fft_split_c2r_1d_int16_neon (ne10_fft_cpx_int16_t *dst,
        ne10_fft_cpx_int16_t *src,
        ne10_fft_cpx_int16_t *twiddles,
        ne10_int32_t ncfft,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_int16_t fk, fnkc, fek, fok, tmp;

    int16x8x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    int16x8_t q_fnkc_r, q_fnkc_i;
    int16x8_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    int16x8_t q_tmp0, q_tmp1, q_tmp2, q_tmp3;
    int16x8_t q_dst2_r, q_dst2_i;
    int16_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    /* the purely real DC and Nyquist bins fold back into bin 0 */
    dst[0].r = src[0].r + src[ncfft].r;
    dst[0].i = src[0].r - src[ncfft].r;

    if (scaled_flag)
        NE10_F2I16_FIXDIV (dst[0], 2);
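    /* NEON path, mirroring the r2c split: in-register reversal of the
     * X[ncfft-k] block and Q15 products via vqdmulhq_s16. The scaled
     * branch halves Fe and the odd-part input as they are formed
     * (vhaddq/vhsubq); the count >= 8 guard is a reconstruction. */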
    if (count >= 8)
    {
        if (scaled_flag)
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw = vld2q_s16 (p_twiddles);

                /* reverse the X[ncfft-k] block and conjugate it */
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                /* Fe/2 and the halved odd-part input */
                q_fek_r = vhaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vhaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vhsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vhsubq_s16 (q2_fk.val[1], q_fnkc_i);

                /* Fo = tmp * conj(tw), Q15 */
                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);

                /* restore ascending order for the Z[ncfft-k] block */
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
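        /* Unscaled branch: full-width adds and subtracts throughout. */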
        else
        {
            for (k = 1; k <= count ; k += 8)
            {
                p_src = (int16_t*) (& (src[k]));
                p_src2 = (int16_t*) (& (src[ncfft - k - 7]));
                p_twiddles = (int16_t*) (& (twiddles[k - 1]));
                p_dst = (int16_t*) (& (dst[k]));
                p_dst2 = (int16_t*) (& (dst[ncfft - k - 7]));

                q2_fk = vld2q_s16 (p_src);
                q2_fnkc = vld2q_s16 (p_src2);
                q2_tw = vld2q_s16 (p_twiddles);

                /* reverse the X[ncfft-k] block and conjugate it */
                q2_fnkc.val[0] = vrev32q_s16 (q2_fnkc.val[0]);
                q2_fnkc.val[1] = vrev32q_s16 (q2_fnkc.val[1]);
                q2_fnkc.val[0] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[0])));
                q2_fnkc.val[1] = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q2_fnkc.val[1])));
                q_fnkc_r = vcombine_s16 (vget_high_s16 (q2_fnkc.val[0]), vget_low_s16 (q2_fnkc.val[0]));
                q_fnkc_i = vcombine_s16 (vget_high_s16 (q2_fnkc.val[1]), vget_low_s16 (q2_fnkc.val[1]));
                q_fnkc_i = vnegq_s16 (q_fnkc_i);

                q_fek_r = vaddq_s16 (q2_fk.val[0], q_fnkc_r);
                q_fek_i = vaddq_s16 (q2_fk.val[1], q_fnkc_i);
                q_tmp0 = vsubq_s16 (q2_fk.val[0], q_fnkc_r);
                q_tmp1 = vsubq_s16 (q2_fk.val[1], q_fnkc_i);

                /* Fo = tmp * conj(tw), Q15 */
                q_fok_r = vqdmulhq_s16 (q_tmp0, q2_tw.val[0]);
                q_fok_i = vqdmulhq_s16 (q_tmp1, q2_tw.val[0]);
                q_tmp2 = vqdmulhq_s16 (q_tmp1, q2_tw.val[1]);
                q_tmp3 = vqdmulhq_s16 (q_tmp0, q2_tw.val[1]);
                q_fok_r = vaddq_s16 (q_fok_r, q_tmp2);
                q_fok_i = vsubq_s16 (q_fok_i, q_tmp3);

                q_dst2_r = vsubq_s16 (q_fek_r, q_fok_r);
                q_dst2_i = vsubq_s16 (q_fok_i, q_fek_i);
                q2_dst.val[0] = vaddq_s16 (q_fek_r, q_fok_r);
                q2_dst.val[1] = vaddq_s16 (q_fek_i, q_fok_i);

                /* restore ascending order for the Z[ncfft-k] block */
                q_dst2_r = vrev32q_s16 (q_dst2_r);
                q_dst2_i = vrev32q_s16 (q_dst2_i);
                q_dst2_r = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_r)));
                q_dst2_i = vreinterpretq_s16_s32 (vrev64q_s32 (vreinterpretq_s32_s16 (q_dst2_i)));
                q2_dst2.val[0] = vcombine_s16 (vget_high_s16 (q_dst2_r), vget_low_s16 (q_dst2_r));
                q2_dst2.val[1] = vcombine_s16 (vget_high_s16 (q_dst2_i), vget_low_s16 (q_dst2_i));
                vst2q_s16 (p_dst, q2_dst);
                vst2q_s16 (p_dst2, q2_dst2);
            }
        }
    }
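    /* Scalar fallback: one bin per iteration, fok is the odd part
     * rotated by the conjugated twiddle. */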
    else
    {
        for (k = 1; k <= ncfft / 2; k++)
        {
            fk     = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;

            if (scaled_flag)
            {
                NE10_F2I16_FIXDIV (fk, 2);
                NE10_F2I16_FIXDIV (fnkc, 2);
            }

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).r
                                       + (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);
            fok.i = (ne10_int16_t) ( ( (NE10_F2I16_SAMPPROD) tmp.i * (twiddles[k - 1]).r
                                       - (NE10_F2I16_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> NE10_F2I16_SHIFT);

            dst[k].r = fek.r + fok.r;
            dst[k].i = fek.i + fok.i;

            dst[ncfft - k].r = fek.r - fok.r;
            dst[ncfft - k].i = fok.i - fek.i;
        }
    }
}
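/*
 * Public c2c entry point: dispatches on transform length and flags to
 * the fixed-size kernels above or to the generic mixed-radix routines.
 * Only the leaf calls survive in this listing; the signature and the
 * switch scaffolding are reconstructed (in compact form) around them.
 */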
void ne10_fft_c2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_cfg_int16_t cfg,
                                 ne10_int32_t inverse_fft,
                                 ne10_int32_t scaled_flag)
{
    if (scaled_flag && inverse_fft)
        switch (cfg->nfft)
        {
        case 4:  ne10_fft4_backward_int16_scaled (fout, fin); break;
        case 8:  ne10_fft8_backward_int16_scaled (fout, fin); break;
        default: ne10_mixed_radix_fft_backward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer); break;
        }
    else if (scaled_flag)
        switch (cfg->nfft)
        {
        case 4:  ne10_fft4_forward_int16_scaled (fout, fin); break;
        case 8:  ne10_fft8_forward_int16_scaled (fout, fin); break;
        default: ne10_mixed_radix_fft_forward_int16_scaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer); break;
        }
    else if (inverse_fft)
        switch (cfg->nfft)
        {
        case 4:  ne10_fft4_backward_int16_unscaled (fout, fin); break;
        case 8:  ne10_fft8_backward_int16_unscaled (fout, fin); break;
        default: ne10_mixed_radix_fft_backward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer); break;
        }
    else
        switch (cfg->nfft)
        {
        case 4:  ne10_fft4_forward_int16_unscaled (fout, fin); break;
        case 8:  ne10_fft8_forward_int16_unscaled (fout, fin); break;
        default: ne10_mixed_radix_fft_forward_int16_unscaled_neon (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer); break;
        }
}
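/*
 * Real FFT wrappers: an N-point real FFT is an N/2-point c2c FFT on the
 * even/odd packed samples followed by the r2c split step; the inverse
 * runs the c2r split first, then an inverse c2c FFT. cfg->buffer
 * supplies both scratch areas. Signatures and scratch setup are
 * reconstructed around the surviving c2c_state assignments, assuming
 * the NE10 types ne10_fft_r2c_cfg_int16_t and ne10_fft_state_int16_t.
 */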
void ne10_fft_r2c_1d_int16_neon (ne10_fft_cpx_int16_t *fout,
                                 ne10_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t *tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t *tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_c2c_1d_int16_neon (tmpbuf1, (ne10_fft_cpx_int16_t*) fin, &c2c_state, 0, scaled_flag);
    ne10_fft_split_r2c_1d_int16_neon (fout, tmpbuf1, cfg->super_twiddles, cfg->ncfft, scaled_flag);
}

void ne10_fft_c2r_1d_int16_neon (ne10_int16_t *fout,
                                 ne10_fft_cpx_int16_t *fin,
                                 ne10_fft_r2c_cfg_int16_t cfg,
                                 ne10_int32_t scaled_flag)
{
    ne10_fft_cpx_int16_t *tmpbuf1 = cfg->buffer;
    ne10_fft_cpx_int16_t *tmpbuf2 = cfg->buffer + cfg->ncfft;
    ne10_fft_state_int16_t c2c_state;

    c2c_state.nfft = cfg->ncfft;
    c2c_state.factors = cfg->factors;
    c2c_state.twiddles = cfg->twiddles;
    c2c_state.buffer = tmpbuf2;

    ne10_fft_split_c2r_1d_int16_neon (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
    ne10_fft_c2c_1d_int16_neon ( (ne10_fft_cpx_int16_t*) fout, tmpbuf1, &c2c_state, 1, scaled_flag);
}
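/*
 * Minimal usage sketch (not part of the library): compiled only when
 * NE10_FFT_INT16_EXAMPLE is defined (a hypothetical guard). It assumes
 * the allocation helper ne10_fft_alloc_c2c_int16() and the NE10_FREE()
 * macro from the NE10 headers.
 */
#ifdef NE10_FFT_INT16_EXAMPLE
#include <stdio.h>

static void fft_int16_example (void)
{
    ne10_fft_cpx_int16_t in[8] = { { 8192, 0 } };  /* impulse in Q15 */
    ne10_fft_cpx_int16_t out[8];
    ne10_fft_cfg_int16_t cfg = ne10_fft_alloc_c2c_int16 (8);
    ne10_int32_t i;

    if (cfg != NULL)
    {
        /* Forward, scaled: each stage divides by its radix, so the
           result is the DFT scaled by 1/8, and every bin becomes 1024. */
        ne10_fft_c2c_1d_int16_neon (out, in, cfg, 0, 1);
        for (i = 0; i < 8; i++)
            printf ("bin %d: (%d, %d)\n", (int) i, out[i].r, out[i].i);
        NE10_FREE (cfg);
    }
}
#endif /* NE10_FFT_INT16_EXAMPLE */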