/*
 * Project Ne10
 * An Open Optimized Software Library Project for the ARM Architecture
 * NE10_fft_int32.c
 */
1 /*
2  * Copyright 2013-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_fft_int32.c
45  */
46 
47 #include "NE10_types.h"
48 #include "NE10_macros.h"
49 #include "NE10_fft.h"
50 
51 
52 static void ne10_mixed_radix_butterfly_int32_c (ne10_fft_cpx_int32_t * Fout,
54  ne10_int32_t * factors,
55  ne10_fft_cpx_int32_t * twiddles,
56  ne10_fft_cpx_int32_t * buffer,
57  ne10_int32_t scaled_flag)
58 {
59  ne10_int32_t fstride, mstride, N;
60  ne10_int32_t fstride1;
61  ne10_int32_t f_count, m_count;
62  ne10_int32_t stage_count;
63 
64  ne10_fft_cpx_int32_t scratch_in[8];
65  ne10_fft_cpx_int32_t scratch_out[8];
66  ne10_fft_cpx_int32_t scratch[16];
67  ne10_fft_cpx_int32_t scratch_tw[6];
68 
69  ne10_fft_cpx_int32_t *Fin1, *Fin2, *Fout1, *Fout2;
70  ne10_fft_cpx_int32_t *Fout_ls = Fout;
72  ne10_fft_cpx_int32_t *tw, *tw1, *tw2;
73  const ne10_int32_t TW_81 = 1518500249;
74  const ne10_int32_t TW_81N = -1518500249;
75 
76 
77 
78  // init fstride, mstride, N, tw
79  stage_count = factors[0];
80  fstride = factors[1];
81  mstride = factors[ (stage_count << 1) - 1 ];
82  N = factors[ stage_count << 1 ]; // radix
83  tw = twiddles;
84 
85  // the first stage
86  Fin1 = Fin;
87  Fout1 = Fout;
88  if (N == 2) // length of FFT is 2^n (n is odd)
89  {
90  // radix 8
91  N = fstride >> 1; // 1/4 of length of FFT
92  fstride1 = fstride >> 2;
93 
94  Fin1 = Fin;
95  for (f_count = 0; f_count < fstride1; f_count ++)
96  {
97  Fout1 = & Fout[ f_count * 8 ];
98  // load
99  if (scaled_flag == 1)
100  {
101  NE10_F2I32_FIXDIV (Fin1[0], 8);
102  NE10_F2I32_FIXDIV (Fin1[0 + fstride], 8);
103  NE10_F2I32_FIXDIV (Fin1[fstride1], 8);
104  NE10_F2I32_FIXDIV (Fin1[fstride1 + fstride], 8);
105  NE10_F2I32_FIXDIV (Fin1[fstride1 * 2], 8);
106  NE10_F2I32_FIXDIV (Fin1[fstride1 * 2 + fstride], 8);
107  NE10_F2I32_FIXDIV (Fin1[fstride1 * 3], 8);
108  NE10_F2I32_FIXDIV (Fin1[fstride1 * 3 + fstride], 8);
109  }
110  scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r;
111  scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
112  scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
113  scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
114  scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
115  scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
116  scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
117  scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
118  scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
119  scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
120  scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
121  scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
122  scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
123  scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
124  scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
125  scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;
126 
127  // radix 4 butterfly without twiddles
128  scratch[0] = scratch_in[0];
129  scratch[1] = scratch_in[1];
130 
131  scratch[2] = scratch_in[2];
132  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r + scratch_in[3].i) * TW_81) >> 31);
133  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i - scratch_in[3].r) * TW_81) >> 31);
134 
135  scratch[4] = scratch_in[4];
136  scratch[5].r = scratch_in[5].i;
137  scratch[5].i = -scratch_in[5].r;
138 
139  scratch[6].r = scratch_in[6].r;
140  scratch[6].i = scratch_in[6].i;
141  scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r - scratch_in[7].i) * TW_81N) >> 31);
142  scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i + scratch_in[7].r) * TW_81N) >> 31);
143 
144  // radix 2 butterfly
145  scratch[8].r = scratch[0].r + scratch[4].r;
146  scratch[8].i = scratch[0].i + scratch[4].i;
147  scratch[9].r = scratch[1].r + scratch[5].r;
148  scratch[9].i = scratch[1].i + scratch[5].i;
149 
150  scratch[10].r = scratch[0].r - scratch[4].r;
151  scratch[10].i = scratch[0].i - scratch[4].i;
152  scratch[11].r = scratch[1].r - scratch[5].r;
153  scratch[11].i = scratch[1].i - scratch[5].i;
154 
155  // radix 2 butterfly
156  scratch[12].r = scratch[2].r + scratch[6].r;
157  scratch[12].i = scratch[2].i + scratch[6].i;
158  scratch[13].r = scratch[3].r + scratch[7].r;
159  scratch[13].i = scratch[3].i + scratch[7].i;
160 
161  scratch[14].r = scratch[2].r - scratch[6].r;
162  scratch[14].i = scratch[2].i - scratch[6].i;
163  scratch[15].r = scratch[3].r - scratch[7].r;
164  scratch[15].i = scratch[3].i - scratch[7].i;
165 
166  // third result
167  scratch_out[4].r = scratch[8].r - scratch[12].r;
168  scratch_out[4].i = scratch[8].i - scratch[12].i;
169  scratch_out[5].r = scratch[9].r - scratch[13].r;
170  scratch_out[5].i = scratch[9].i - scratch[13].i;
171 
172  // first result
173  scratch_out[0].r = scratch[8].r + scratch[12].r;
174  scratch_out[0].i = scratch[8].i + scratch[12].i;
175  scratch_out[1].r = scratch[9].r + scratch[13].r;
176  scratch_out[1].i = scratch[9].i + scratch[13].i;
177 
178  // second result
179  scratch_out[2].r = scratch[10].r + scratch[14].i;
180  scratch_out[2].i = scratch[10].i - scratch[14].r;
181  scratch_out[3].r = scratch[11].r + scratch[15].i;
182  scratch_out[3].i = scratch[11].i - scratch[15].r;
183 
184  // forth result
185  scratch_out[6].r = scratch[10].r - scratch[14].i;
186  scratch_out[6].i = scratch[10].i + scratch[14].r;
187  scratch_out[7].r = scratch[11].r - scratch[15].i;
188  scratch_out[7].i = scratch[11].i + scratch[15].r;
189 
190  // store
191  Fout1[0] = scratch_out[0];
192  Fout1[1] = scratch_out[1];
193  Fout1[2] = scratch_out[2];
194  Fout1[3] = scratch_out[3];
195  Fout1[4] = scratch_out[4];
196  Fout1[5] = scratch_out[5];
197  Fout1[6] = scratch_out[6];
198  Fout1[7] = scratch_out[7];
199 
200  Fin1 += 1;
201  } // f_count
202  tw += 6;
203  mstride <<= 2;
204  fstride >>= 4;
205  stage_count -= 2;
206 
207  // swap
208  Ftmp = buffer;
209  buffer = Fout;
210  Fout = Ftmp;
211  }
212  else if (N == 4) // length of FFT is 2^n (n is even)
213  {
214  //fstride is nfft>>2
215  for (f_count = fstride; f_count ; f_count --)
216  {
217  // load
218  scratch_in[0] = *Fin1;
219  Fin2 = Fin1 + fstride;
220  scratch_in[1] = *Fin2;
221  Fin2 = Fin2 + fstride;
222  scratch_in[2] = *Fin2;
223  Fin2 = Fin2 + fstride;
224  scratch_in[3] = *Fin2;
225 
226  // radix 4 butterfly without twiddles
227  if (scaled_flag == 1)
228  {
229  NE10_F2I32_FIXDIV (scratch_in[0], 4);
230  NE10_F2I32_FIXDIV (scratch_in[1], 4);
231  NE10_F2I32_FIXDIV (scratch_in[2], 4);
232  NE10_F2I32_FIXDIV (scratch_in[3], 4);
233  }
234 
235  // radix 2 butterfly
236  scratch[0].r = scratch_in[0].r + scratch_in[2].r;
237  scratch[0].i = scratch_in[0].i + scratch_in[2].i;
238 
239  scratch[1].r = scratch_in[0].r - scratch_in[2].r;
240  scratch[1].i = scratch_in[0].i - scratch_in[2].i;
241 
242  // radix 2 butterfly
243  scratch[2].r = scratch_in[1].r + scratch_in[3].r;
244  scratch[2].i = scratch_in[1].i + scratch_in[3].i;
245 
246  scratch[3].r = scratch_in[1].r - scratch_in[3].r;
247  scratch[3].i = scratch_in[1].i - scratch_in[3].i;
248 
249  // third result
250  scratch_out[2].r = scratch[0].r - scratch[2].r;
251  scratch_out[2].i = scratch[0].i - scratch[2].i;
252 
253  // first result
254  scratch_out[0].r = scratch[0].r + scratch[2].r;
255  scratch_out[0].i = scratch[0].i + scratch[2].i;
256 
257  // second result
258  scratch_out[1].r = scratch[1].r + scratch[3].i;
259  scratch_out[1].i = scratch[1].i - scratch[3].r;
260 
261  // forth result
262  scratch_out[3].r = scratch[1].r - scratch[3].i;
263  scratch_out[3].i = scratch[1].i + scratch[3].r;
264 
265  // store
266  * Fout1 ++ = scratch_out[0];
267  * Fout1 ++ = scratch_out[1];
268  * Fout1 ++ = scratch_out[2];
269  * Fout1 ++ = scratch_out[3];
270 
271  Fin1++;
272  } // f_count
273 
274  N = fstride; // 1/4 of length of FFT
275 
276  // swap
277  Ftmp = buffer;
278  buffer = Fout;
279  Fout = Ftmp;
280 
281  // update address for other stages
282  stage_count--;
283  fstride >>= 2;
284  // end of first stage
285  }
286 
287 
288  // others but the last one
289  for (; stage_count > 1 ; stage_count--)
290  {
291  Fin1 = buffer;
292  for (f_count = 0; f_count < fstride; f_count ++)
293  {
294  Fout1 = & Fout[ f_count * mstride << 2 ];
295  tw1 = tw;
296  for (m_count = mstride; m_count ; m_count --)
297  {
298  // load
299  scratch_tw[0] = *tw1;
300  tw2 = tw1 + mstride;
301  scratch_tw[1] = *tw2;
302  tw2 += mstride;
303  scratch_tw[2] = *tw2;
304  scratch_in[0] = * Fin1;
305  Fin2 = Fin1 + N;
306  scratch_in[1] = * Fin2;
307  Fin2 += N;
308  scratch_in[2] = * Fin2;
309  Fin2 += N;
310  scratch_in[3] = * Fin2;
311  if (scaled_flag == 1)
312  {
313  NE10_F2I32_FIXDIV (scratch_in[0], 4);
314  NE10_F2I32_FIXDIV (scratch_in[1], 4);
315  NE10_F2I32_FIXDIV (scratch_in[2], 4);
316  NE10_F2I32_FIXDIV (scratch_in[3], 4);
317  }
318 
319  // radix 4 butterfly with twiddles
320 
321  scratch[0] = scratch_in[0];
322  scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
323  - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
324  scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
325  + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
326 
327  scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
328  - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
329  scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
330  + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
331 
332  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
333  - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
334  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
335  + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
336 
337  // radix 2 butterfly
338  scratch[4].r = scratch[0].r + scratch[2].r;
339  scratch[4].i = scratch[0].i + scratch[2].i;
340 
341  scratch[5].r = scratch[0].r - scratch[2].r;
342  scratch[5].i = scratch[0].i - scratch[2].i;
343 
344  // radix 2 butterfly
345  scratch[6].r = scratch[1].r + scratch[3].r;
346  scratch[6].i = scratch[1].i + scratch[3].i;
347 
348  scratch[7].r = scratch[1].r - scratch[3].r;
349  scratch[7].i = scratch[1].i - scratch[3].i;
350 
351  // third result
352  scratch_out[2].r = scratch[4].r - scratch[6].r;
353  scratch_out[2].i = scratch[4].i - scratch[6].i;
354 
355  // first result
356  scratch_out[0].r = scratch[4].r + scratch[6].r;
357  scratch_out[0].i = scratch[4].i + scratch[6].i;
358 
359  // second result
360  scratch_out[1].r = scratch[5].r + scratch[7].i;
361  scratch_out[1].i = scratch[5].i - scratch[7].r;
362 
363  // forth result
364  scratch_out[3].r = scratch[5].r - scratch[7].i;
365  scratch_out[3].i = scratch[5].i + scratch[7].r;
366 
367  // store
368  *Fout1 = scratch_out[0];
369  Fout2 = Fout1 + mstride;
370  *Fout2 = scratch_out[1];
371  Fout2 += mstride;
372  *Fout2 = scratch_out[2];
373  Fout2 += mstride;
374  *Fout2 = scratch_out[3];
375 
376  tw1++;
377  Fin1 ++;
378  Fout1 ++;
379  } // m_count
380  } // f_count
381  tw += mstride * 3;
382  mstride <<= 2;
383  // swap
384  Ftmp = buffer;
385  buffer = Fout;
386  Fout = Ftmp;
387  fstride >>= 2;
388  } // stage_count
389 
390  // the last one
391  if (stage_count)
392  {
393  Fin1 = buffer;
394  // if stage count is even, output to the input array
395  Fout1 = Fout_ls;
396 
397  for (f_count = 0; f_count < fstride; f_count ++)
398  {
399  tw1 = tw;
400  for (m_count = mstride; m_count ; m_count --)
401  {
402  // load
403  scratch_tw[0] = *tw1;
404  tw2 = tw1 + mstride;
405  scratch_tw[1] = *tw2;
406  tw2 += mstride;
407  scratch_tw[2] = *tw2;
408  scratch_in[0] = * Fin1;
409  Fin2 = Fin1 + N;
410  scratch_in[1] = * Fin2;
411  Fin2 += N;
412  scratch_in[2] = * Fin2;
413  Fin2 += N;
414  scratch_in[3] = * Fin2;
415  if (scaled_flag == 1)
416  {
417  NE10_F2I32_FIXDIV (scratch_in[0], 4);
418  NE10_F2I32_FIXDIV (scratch_in[1], 4);
419  NE10_F2I32_FIXDIV (scratch_in[2], 4);
420  NE10_F2I32_FIXDIV (scratch_in[3], 4);
421  }
422 
423  // radix 4 butterfly with twiddles
424 
425  scratch[0] = scratch_in[0];
426  scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
427  - (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
428  scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
429  + (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);
430 
431  scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
432  - (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
433  scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
434  + (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);
435 
436  scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
437  - (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
438  scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
439  + (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);
440 
441  // radix 2 butterfly
442  scratch[4].r = scratch[0].r + scratch[2].r;
443  scratch[4].i = scratch[0].i + scratch[2].i;
444 
445  scratch[5].r = scratch[0].r - scratch[2].r;
446  scratch[5].i = scratch[0].i - scratch[2].i;
447 
448  // radix 2 butterfly
449  scratch[6].r = scratch[1].r + scratch[3].r;
450  scratch[6].i = scratch[1].i + scratch[3].i;
451 
452  scratch[7].r = scratch[1].r - scratch[3].r;
453  scratch[7].i = scratch[1].i - scratch[3].i;
454 
455  // third result
456  scratch_out[2].r = scratch[4].r - scratch[6].r;
457  scratch_out[2].i = scratch[4].i - scratch[6].i;
458 
459  // first result
460  scratch_out[0].r = scratch[4].r + scratch[6].r;
461  scratch_out[0].i = scratch[4].i + scratch[6].i;
462 
463  // second result
464  scratch_out[1].r = scratch[5].r + scratch[7].i;
465  scratch_out[1].i = scratch[5].i - scratch[7].r;
466 
467  // forth result
468  scratch_out[3].r = scratch[5].r - scratch[7].i;
469  scratch_out[3].i = scratch[5].i + scratch[7].r;
470 
471  // store
472  *Fout1 = scratch_out[0];
473  Fout2 = Fout1 + N;
474  *Fout2 = scratch_out[1];
475  Fout2 += N;
476  *Fout2 = scratch_out[2];
477  Fout2 += N;
478  *Fout2 = scratch_out[3];
479 
480  tw1 ++;
481  Fin1 ++;
482  Fout1 ++;
483  } // m_count
484  } // f_count
485  } // last stage
486 }
487 
/*
 * Mixed-radix inverse FFT butterfly core for 32-bit (Q31) fixed-point
 * complex data.  Mirror image of the forward routine: every twiddle
 * multiply uses the conjugated twiddle (r*r + i*i / i*r - r*i) and the
 * fixed +/-j and pi/4 rotations are conjugated as well.
 *
 * Fout        : destination buffer (nfft complex samples)
 * Fin         : source buffer
 * factors     : stage plan — factors[0] = stage count, factors[1] = fstride
 *               of the first stage, followed by (mstride, radix) pairs;
 *               the entry at factors[stage_count << 1] is the first radix
 * twiddles    : precomputed Q31 twiddle factors, consumed stage by stage
 * buffer      : scratch buffer of the same size as Fout; the code
 *               ping-pongs between Fout and buffer after every stage
 * scaled_flag : when 1, each stage pre-divides its inputs by the stage
 *               radix (NE10_F2I32_FIXDIV) to prevent fixed-point overflow
 *
 * The first stage is radix-8 when log2(nfft) is odd (first radix == 2) or
 * radix-4 when it is even; every remaining stage is radix-4.  All complex
 * multiplies are Q31 x Q31 -> Q31 via a 64-bit intermediate shifted
 * right by 31.
 */
static void ne10_mixed_radix_butterfly_inverse_int32_c (ne10_fft_cpx_int32_t * Fout,
        ne10_fft_cpx_int32_t * Fin,
        ne10_int32_t * factors,
        ne10_fft_cpx_int32_t * twiddles,
        ne10_fft_cpx_int32_t * buffer,
        ne10_int32_t scaled_flag)
{
    ne10_int32_t fstride, mstride, N;
    ne10_int32_t fstride1;
    ne10_int32_t f_count, m_count;
    ne10_int32_t stage_count;

    ne10_fft_cpx_int32_t scratch_in[8];
    ne10_fft_cpx_int32_t scratch_out[8];
    ne10_fft_cpx_int32_t scratch[16];
    ne10_fft_cpx_int32_t scratch_tw[6];

    ne10_fft_cpx_int32_t *Fin1, *Fin2, *Fout1, *Fout2;
    ne10_fft_cpx_int32_t *Fout_ls = Fout;
    ne10_fft_cpx_int32_t *Ftmp;   // used only for ping-pong buffer swaps
    ne10_fft_cpx_int32_t *tw, *tw1, *tw2;
    const ne10_int32_t TW_81 = 1518500249;    // Q31 of cos(pi/4) = sqrt(2)/2
    const ne10_int32_t TW_81N = -1518500249;  // Q31 of -cos(pi/4)

    // init fstride, mstride, N
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    N = factors[ stage_count << 1 ]; // radix of the first stage
    tw = twiddles;

    // the first stage
    Fin1 = Fin;
    Fout1 = Fout;
    if (N == 2) // length of FFT is 2^n (n is odd) -- radix-8 first stage
    {
        // radix 8
        N = fstride >> 1; // 1/4 of length of FFT
        fstride1 = fstride >> 2;

        Fin1 = Fin;
        for (f_count = 0; f_count < fstride1; f_count ++)
        {
            Fout1 = & Fout[ f_count * 8 ];
            // load (scale by 1/8 first, if requested, to avoid overflow)
            if (scaled_flag == 1)
            {
                NE10_F2I32_FIXDIV (Fin1[0], 8);
                NE10_F2I32_FIXDIV (Fin1[0 + fstride], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1 + fstride], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1 * 2], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1 * 2 + fstride], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1 * 3], 8);
                NE10_F2I32_FIXDIV (Fin1[fstride1 * 3 + fstride], 8);
            }

            // radix-2 stage of the radix-8 butterfly: sums/differences of
            // input pairs separated by fstride
            scratch_in[0].r = Fin1[0].r + Fin1[0 + fstride].r;
            scratch_in[0].i = Fin1[0].i + Fin1[0 + fstride].i;
            scratch_in[1].r = Fin1[0].r - Fin1[0 + fstride].r;
            scratch_in[1].i = Fin1[0].i - Fin1[0 + fstride].i;
            scratch_in[2].r = Fin1[fstride1].r + Fin1[fstride1 + fstride].r;
            scratch_in[2].i = Fin1[fstride1].i + Fin1[fstride1 + fstride].i;
            scratch_in[3].r = Fin1[fstride1].r - Fin1[fstride1 + fstride].r;
            scratch_in[3].i = Fin1[fstride1].i - Fin1[fstride1 + fstride].i;
            scratch_in[4].r = Fin1[fstride1 * 2].r + Fin1[fstride1 * 2 + fstride].r;
            scratch_in[4].i = Fin1[fstride1 * 2].i + Fin1[fstride1 * 2 + fstride].i;
            scratch_in[5].r = Fin1[fstride1 * 2].r - Fin1[fstride1 * 2 + fstride].r;
            scratch_in[5].i = Fin1[fstride1 * 2].i - Fin1[fstride1 * 2 + fstride].i;
            scratch_in[6].r = Fin1[fstride1 * 3].r + Fin1[fstride1 * 3 + fstride].r;
            scratch_in[6].i = Fin1[fstride1 * 3].i + Fin1[fstride1 * 3 + fstride].i;
            scratch_in[7].r = Fin1[fstride1 * 3].r - Fin1[fstride1 * 3 + fstride].r;
            scratch_in[7].i = Fin1[fstride1 * 3].i - Fin1[fstride1 * 3 + fstride].i;

            // radix 4 butterfly; fixed rotations are conjugated relative to
            // the forward transform: e^{+j*pi/4}, +j, e^{+j*3pi/4}

            scratch[0] = scratch_in[0];
            scratch[1] = scratch_in[1];

            scratch[2] = scratch_in[2];
            scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].r - scratch_in[3].i) * TW_81) >> 31);
            scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[3].i + scratch_in[3].r) * TW_81) >> 31);

            scratch[4] = scratch_in[4];
            scratch[5].r = -scratch_in[5].i;   // multiply by +j
            scratch[5].i = scratch_in[5].r;

            scratch[6].r = scratch_in[6].r;
            scratch[6].i = scratch_in[6].i;
            scratch[7].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].r + scratch_in[7].i) * TW_81N) >> 31);
            scratch[7].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) (scratch_in[7].i - scratch_in[7].r) * TW_81N) >> 31);

            // radix 2 butterfly
            scratch[8].r = scratch[0].r + scratch[4].r;
            scratch[8].i = scratch[0].i + scratch[4].i;
            scratch[9].r = scratch[1].r + scratch[5].r;
            scratch[9].i = scratch[1].i + scratch[5].i;

            scratch[10].r = scratch[0].r - scratch[4].r;
            scratch[10].i = scratch[0].i - scratch[4].i;
            scratch[11].r = scratch[1].r - scratch[5].r;
            scratch[11].i = scratch[1].i - scratch[5].i;

            // radix 2 butterfly
            scratch[12].r = scratch[2].r + scratch[6].r;
            scratch[12].i = scratch[2].i + scratch[6].i;
            scratch[13].r = scratch[3].r + scratch[7].r;
            scratch[13].i = scratch[3].i + scratch[7].i;

            scratch[14].r = scratch[2].r - scratch[6].r;
            scratch[14].i = scratch[2].i - scratch[6].i;
            scratch[15].r = scratch[3].r - scratch[7].r;
            scratch[15].i = scratch[3].i - scratch[7].i;

            // third result
            scratch_out[4].r = scratch[8].r - scratch[12].r;
            scratch_out[4].i = scratch[8].i - scratch[12].i;
            scratch_out[5].r = scratch[9].r - scratch[13].r;
            scratch_out[5].i = scratch[9].i - scratch[13].i;

            // first result
            scratch_out[0].r = scratch[8].r + scratch[12].r;
            scratch_out[0].i = scratch[8].i + scratch[12].i;
            scratch_out[1].r = scratch[9].r + scratch[13].r;
            scratch_out[1].i = scratch[9].i + scratch[13].i;

            // second result (combine with +j rotation; sign mirrors forward)
            scratch_out[2].r = scratch[10].r - scratch[14].i;
            scratch_out[2].i = scratch[10].i + scratch[14].r;
            scratch_out[3].r = scratch[11].r - scratch[15].i;
            scratch_out[3].i = scratch[11].i + scratch[15].r;

            // forth result (combine with -j rotation)
            scratch_out[6].r = scratch[10].r + scratch[14].i;
            scratch_out[6].i = scratch[10].i - scratch[14].r;
            scratch_out[7].r = scratch[11].r + scratch[15].i;
            scratch_out[7].i = scratch[11].i - scratch[15].r;

            // store
            Fout1[0] = scratch_out[0];
            Fout1[1] = scratch_out[1];
            Fout1[2] = scratch_out[2];
            Fout1[3] = scratch_out[3];
            Fout1[4] = scratch_out[4];
            Fout1[5] = scratch_out[5];
            Fout1[6] = scratch_out[6];
            Fout1[7] = scratch_out[7];

            Fin1 += 1;
        } // f_count
        // a radix-8 stage consumes 6 twiddles and counts as two stages
        tw += 6;
        mstride <<= 2;
        fstride >>= 4;
        stage_count -= 2;

        // swap working buffers (ping-pong)
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;
    }
    else if (N == 4) // length of FFT is 2^n (n is even) -- radix-4 first stage
    {
        //fstride is nfft>>2
        for (f_count = fstride; f_count ; f_count --)
        {
            // load four inputs separated by fstride
            scratch_in[0] = *Fin1;
            Fin2 = Fin1 + fstride;
            scratch_in[1] = *Fin2;
            Fin2 = Fin2 + fstride;
            scratch_in[2] = *Fin2;
            Fin2 = Fin2 + fstride;
            scratch_in[3] = *Fin2;

            // radix 4 butterfly without twiddles
            if (scaled_flag == 1)
            {
                NE10_F2I32_FIXDIV (scratch_in[0], 4);
                NE10_F2I32_FIXDIV (scratch_in[1], 4);
                NE10_F2I32_FIXDIV (scratch_in[2], 4);
                NE10_F2I32_FIXDIV (scratch_in[3], 4);
            }

            // radix 2 butterfly
            scratch[0].r = scratch_in[0].r + scratch_in[2].r;
            scratch[0].i = scratch_in[0].i + scratch_in[2].i;

            scratch[1].r = scratch_in[0].r - scratch_in[2].r;
            scratch[1].i = scratch_in[0].i - scratch_in[2].i;

            // radix 2 butterfly
            scratch[2].r = scratch_in[1].r + scratch_in[3].r;
            scratch[2].i = scratch_in[1].i + scratch_in[3].i;

            scratch[3].r = scratch_in[1].r - scratch_in[3].r;
            scratch[3].i = scratch_in[1].i - scratch_in[3].i;

            // third result
            scratch_out[2].r = scratch[0].r - scratch[2].r;
            scratch_out[2].i = scratch[0].i - scratch[2].i;

            // first result
            scratch_out[0].r = scratch[0].r + scratch[2].r;
            scratch_out[0].i = scratch[0].i + scratch[2].i;

            // second result (+j rotation folded in; conjugate of forward)
            scratch_out[1].r = scratch[1].r - scratch[3].i;
            scratch_out[1].i = scratch[1].i + scratch[3].r;

            // forth result (-j rotation folded in)
            scratch_out[3].r = scratch[1].r + scratch[3].i;
            scratch_out[3].i = scratch[1].i - scratch[3].r;

            // store
            * Fout1 ++ = scratch_out[0];
            * Fout1 ++ = scratch_out[1];
            * Fout1 ++ = scratch_out[2];
            * Fout1 ++ = scratch_out[3];

            Fin1++;
        } // f_count

        N = fstride; // 1/4 of length of FFT
        // update address for other stages
        stage_count--;
        fstride >>= 2;

        // swap working buffers (ping-pong)
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;

        // end of first stage
    }


    // intermediate stages: radix-4 butterflies with conjugated twiddles;
    // all but the last one
    for (; stage_count > 1 ; stage_count--)
    {
        Fin1 = buffer;
        for (f_count = 0; f_count < fstride; f_count ++)
        {
            Fout1 = & Fout[ f_count * mstride << 2 ];
            tw1 = tw;
            for (m_count = mstride; m_count ; m_count --)
            {
                // load three twiddles (w, w^2, w^3) and four inputs
                scratch_tw[0] = *tw1;
                tw2 = tw1 + mstride;
                scratch_tw[1] = *tw2;
                tw2 += mstride;
                scratch_tw[2] = *tw2;
                scratch_in[0] = * Fin1;
                Fin2 = Fin1 + N;
                scratch_in[1] = * Fin2;
                Fin2 += N;
                scratch_in[2] = * Fin2;
                Fin2 += N;
                scratch_in[3] = * Fin2;
                if (scaled_flag == 1)
                {
                    NE10_F2I32_FIXDIV (scratch_in[0], 4);
                    NE10_F2I32_FIXDIV (scratch_in[1], 4);
                    NE10_F2I32_FIXDIV (scratch_in[2], 4);
                    NE10_F2I32_FIXDIV (scratch_in[3], 4);
                }

                // radix 4 butterfly with conjugated twiddles (Q31 multiply)

                scratch[0] = scratch_in[0];
                scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
                scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);

                scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
                scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);

                scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
                scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);

                // radix 2 butterfly
                scratch[4].r = scratch[0].r + scratch[2].r;
                scratch[4].i = scratch[0].i + scratch[2].i;

                scratch[5].r = scratch[0].r - scratch[2].r;
                scratch[5].i = scratch[0].i - scratch[2].i;

                // radix 2 butterfly
                scratch[6].r = scratch[1].r + scratch[3].r;
                scratch[6].i = scratch[1].i + scratch[3].i;

                scratch[7].r = scratch[1].r - scratch[3].r;
                scratch[7].i = scratch[1].i - scratch[3].i;

                // third result
                scratch_out[2].r = scratch[4].r - scratch[6].r;
                scratch_out[2].i = scratch[4].i - scratch[6].i;

                // first result
                scratch_out[0].r = scratch[4].r + scratch[6].r;
                scratch_out[0].i = scratch[4].i + scratch[6].i;

                // second result
                scratch_out[1].r = scratch[5].r - scratch[7].i;
                scratch_out[1].i = scratch[5].i + scratch[7].r;

                // forth result
                scratch_out[3].r = scratch[5].r + scratch[7].i;
                scratch_out[3].i = scratch[5].i - scratch[7].r;

                // store, outputs separated by mstride
                *Fout1 = scratch_out[0];
                Fout2 = Fout1 + mstride;
                *Fout2 = scratch_out[1];
                Fout2 += mstride;
                *Fout2 = scratch_out[2];
                Fout2 += mstride;
                *Fout2 = scratch_out[3];

                tw1++;
                Fin1 ++;
                Fout1 ++;
            } // m_count
        } // f_count
        tw += mstride * 3;
        mstride <<= 2;
        // swap working buffers (ping-pong)
        Ftmp = buffer;
        buffer = Fout;
        Fout = Ftmp;
        fstride >>= 2;
    } // stage_count

    // the last stage, writing to the caller-visible output Fout_ls
    if (stage_count)
    {
        Fin1 = buffer;
        // if stage count is even, output to the input array
        Fout1 = Fout_ls;

        for (f_count = 0; f_count < fstride; f_count ++)
        {
            tw1 = tw;
            for (m_count = mstride; m_count ; m_count --)
            {
                // load
                scratch_tw[0] = *tw1;
                tw2 = tw1 + mstride;
                scratch_tw[1] = *tw2;
                tw2 += mstride;
                scratch_tw[2] = *tw2;
                scratch_in[0] = * Fin1;
                Fin2 = Fin1 + N;
                scratch_in[1] = * Fin2;
                Fin2 += N;
                scratch_in[2] = * Fin2;
                Fin2 += N;
                scratch_in[3] = * Fin2;
                if (scaled_flag == 1)
                {
                    NE10_F2I32_FIXDIV (scratch_in[0], 4);
                    NE10_F2I32_FIXDIV (scratch_in[1], 4);
                    NE10_F2I32_FIXDIV (scratch_in[2], 4);
                    NE10_F2I32_FIXDIV (scratch_in[3], 4);
                }

                // radix 4 butterfly with conjugated twiddles

                scratch[0] = scratch_in[0];
                scratch[1].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].i) >> 31);
                scratch[1].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[1].i * scratch_tw[0].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[1].r * scratch_tw[0].i) >> 31);

                scratch[2].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].i) >> 31);
                scratch[2].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[2].i * scratch_tw[1].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[2].r * scratch_tw[1].i) >> 31);

                scratch[3].r = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].r
                                                  + (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].i) >> 31);
                scratch[3].i = (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) scratch_in[3].i * scratch_tw[2].r
                                                  - (NE10_F2I32_SAMPPROD) scratch_in[3].r * scratch_tw[2].i) >> 31);

                // radix 2 butterfly
                scratch[4].r = scratch[0].r + scratch[2].r;
                scratch[4].i = scratch[0].i + scratch[2].i;

                scratch[5].r = scratch[0].r - scratch[2].r;
                scratch[5].i = scratch[0].i - scratch[2].i;

                // radix 2 butterfly
                scratch[6].r = scratch[1].r + scratch[3].r;
                scratch[6].i = scratch[1].i + scratch[3].i;

                scratch[7].r = scratch[1].r - scratch[3].r;
                scratch[7].i = scratch[1].i - scratch[3].i;

                // third result
                scratch_out[2].r = scratch[4].r - scratch[6].r;
                scratch_out[2].i = scratch[4].i - scratch[6].i;

                // first result
                scratch_out[0].r = scratch[4].r + scratch[6].r;
                scratch_out[0].i = scratch[4].i + scratch[6].i;

                // second result
                scratch_out[1].r = scratch[5].r - scratch[7].i;
                scratch_out[1].i = scratch[5].i + scratch[7].r;

                // forth result
                scratch_out[3].r = scratch[5].r + scratch[7].i;
                scratch_out[3].i = scratch[5].i - scratch[7].r;

                // store, outputs separated by N in the final stage
                *Fout1 = scratch_out[0];
                Fout2 = Fout1 + N;
                *Fout2 = scratch_out[1];
                Fout2 += N;
                *Fout2 = scratch_out[2];
                Fout2 += N;
                *Fout2 = scratch_out[3];

                tw1 ++;
                Fin1 ++;
                Fout1 ++;
            } // m_count
        } // f_count
    } // last stage
}
923 
924 static void ne10_fft_split_r2c_1d_int32 (ne10_fft_cpx_int32_t *dst,
925  const ne10_fft_cpx_int32_t *src,
926  ne10_fft_cpx_int32_t *twiddles,
927  ne10_int32_t ncfft,
928  ne10_int32_t scaled_flag)
929 {
930  ne10_int32_t k;
931  ne10_fft_cpx_int32_t fpnk, fpk, f1k, f2k, tw, tdc;
932 
933  tdc.r = src[0].r;
934  tdc.i = src[0].i;
935 
936  if (scaled_flag)
937  NE10_F2I32_FIXDIV (tdc, 2);
938 
939  dst[0].r = tdc.r + tdc.i;
940  dst[ncfft].r = tdc.r - tdc.i;
941  dst[ncfft].i = dst[0].i = 0;
942 
943  for (k = 1; k <= ncfft / 2 ; ++k)
944  {
945  fpk = src[k];
946  fpnk.r = src[ncfft - k].r;
947  fpnk.i = - src[ncfft - k].i;
948  if (scaled_flag)
949  {
950  NE10_F2I32_FIXDIV (fpk, 2);
951  NE10_F2I32_FIXDIV (fpnk, 2);
952  }
953 
954  f1k.r = fpk.r + fpnk.r;
955  f1k.i = fpk.i + fpnk.i;
956 
957  f2k.r = fpk.r - fpnk.r;
958  f2k.i = fpk.i - fpnk.i;
959 
960  tw.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).i) >> 32))) << 1;
961  tw.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.r * (twiddles[k - 1]).i) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) f2k.i * (twiddles[k - 1]).r) >> 32))) << 1;
962 
963  dst[k].r = (f1k.r + tw.r) >> 1;
964  dst[k].i = (f1k.i + tw.i) >> 1;
965  dst[ncfft - k].r = (f1k.r - tw.r) >> 1;
966  dst[ncfft - k].i = (tw.i - f1k.i) >> 1;
967  }
968 }
969 
970 static void ne10_fft_split_c2r_1d_int32 (ne10_fft_cpx_int32_t *dst,
971  const ne10_fft_cpx_int32_t *src,
972  ne10_fft_cpx_int32_t *twiddles,
973  ne10_int32_t ncfft,
974  ne10_int32_t scaled_flag)
975 {
976 
977  ne10_int32_t k;
978  ne10_fft_cpx_int32_t fk, fnkc, fek, fok, tmp;
979 
980 
981  dst[0].r = src[0].r + src[ncfft].r;
982  dst[0].i = src[0].r - src[ncfft].r;
983 
984  if (scaled_flag)
985  NE10_F2I32_FIXDIV (dst[0], 2);
986 
987  for (k = 1; k <= ncfft / 2; k++)
988  {
989  fk = src[k];
990  fnkc.r = src[ncfft - k].r;
991  fnkc.i = -src[ncfft - k].i;
992  if (scaled_flag)
993  {
994  NE10_F2I32_FIXDIV (fk, 2);
995  NE10_F2I32_FIXDIV (fnkc, 2);
996  }
997 
998  fek.r = fk.r + fnkc.r;
999  fek.i = fk.i + fnkc.i;
1000 
1001  tmp.r = fk.r - fnkc.r;
1002  tmp.i = fk.i - fnkc.i;
1003 
1004  fok.r = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).r) >> 32)) + ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).i) >> 32))) << 1;
1005  fok.i = ( ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.i * (twiddles[k - 1]).r) >> 32)) - ( (ne10_int32_t) ( ( (NE10_F2I32_SAMPPROD) tmp.r * (twiddles[k - 1]).i) >> 32))) << 1;
1006 
1007  dst[k].r = fek.r + fok.r;
1008  dst[k].i = fek.i + fok.i;
1009 
1010  dst[ncfft - k].r = fek.r - fok.r;
1011  dst[ncfft - k].i = fok.i - fek.i;
1012  }
1013 }
1014 
1015 
1028 {
1029  ne10_fft_cfg_int32_t st = NULL;
1030  ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
1031  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors */
1032  + sizeof (ne10_fft_cpx_int32_t) * nfft /* twiddle */
1033  + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer */
1034  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
1035 
1036  st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);
1037  if (st)
1038  {
1039  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
1040  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
1041  st->factors = (ne10_int32_t*) address;
1042  st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
1043  st->buffer = st->twiddles + nfft;
1044  st->nfft = nfft;
1045 
1046  ne10_int32_t result = ne10_factor (nfft, st->factors, NE10_FACTOR_DEFAULT);
1047  if (result == NE10_ERR)
1048  {
1049  NE10_FREE (st);
1050  return st;
1051  }
1052 
1053  ne10_int32_t *factors = st->factors;
1054  ne10_fft_cpx_int32_t *twiddles = st->twiddles;
1055 
1056  ne10_fft_generate_twiddles_int32 (twiddles, factors, nfft);
1057  }
1058  return st;
1059 }
1060 
1073  ne10_fft_cpx_int32_t *fin,
1075  ne10_int32_t inverse_fft,
1076  ne10_int32_t scaled_flag)
1077 {
1078  ne10_int32_t stage_count = cfg->factors[0];
1079  ne10_int32_t algorithm_flag = cfg->factors[2 * (stage_count + 1)];
1080 
1081  assert ((algorithm_flag == NE10_FFT_ALG_24)
1082  || (algorithm_flag == NE10_FFT_ALG_ANY));
1083 
1084  switch (algorithm_flag)
1085  {
1086  case NE10_FFT_ALG_24:
1087  if (inverse_fft)
1088  {
1089  ne10_mixed_radix_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1090  }
1091  else
1092  {
1093  ne10_mixed_radix_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1094  }
1095  break;
1096  case NE10_FFT_ALG_ANY:
1097  if (inverse_fft)
1098  {
1099  ne10_mixed_radix_generic_butterfly_inverse_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1100  }
1101  else
1102  {
1103  ne10_mixed_radix_generic_butterfly_int32_c (fout, fin, cfg->factors, cfg->twiddles, cfg->buffer, scaled_flag);
1104  }
1105  break;
1106  }
1107 }
1108  //end of C2C_FFT_IFFT group
1112 
1113 
1126 {
1127  ne10_fft_r2c_cfg_int32_t st = NULL;
1128  ne10_int32_t ncfft = nfft >> 1;
1129 
1130  ne10_uint32_t memneeded = sizeof (ne10_fft_r2c_state_int32_t)
1131  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors */
1132  + sizeof (ne10_fft_cpx_int32_t) * ncfft /* twiddle*/
1133  + sizeof (ne10_fft_cpx_int32_t) * ncfft / 2 /* super twiddles*/
1134  + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer*/
1135  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
1136 
1137  st = (ne10_fft_r2c_cfg_int32_t) NE10_MALLOC (memneeded);
1138 
1139  if (st)
1140  {
1141  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_r2c_state_int32_t);
1142  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
1143  st->factors = (ne10_int32_t*) address;
1144  st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
1145  st->super_twiddles = st->twiddles + ncfft;
1146  st->buffer = st->super_twiddles + (ncfft / 2);
1147  st->ncfft = ncfft;
1148 
1149  ne10_int32_t result = ne10_factor (ncfft, st->factors, NE10_FACTOR_DEFAULT);
1150  if (result == NE10_ERR)
1151  {
1152  NE10_FREE (st);
1153  return st;
1154  }
1155 
1156  ne10_int32_t i, j;
1157  ne10_int32_t *factors = st->factors;
1158  ne10_fft_cpx_int32_t *twiddles = st->twiddles;
1160  ne10_int32_t stage_count = factors[0];
1161  ne10_int32_t fstride1 = factors[1];
1162  ne10_int32_t fstride2 = fstride1 * 2;
1163  ne10_int32_t fstride3 = fstride1 * 3;
1164  ne10_int32_t m;
1165 
1166  const ne10_float32_t pi = NE10_PI;
1167  ne10_float32_t phase1;
1168  ne10_float32_t phase2;
1169  ne10_float32_t phase3;
1170 
1171  for (i = stage_count - 1; i > 0; i--)
1172  {
1173  fstride1 >>= 2;
1174  fstride2 >>= 2;
1175  fstride3 >>= 2;
1176  m = factors[2 * i + 1];
1177  tw = twiddles;
1178  for (j = 0; j < m; j++)
1179  {
1180  phase1 = -2 * pi * fstride1 * j / ncfft;
1181  phase2 = -2 * pi * fstride2 * j / ncfft;
1182  phase3 = -2 * pi * fstride3 * j / ncfft;
1183  tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase1));
1184  tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase1));
1185  (tw + m)->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase2));
1186  (tw + m)->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase2));
1187  (tw + m * 2)->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase3));
1188  (tw + m * 2)->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase3));
1189  tw++;
1190  }
1191  twiddles += m * 3;
1192  }
1193 
1194  tw = st->super_twiddles;
1195  for (i = 0; i < ncfft / 2; i++)
1196  {
1197  phase1 = -pi * ( (ne10_float32_t) (i + 1) / ncfft + 0.5f);
1198  tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos (phase1));
1199  tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin (phase1));
1200  tw++;
1201  }
1202 
1203  }
1204  return st;
1205 }
1206 
1220  ne10_int32_t *fin,
1222  ne10_int32_t scaled_flag)
1223 {
1224  ne10_fft_cpx_int32_t * tmpbuf = cfg->buffer;
1225 
1226  ne10_mixed_radix_butterfly_int32_c (tmpbuf, (ne10_fft_cpx_int32_t*) fin, cfg->factors, cfg->twiddles, fout, scaled_flag);
1227  ne10_fft_split_r2c_1d_int32 (fout, tmpbuf, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1228 }
1229 
1241 void ne10_fft_c2r_1d_int32_c (ne10_int32_t *fout,
1242  ne10_fft_cpx_int32_t *fin,
1244  ne10_int32_t scaled_flag)
1245 {
1246  ne10_fft_cpx_int32_t * tmpbuf1 = cfg->buffer;
1247  ne10_fft_cpx_int32_t * tmpbuf2 = cfg->buffer + cfg->ncfft;
1248 
1249  ne10_fft_split_c2r_1d_int32 (tmpbuf1, fin, cfg->super_twiddles, cfg->ncfft, scaled_flag);
1250  ne10_mixed_radix_butterfly_inverse_int32_c ( (ne10_fft_cpx_int32_t*) fout, tmpbuf1, cfg->factors, cfg->twiddles, tmpbuf2, scaled_flag);
1251 }
1252 
ne10_fft_r2c_1d_int32_c
void ne10_fft_r2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 FFT (real to complex) of int32 data.
Definition: NE10_fft_int32.c:1219
ne10_fft_state_int32_t
Definition: NE10_types.h:334
ne10_fft_cpx_int32_t
structure for the 32 bits fixed point FFT function.
Definition: NE10_types.h:328
ne10_fft_c2c_1d_int32_c
void ne10_fft_c2c_1d_int32_c(ne10_fft_cpx_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_cfg_int32_t cfg, ne10_int32_t inverse_fft, ne10_int32_t scaled_flag)
Mixed radix-2/4 complex FFT/IFFT of 32-bit fixed point data.
Definition: NE10_fft_int32.c:1072
ne10_fft_alloc_r2c_int32
ne10_fft_r2c_cfg_int32_t ne10_fft_alloc_r2c_int32(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft (r2c/c2r).
Definition: NE10_fft_int32.c:1125
ne10_fft_r2c_state_int32_t
Definition: NE10_types.h:345
ne10_fft_alloc_c2c_int32_c
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition: NE10_fft_int32.c:1027
ne10_fft_c2r_1d_int32_c
void ne10_fft_c2r_1d_int32_c(ne10_int32_t *fout, ne10_fft_cpx_int32_t *fin, ne10_fft_r2c_cfg_int32_t cfg, ne10_int32_t scaled_flag)
Mixed radix-2/4 IFFT (complex to real) of int32 data.
Definition: NE10_fft_int32.c:1241