Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_fft.c
1 /*
2  * Copyright 2014-15 ARM Limited and Contributors.
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * * Redistributions of source code must retain the above copyright
8  * notice, this list of conditions and the following disclaimer.
9  * * Redistributions in binary form must reproduce the above copyright
10  * notice, this list of conditions and the following disclaimer in the
11  * documentation and/or other materials provided with the distribution.
12  * * Neither the name of ARM Limited nor the
13  * names of its contributors may be used to endorse or promote products
14  * derived from this software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19  * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
20  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27 
28 /* license of Kiss FFT */
29 /*
30 Copyright (c) 2003-2010, Mark Borgerding
31 
32 All rights reserved.
33 
34 Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
35 
36  * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
37  * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
38  * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
39 
40 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41 */
42 
43 /*
44  * NE10 Library : dsp/NE10_fft.c
45  */
46 
47 #include "NE10_types.h"
48 #include "NE10_macros.h"
49 #include "NE10_fft.h"
50 
51 /* factors buffer:
52  * 0: stage number
53  * 1: stride for the first stage
54  * 2*stage number+2: algorithm flag to imply whether the generic algorithm works.
55  * others: factors
56  *
57  * Only the leading 42 int32 is used to store factors.
58  * The left can be used as algorithm flags, or status flags.
59  * Even the leading bits of stage number can be reused.
60  * */
61 ne10_int32_t ne10_factor (ne10_int32_t n,
62  ne10_int32_t * facbuf,
63  ne10_int32_t ne10_factor_flags)
64 {
65  // This is a walk around. We need to "return" some flags.
66  // Otherwise, we need to modify signature of ne10_factor.
67  assert (NE10_MAXFACTORS >= 32);
68 
69  if ((facbuf == NULL)
70  || (n <= 0))
71  {
72  return NE10_ERR;
73  }
74 
75  ne10_int32_t p;
76  ne10_int32_t i = 1;
77  ne10_int32_t stage_num = 0;
78  ne10_int32_t stride_max = n;
79 
80  // Default algorithm flag is NE10_FFT_ALG_24
81  ne10_int32_t alg_flag = NE10_FFT_ALG_24;
82 
83  // Factor out powers of 4, 2, 5, 3, and other.
84  do
85  {
86  // If NE10_FACTOR_FLAGS has enable NE10_FACTOR_EIGHT.
87  // Try to combine one radix-4 and one radix-2 stages
88  // into one radix-8 stage.
89  if ((ne10_factor_flags & NE10_FACTOR_EIGHT)
90  && ((n==8) || (n==40) || (n==24)))
91  {
92  switch (n)
93  {
94  case 8:
95  p = 8;
96  break;
97  case 24:
98  p = 3;
99  alg_flag = NE10_FFT_ALG_ANY;
100  break;
101  default: // n == 40
102  p = 5;
103  alg_flag = NE10_FFT_ALG_ANY;
104  break;
105  }
106  }
107  else if ((n % 4) == 0)
108  {
109  p = 4;
110  }
111  else if ((n % 2) == 0)
112  {
113  p = 2;
114  }
115  else if ((n % 5) == 0)
116  {
117  p = 5;
118  alg_flag = NE10_FFT_ALG_ANY;
119  }
120  else if ((n % 3) == 0)
121  {
122  p = 3;
123  alg_flag = NE10_FFT_ALG_ANY;
124  }
125  else // stop factoring
126  {
127  p = n;
128  alg_flag = NE10_FFT_ALG_ANY;
129  }
130 
131  n /= p;
132  facbuf[2 * i] = p;
133  facbuf[2 * i + 1] = n;
134  i++;
135  stage_num++;
136  }
137  while (n > 1);
138  facbuf[0] = stage_num;
139  facbuf[1] = stride_max / p;
140 
141  if (stage_num > 21)
142  {
143  // Since nfft is ne10_int32_t, stage_num can never be greater than 21,
144  // because 3^21 > 2^32
145  return NE10_ERR;
146  }
147 
148  facbuf[2 * i] = alg_flag;
149  return NE10_OK;
150 }
151 
152 // Twiddles matrix [radix-1][mstride]
153 // First column (k == 0) is ignored because phase == 1, and
154 // twiddle = (1.0, 0.0).
155 void ne10_fft_generate_twiddles_line_float32 (ne10_fft_cpx_float32_t * twiddles,
156  const ne10_int32_t mstride,
157  const ne10_int32_t fstride,
158  const ne10_int32_t radix,
159  const ne10_int32_t nfft)
160 {
161  ne10_int32_t j, k;
162  ne10_float32_t phase;
163  const ne10_float64_t pi = NE10_PI;
164 
165  for (j = 0; j < mstride; j++)
166  {
167  for (k = 1; k < radix; k++) // phase = 1 when k = 0
168  {
169  phase = -2 * pi * fstride * k * j / nfft;
170  twiddles[mstride * (k - 1) + j].r = (ne10_float32_t) cos (phase);
171  twiddles[mstride * (k - 1) + j].i = (ne10_float32_t) sin (phase);
172  } // radix
173  } // mstride
174 }
175 
176 // Transposed twiddles matrix [mstride][radix-1]
177 // First row (k == 0) is ignored because phase == 1, and
178 // twiddle = (1.0, 0.0).
179 // Transposed twiddle tables are used in RFFT to avoid memory access by a large
180 // stride.
181 void ne10_fft_generate_twiddles_line_transposed_float32 (
182  ne10_fft_cpx_float32_t* twiddles,
183  const ne10_int32_t mstride,
184  const ne10_int32_t fstride,
185  const ne10_int32_t radix,
186  const ne10_int32_t nfft)
187 {
188  ne10_int32_t j, k;
189  ne10_float32_t phase;
190  const ne10_float64_t pi = NE10_PI;
191 
192  for (j = 0; j < mstride; j++)
193  {
194  for (k = 1; k < radix; k++) // phase = 1 when k = 0
195  {
196  phase = -2 * pi * fstride * k * j / nfft;
197  twiddles[(radix - 1) * j + k - 1].r = (ne10_float32_t) cos (phase);
198  twiddles[(radix - 1) * j + k - 1].i = (ne10_float32_t) sin (phase);
199  } // radix
200  } // mstride
201 }
202 
203 // Twiddles matrix [mstride][radix-1]
204 // First column (k == 0)is ignored because phase == 1, and
205 // twiddle = (1.0, 0.0).
206 static void ne10_fft_generate_twiddles_line_int32 (ne10_fft_cpx_int32_t * twiddles,
207  const ne10_int32_t mstride,
208  const ne10_int32_t fstride,
209  const ne10_int32_t radix,
210  const ne10_int32_t nfft)
211 {
212  ne10_int32_t j, k;
213  ne10_float32_t phase;
214  const ne10_float64_t pi = NE10_PI;
215 
216  for (j = 0; j < mstride; j++)
217  {
218  for (k = 1; k < radix; k++) // phase = 1 when k = 0
219  {
220  phase = -2 * pi * fstride * k * j / nfft;
221 
222  ne10_fft_cpx_int32_t *tw = &twiddles[mstride * (k - 1) + j];
223 
224  tw->r = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * cos(phase));
225  tw->i = (ne10_int32_t) floor (0.5f + NE10_F2I32_MAX * sin(phase));
226  } // radix
227  } // mstride
228 }
229 
230 ne10_fft_cpx_int32_t* ne10_fft_generate_twiddles_int32 (ne10_fft_cpx_int32_t * twiddles,
231  const ne10_int32_t * factors,
232  const ne10_int32_t nfft )
233 {
234  ne10_int32_t stage_count = factors[0];
235  ne10_int32_t fstride = factors[1];
236  ne10_int32_t mstride;
237  ne10_int32_t cur_radix; // current radix
238 
239  // for first stage
240  cur_radix = factors[2 * stage_count];
241  if (cur_radix % 2) // current radix is not 4 or 2
242  {
243  twiddles += 1;
244  ne10_fft_generate_twiddles_line_int32 (twiddles, 1, fstride, cur_radix, nfft);
245  twiddles += cur_radix - 1;
246  }
247  stage_count--;
248 
249  // for other stage
250  for (; stage_count > 0; stage_count--)
251  {
252  cur_radix = factors[2 * stage_count];
253  fstride /= cur_radix;
254  mstride = factors[2 * stage_count + 1];
255  ne10_fft_generate_twiddles_line_int32 (twiddles, mstride, fstride, cur_radix, nfft);
256  twiddles += mstride * (cur_radix - 1);
257  } // stage_count
258 
259  return twiddles;
260 }
261 
262 typedef void (*line_generator_float32)(ne10_fft_cpx_float32_t*,
263  const ne10_int32_t,
264  const ne10_int32_t,
265  const ne10_int32_t,
266  const ne10_int32_t);
267 
268 ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_impl_float32 (
269  line_generator_float32 generator,
270  ne10_fft_cpx_float32_t * twiddles,
271  const ne10_int32_t * factors,
272  const ne10_int32_t nfft)
273 {
274  ne10_int32_t stage_count = factors[0];
275  ne10_int32_t fstride = factors[1];
276  ne10_int32_t mstride;
277  ne10_int32_t cur_radix; // current radix
278 
279  // for first stage
280  cur_radix = factors[2 * stage_count];
281  if (cur_radix % 2) // current radix is not 4 or 2
282  {
283  twiddles[0].r = 1.0;
284  twiddles[0].i = 0.0;
285  twiddles += 1;
286  generator (twiddles, 1, fstride, cur_radix, nfft);
287  twiddles += cur_radix - 1;
288  }
289  stage_count --;
290 
291  // for other stage
292  for (; stage_count > 0; stage_count --)
293  {
294  cur_radix = factors[2 * stage_count];
295  fstride /= cur_radix;
296  mstride = factors[2 * stage_count + 1];
297  generator (twiddles, mstride, fstride, cur_radix, nfft);
298  twiddles += mstride * (cur_radix - 1);
299  } // stage_count
300 
301  return twiddles;
302 }
303 
304 ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_float32 (ne10_fft_cpx_float32_t * twiddles,
305  const ne10_int32_t * factors,
306  const ne10_int32_t nfft )
307 {
308  line_generator_float32 generator = ne10_fft_generate_twiddles_line_float32;
309  twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
310  twiddles, factors, nfft);
311  return twiddles;
312 }
313 
314 ne10_fft_cpx_float32_t* ne10_fft_generate_twiddles_transposed_float32 (
315  ne10_fft_cpx_float32_t * twiddles,
316  const ne10_int32_t * factors,
317  const ne10_int32_t nfft)
318 {
319  line_generator_float32 generator =
320  ne10_fft_generate_twiddles_line_transposed_float32;
321  twiddles = ne10_fft_generate_twiddles_impl_float32(generator,
322  twiddles, factors, nfft);
323  return twiddles;
324 }
325 
338 {
339  // For input shorter than 16, fall back to c version.
340  // We would not get much improvement from NEON for these cases.
341  if (nfft < 16)
342  {
343  return ne10_fft_alloc_c2c_float32_c (nfft);
344  }
345 
346  ne10_fft_cfg_float32_t st = NULL;
347  ne10_uint32_t memneeded = sizeof (ne10_fft_state_float32_t)
348  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/
349  + sizeof (ne10_fft_cpx_float32_t) * nfft /* twiddle*/
350  + sizeof (ne10_fft_cpx_float32_t) * nfft /* buffer*/
351  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
352 
353  st = (ne10_fft_cfg_float32_t) NE10_MALLOC (memneeded);
354 
355  // Only backward FFT is scaled by default.
356  st->is_forward_scaled = 0;
357  st->is_backward_scaled = 1;
358 
359  // Bad allocation.
360  if (st == NULL)
361  {
362  return st;
363  }
364 
365  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_float32_t);
366  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
367  st->factors = (ne10_int32_t*) address;
368  st->twiddles = (ne10_fft_cpx_float32_t*) (st->factors + (NE10_MAXFACTORS * 2));
369  st->buffer = st->twiddles + nfft;
370 
371  // st->last_twiddles is default NULL.
372  // Calling fft_c or fft_neon is decided by this pointers.
373  st->last_twiddles = NULL;
374 
375  st->nfft = nfft;
376  if (nfft % NE10_FFT_PARA_LEVEL == 0)
377  {
378  // Size of FFT satisfies requirement of NEON optimization.
379  st->nfft /= NE10_FFT_PARA_LEVEL;
380  st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
381  }
382 
383  ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
384 
385  // Can not factor.
386  if (result == NE10_ERR)
387  {
388  NE10_FREE (st);
389  return st;
390  }
391 
392  // Check if radix-8 can be enabled
393  ne10_int32_t stage_count = st->factors[0];
394  ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
395 
396  // Enable radix-8.
397  if (algorithm_flag == NE10_FFT_ALG_ANY)
398  {
399  result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
400  if (result == NE10_ERR)
401  {
402  NE10_FREE (st);
403  return st;
404  }
405  ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft);
406  }
407  else
408  {
409  st->last_twiddles = NULL;
410  st->nfft = nfft;
411  result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
412  ne10_fft_generate_twiddles_float32 (st->twiddles, st->factors, st->nfft);
413  return st;
414  }
415 
416  // Generate super twiddles for the last stage.
417  if (nfft % NE10_FFT_PARA_LEVEL == 0)
418  {
419  // Size of FFT satisfies requirement of NEON optimization.
420  ne10_fft_generate_twiddles_line_float32 (st->last_twiddles,
421  st->nfft,
422  1,
423  NE10_FFT_PARA_LEVEL,
424  nfft);
425  }
426  return st;
427 }
428 
436 {
437  // For input shorter than 16, fall back to c version.
438  // We would not get much improvement from NEON for these cases.
439  if (nfft < 16)
440  {
441  return ne10_fft_alloc_c2c_int32_c (nfft);
442  }
443 
444  ne10_fft_cfg_int32_t st = NULL;
445  ne10_uint32_t memneeded = sizeof (ne10_fft_state_int32_t)
446  + sizeof (ne10_int32_t) * (NE10_MAXFACTORS * 2) /* factors*/
447  + sizeof (ne10_fft_cpx_int32_t) * nfft /* twiddle*/
448  + sizeof (ne10_fft_cpx_int32_t) * nfft /* buffer*/
449  + NE10_FFT_BYTE_ALIGNMENT; /* 64-bit alignment*/
450 
451  st = (ne10_fft_cfg_int32_t) NE10_MALLOC (memneeded);
452 
453  // Bad allocation.
454  if (st == NULL)
455  {
456  return st;
457  }
458 
459  uintptr_t address = (uintptr_t) st + sizeof (ne10_fft_state_int32_t);
460  NE10_BYTE_ALIGNMENT (address, NE10_FFT_BYTE_ALIGNMENT);
461  st->factors = (ne10_int32_t*) address;
462  st->twiddles = (ne10_fft_cpx_int32_t*) (st->factors + (NE10_MAXFACTORS * 2));
463  st->buffer = st->twiddles + nfft;
464 
465  // st->last_twiddles is default NULL.
466  // Calling fft_c or fft_neon is decided by this pointers.
467  st->last_twiddles = NULL;
468 
469  st->nfft = nfft;
470  if (nfft % NE10_FFT_PARA_LEVEL == 0)
471  {
472  // Size of FFT satisfies requirement of NEON optimization.
473  st->nfft /= NE10_FFT_PARA_LEVEL;
474  st->last_twiddles = st->twiddles + nfft / NE10_FFT_PARA_LEVEL;
475  }
476 
477  ne10_int32_t result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
478 
479  // Can not factor.
480  if (result == NE10_ERR)
481  {
482  NE10_FREE (st);
483  return st;
484  }
485 
486  // Check if radix-8 can be enabled
487  ne10_int32_t stage_count = st->factors[0];
488  ne10_int32_t algorithm_flag = st->factors[2 * (stage_count + 1)];
489 
490  // Enable radix-8.
491  if (algorithm_flag == NE10_FFT_ALG_ANY)
492  {
493  result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_EIGHT);
494  if (result == NE10_ERR)
495  {
496  NE10_FREE (st);
497  return st;
498  }
499  ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
500  }
501  else
502  {
503  st->last_twiddles = NULL;
504  st->nfft = nfft;
505  result = ne10_factor (st->nfft, st->factors, NE10_FACTOR_DEFAULT);
506  ne10_fft_generate_twiddles_int32 (st->twiddles, st->factors, st->nfft);
507  return st;
508  }
509 
510  // Generate super twiddles for the last stage.
511  if (nfft % NE10_FFT_PARA_LEVEL == 0)
512  {
513  // Size of FFT satisfies requirement of NEON optimization.
514  ne10_fft_generate_twiddles_line_int32 (st->last_twiddles,
515  st->nfft,
516  1,
517  NE10_FFT_PARA_LEVEL,
518  nfft);
519  }
520  return st;
521 }
522 
530 void ne10_fft_destroy_c2c_float32 (ne10_fft_cfg_float32_t cfg)
531 {
532  free(cfg);
533 }
534 
535 void ne10_fft_destroy_c2c_int32 (ne10_fft_cfg_int32_t cfg)
536 {
537  free (cfg);
538 }
539 
540 void ne10_fft_destroy_c2c_int16 (ne10_fft_cfg_int16_t cfg)
541 {
542  free (cfg);
543 }
544  //end of C2C_FFT_IFFT_DESTROY group
548  //end of C2C_FFT_IFFT group
552 
565 void ne10_fft_destroy_r2c_float32 (ne10_fft_r2c_cfg_float32_t cfg)
566 {
567  free(cfg);
568 }
569 
570 void ne10_fft_destroy_r2c_int32 (ne10_fft_r2c_cfg_int32_t cfg)
571 {
572  free (cfg);
573 }
574 
575 void ne10_fft_destroy_r2c_int16 (ne10_fft_r2c_cfg_int16_t cfg)
576 {
577  free (cfg);
578 }
579  //end of R2C_FFT_IFFT_DESTROY group
583  //end of R2C_FFT_IFFT group
ne10_fft_state_float32_t
structure for the floating point FFT state
Definition: NE10_types.h:240
ne10_fft_alloc_c2c_float32_neon
ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_neon(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition: NE10_fft.c:337
ne10_fft_state_int32_t
Definition: NE10_types.h:334
ne10_fft_cpx_float32_t
Definition: NE10_types.h:230
ne10_fft_alloc_c2c_int32_neon
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_neon(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition: NE10_fft.c:435
ne10_fft_cpx_int32_t
structure for the 32 bits fixed point FFT function.
Definition: NE10_types.h:328
ne10_fft_state_int16_t
Definition: NE10_types.h:303
ne10_fft_alloc_c2c_float32_c
ne10_fft_cfg_float32_t ne10_fft_alloc_c2c_float32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition: NE10_fft_float32.c:997
ne10_fft_r2c_state_int32_t
Definition: NE10_types.h:345
ne10_fft_alloc_c2c_int32_c
ne10_fft_cfg_int32_t ne10_fft_alloc_c2c_int32_c(ne10_int32_t nfft)
User-callable function to allocate all necessary storage space for the fft.
Definition: NE10_fft_int32.c:1027
ne10_fft_r2c_state_int16_t
Definition: NE10_types.h:313
ne10_fft_r2c_state_float32_t
Definition: NE10_types.h:272
ne10_fft_state_float32_t::is_forward_scaled
ne10_int32_t is_forward_scaled
@biref Flag to control scaling behaviour in forward floating point complex FFT.
Definition: NE10_types.h:255
ne10_fft_state_float32_t::is_backward_scaled
ne10_int32_t is_backward_scaled
@biref Flag to control scaling behaviour in backward floating point complex FFT.
Definition: NE10_types.h:264