47#ifndef NE10_FFT_GENERIC_FLOAT32_H
48#define NE10_FFT_GENERIC_FLOAT32_H
50#include "NE10_types.h"
51#include "NE10_macros.h"
// Fragment: the enclosing signature is not visible in this excerpt
// (presumably the body of the FFT2_MUL_TW helper that a later fragment calls -- confirm).
// Element 0 is copied through unchanged; element 1 is handed to NE10_CPX_MUL_F32
// together with scratch_tw[0]. NE10_CPX_MUL_F32 is defined elsewhere; presumably it
// stores the complex product of its 2nd and 3rd arguments in its 1st -- TODO confirm.
61 scratch_out[0] = scratch_in[0];
62 NE10_CPX_MUL_F32 (scratch_out[1], scratch_in[1], scratch_tw[0]);
// Fragment (enclosing signature not visible; presumably FFT3_MUL_TW -- confirm).
// Delegates to FFT2_MUL_TW with the same three arguments, then applies
// NE10_CPX_MUL_F32 to element 2 using scratch_tw[1], i.e. twiddle index = element index - 1.
69 FFT2_MUL_TW (scratch_out, scratch_in, scratch_tw);
70 NE10_CPX_MUL_F32 (scratch_out[2], scratch_in[2], scratch_tw[1]);
// Fragment (enclosing signature not visible; presumably FFT4_MUL_TW -- confirm).
// Delegates to FFT3_MUL_TW with the same three arguments, then applies
// NE10_CPX_MUL_F32 to element 3 using scratch_tw[2].
77 FFT3_MUL_TW (scratch_out, scratch_in, scratch_tw);
78 NE10_CPX_MUL_F32 (scratch_out[3], scratch_in[3], scratch_tw[2]);
// Rows of a constant table of number pairs. The declaration and any preceding rows are
// not visible in this excerpt (presumably this is the TW_8 table indexed further down,
// with each row read as { r, i } -- confirm).
// Read that way, the rows equal (cos, sin) of -45, -90 and -135 degrees, rounded to
// five decimals.
// NOTE(review): 0.70711 differs from 1/sqrt(2) = 0.70710678... by about 3e-6, which is
// larger than single-precision epsilon (~1.2e-7). If these values feed float32 math,
// full-precision literals would be more accurate -- confirm whether the rounding is intended.
88 { 0.70711, -0.70711 },
89 { 0.00000, -1.00000 },
90 { -0.70711, -0.70711 },
// NE10_BUTTERFLY_INDEX_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J)
//   Invokes NE10_CPX_ADD on (OUT[OUT_I], IN[IN_I], IN[IN_J]) and
//   NE10_CPX_SUB on (OUT[OUT_J], IN[IN_I], IN[IN_J]).
//   Both helpers are defined elsewhere; presumably the result is
//   OUT[OUT_I] = IN[IN_I] + IN[IN_J] and OUT[OUT_J] = IN[IN_I] - IN[IN_J] -- TODO confirm.
//   NOTE(review): the macro arguments are used unparenthesized and IN_I / IN_J appear
//   twice in the expansion, so callers should pass only plain identifiers and constants
//   (as every visible use does).
//   Any wrapper lines of the macro body are not visible in this excerpt.
// The four invocations after the definition pair in[k] with in[k+4] for k = 0..3:
// the ADD result goes to s[k] and the SUB result to s[k+4].
93#define NE10_BUTTERFLY_INDEX_F32(OUT,IN,OUT_I,OUT_J,IN_I,IN_J) \
95 NE10_CPX_ADD (OUT[OUT_I],IN[IN_I],IN[IN_J]); \
96 NE10_CPX_SUB (OUT[OUT_J],IN[IN_I],IN[IN_J]); \
102 NE10_BUTTERFLY_INDEX_F32 (s,in,0,4,0,4);
103 NE10_BUTTERFLY_INDEX_F32 (s,in,1,5,1,5);
104 NE10_BUTTERFLY_INDEX_F32 (s,in,2,6,2,6);
105 NE10_BUTTERFLY_INDEX_F32 (s,in,3,7,3,7);
// NE10_CPX_MUL_TW8_F32(OUT,TW_8_TABLE,OUT_I,TW_J)
//   Copies TW_8_TABLE[TW_J] into a local TW_TMP, then calls NE10_CPX_MUL_F32 with
//   OUT[OUT_I] as both the first and second argument and TW_TMP as the third --
//   presumably an in-place complex multiply OUT[OUT_I] *= TW_8_TABLE[TW_J] (confirm
//   against the definition of NE10_CPX_MUL_F32, which is not in this excerpt).
//   Any wrapper lines of the macro body are not visible in this excerpt.
// The first four uses after the definition multiply s[4], s[5], s[6], s[7] by
// TW_8[0], TW_8[1], TW_8[2], TW_8[3] respectively.
112#define NE10_CPX_MUL_TW8_F32(OUT,TW_8_TABLE,OUT_I,TW_J) \
114 ne10_fft_cpx_float32_t TW_TMP = TW_8_TABLE[TW_J]; \
115 NE10_CPX_MUL_F32 (OUT[OUT_I],OUT[OUT_I],TW_TMP); \
118 NE10_CPX_MUL_TW8_F32 (s,TW_8,4,0);
119 NE10_CPX_MUL_TW8_F32 (s,TW_8,5,1);
120 NE10_CPX_MUL_TW8_F32 (s,TW_8,6,2);
121 NE10_CPX_MUL_TW8_F32 (s,TW_8,7,3);
// Butterflies from s into out, pairing elements two apart: (0,2), (1,3), (4,6), (5,7).
// Source and destination indices are the same for each pair.
123 NE10_BUTTERFLY_INDEX_F32 (out,s,0,2,0,2);
124 NE10_BUTTERFLY_INDEX_F32 (out,s,1,3,1,3);
125 NE10_BUTTERFLY_INDEX_F32 (out,s,4,6,4,6);
126 NE10_BUTTERFLY_INDEX_F32 (out,s,5,7,5,7);
// Only out[2], out[3], out[6], out[7] are multiplied here:
// out[2] and out[6] by TW_8[0], out[3] and out[7] by TW_8[2].
132 NE10_CPX_MUL_TW8_F32 (out,TW_8,2,0);
133 NE10_CPX_MUL_TW8_F32 (out,TW_8,3,2);
134 NE10_CPX_MUL_TW8_F32 (out,TW_8,6,0);
135 NE10_CPX_MUL_TW8_F32 (out,TW_8,7,2);
// The twiddle-multiply helper is removed here; no later visible line uses it.
136#undef NE10_CPX_MUL_TW8_F32
// Butterflies on adjacent elements of out, written back into s at different indices:
//   (out[0], out[1]) -> s[0] (ADD), s[4] (SUB)
//   (out[2], out[3]) -> s[2] (ADD), s[6] (SUB)
//   (out[4], out[5]) -> s[1] (ADD), s[5] (SUB)
//   (out[6], out[7]) -> s[3] (ADD), s[7] (SUB)
// What happens to s afterwards is not visible in this excerpt.
138 NE10_BUTTERFLY_INDEX_F32 (s,out,0,4,0,1);
139 NE10_BUTTERFLY_INDEX_F32 (s,out,2,6,2,3);
140 NE10_BUTTERFLY_INDEX_F32 (s,out,1,5,4,5);
141 NE10_BUTTERFLY_INDEX_F32 (s,out,3,7,6,7);
// Fragment (enclosing signature not visible; presumably FFT5_MUL_TW -- confirm).
// Delegates to FFT4_MUL_TW with the same three arguments, then applies
// NE10_CPX_MUL_F32 to element 4 using scratch_tw[3].
158 FFT4_MUL_TW (scratch_out, scratch_in, scratch_tw);
159 NE10_CPX_MUL_F32 (scratch_out[4], scratch_in[4], scratch_tw[3]);
// Fragment (enclosing signature not visible): a two-element butterfly.
// scratch_out[0] receives NE10_CPX_ADD of inputs 0 and 1, scratch_out[1] receives
// NE10_CPX_SUB of the same pair (presumably the complex sum and difference -- the
// helper macros are defined elsewhere).
171 NE10_CPX_ADD (scratch_out[0], scratch_in[0], scratch_in[1]);
172 NE10_CPX_SUB (scratch_out[1], scratch_in[0], scratch_in[1]);
// Fragment (enclosing signature and local declarations not visible): a three-element
// butterfly that reads Fin[0..2] and writes Fout[0..2]. The arithmetic has the shape of
// a 3-point DFT, assuming TW_3I_F32 (defined elsewhere) is the matching sine constant --
// TODO confirm.

// Load the three inputs into local working storage.
181 scratch_in[0] = Fin[0];
182 scratch_in[1] = Fin[1];
183 scratch_in[2] = Fin[2];
// Copy inputs 1 and 2 so they stay available while scratch_in[1..2] are overwritten.
185 scratch[1] = scratch_in[1];
186 scratch[2] = scratch_in[2];
// scratch[3] = in1 + in2, scratch[0] = in1 - in2 (assuming ADD/SUB store into arg 1).
188 NE10_CPX_ADD (scratch[3], scratch[1], scratch[2]);
189 NE10_CPX_SUB (scratch[0], scratch[1], scratch[2]);
// Shared term for outputs 1 and 2: in0 - (in1 + in2) / 2.
// NOTE(review): 0.5 is a double literal, so if .r/.i are float this multiply and
// subtract are done in double and narrowed on assignment; 0.5f would keep it in float.
191 scratch_in[1].r = scratch_in[0].r - scratch[3].r * 0.5;
192 scratch_in[1].i = scratch_in[0].i - scratch[3].i * 0.5;
// Scale the difference term by -TW_3I_F32 (both components).
194 scratch[0].r *= -TW_3I_F32;
195 scratch[0].i *= -TW_3I_F32;
// Output 0: in0 + in1 + in2.
197 scratch_in[0].r += scratch[3].r;
198 scratch_in[0].i += scratch[3].i;
// Output 2: shared term plus (scratch[0].i, -scratch[0].r), i.e. scratch[0] times -j.
// Must be computed before scratch_in[1] is modified below.
200 scratch_in[2].r = scratch_in[1].r + scratch[0].i;
201 scratch_in[2].i = scratch_in[1].i - scratch[0].r;
// Output 1: shared term plus (-scratch[0].i, scratch[0].r), i.e. scratch[0] times +j.
203 scratch_in[1].r -= scratch[0].i;
204 scratch_in[1].i += scratch[0].r;
// Store the three results.
206 Fout[0] = scratch_in[0];
207 Fout[1] = scratch_in[1];
208 Fout[2] = scratch_in[2];
// Fragment (enclosing signature not visible): a four-element butterfly from
// scratch_in[0..3] to scratch_out[0..3], using scratch[0..3] as temporaries.
// Assumes NE10_CPX_ADD / NE10_CPX_SUB store A+B / A-B in their first argument (confirm).

// First stage: pair elements two apart.
//   scratch[0] = in0 + in2, scratch[1] = in0 - in2
//   scratch[2] = in1 + in3, scratch[3] = in1 - in3
216 NE10_CPX_ADD (scratch[0], scratch_in[0], scratch_in[2]);
217 NE10_CPX_SUB (scratch[1], scratch_in[0], scratch_in[2]);
218 NE10_CPX_ADD (scratch[2], scratch_in[1], scratch_in[3]);
219 NE10_CPX_SUB (scratch[3], scratch_in[1], scratch_in[3]);
// Even outputs: out2 = scratch[0] - scratch[2], out0 = scratch[0] + scratch[2].
221 NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
222 NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
// Odd outputs combine scratch[1] with scratch[3] rotated by a quarter turn:
//   out1 = scratch[1] + ( scratch[3].i, -scratch[3].r)   (scratch[3] times -j)
//   out3 = scratch[1] + (-scratch[3].i,  scratch[3].r)   (scratch[3] times +j)
224 scratch_out[1].r = scratch[1].r + scratch[3].i;
225 scratch_out[1].i = scratch[1].i - scratch[3].r;
226 scratch_out[3].r = scratch[1].r - scratch[3].i;
227 scratch_out[3].i = scratch[1].i + scratch[3].r;
// Fragment (enclosing signature not visible): the same four-element butterfly
// arithmetic, but reading from and writing back to scratch_out[0..3]. All four inputs
// are consumed into scratch[0..3] before any element of scratch_out is overwritten.
// Assumes NE10_CPX_ADD / NE10_CPX_SUB store A+B / A-B in their first argument (confirm).

// First stage: pair elements two apart.
//   scratch[0] = x0 + x2, scratch[1] = x0 - x2
//   scratch[2] = x1 + x3, scratch[3] = x1 - x3
234 NE10_CPX_ADD (scratch[0], scratch_out[0], scratch_out[2]);
235 NE10_CPX_SUB (scratch[1], scratch_out[0], scratch_out[2]);
236 NE10_CPX_ADD (scratch[2], scratch_out[1], scratch_out[3]);
237 NE10_CPX_SUB (scratch[3], scratch_out[1], scratch_out[3]);
// Even outputs: element 2 = scratch[0] - scratch[2], element 0 = scratch[0] + scratch[2].
239 NE10_CPX_SUB (scratch_out[2], scratch[0], scratch[2]);
240 NE10_CPX_ADD (scratch_out[0], scratch[0], scratch[2]);
// Odd outputs: scratch[1] plus scratch[3] times -j (element 1) or +j (element 3).
242 scratch_out[1].r = scratch[1].r + scratch[3].i;
243 scratch_out[1].i = scratch[1].i - scratch[3].r;
244 scratch_out[3].r = scratch[1].r - scratch[3].i;
245 scratch_out[3].i = scratch[1].i + scratch[3].r;
// Fragment (enclosing signature and local declarations not visible): a five-element
// butterfly that reads Fin[0..4] and writes Fout[0..4]. TW_5A_F32 and TW_5B_F32 are
// constants with .r / .i members defined elsewhere, and NE10_S_MUL is a helper defined
// elsewhere (presumably a scalar multiply). The arithmetic has the shape of a 5-point
// DFT if those constants are the corresponding twiddle factors -- TODO confirm.

// Load the five inputs into local working storage.
253 scratch_in[0] = Fin[0];
254 scratch_in[1] = Fin[1];
255 scratch_in[2] = Fin[2];
256 scratch_in[3] = Fin[3];
257 scratch_in[4] = Fin[4];
// Keep an untouched copy of all inputs: scratch_in[] is overwritten with results below,
// and scratch[0] in particular is still needed after scratch_in[0] is updated.
259 scratch[0] = scratch_in[0];
260 scratch[1] = scratch_in[1];
261 scratch[2] = scratch_in[2];
262 scratch[3] = scratch_in[3];
263 scratch[4] = scratch_in[4];
// Sums and differences of the mirrored pairs (1,4) and (2,3):
//   scratch[7] = x1 + x4, scratch[10] = x1 - x4
//   scratch[8] = x2 + x3, scratch[9]  = x2 - x3
265 NE10_CPX_ADD (scratch[ 7], scratch[1], scratch[4]);
266 NE10_CPX_SUB (scratch[10], scratch[1], scratch[4]);
267 NE10_CPX_ADD (scratch[ 8], scratch[2], scratch[3]);
268 NE10_CPX_SUB (scratch[ 9], scratch[2], scratch[3]);
// Output 0: x0 + x1 + x2 + x3 + x4.
270 scratch_in[0].r += scratch[7].r + scratch[8].r;
271 scratch_in[0].i += scratch[7].i + scratch[8].i;
// scratch[5]: x0 plus the pair sums weighted by the real parts (A for pair 1/4, B for pair 2/3).
273 scratch[5].r = scratch[0].r
274 + NE10_S_MUL (scratch[7].r, TW_5A_F32.r)
275 + NE10_S_MUL (scratch[8].r, TW_5B_F32.r);
276 scratch[5].i = scratch[0].i
277 + NE10_S_MUL (scratch[7].i, TW_5A_F32.r)
278 + NE10_S_MUL (scratch[8].i, TW_5B_F32.r);
// scratch[6]: the pair differences weighted by the imaginary parts, with real and
// imaginary components swapped and the new imaginary component negated.
280 scratch[6].r = NE10_S_MUL (scratch[10].i, TW_5A_F32.i)
281 + NE10_S_MUL (scratch[9].i, TW_5B_F32.i);
282 scratch[6].i = -NE10_S_MUL (scratch[10].r, TW_5A_F32.i)
283 - NE10_S_MUL (scratch[9].r, TW_5B_F32.i);
// Outputs 1 and 4 are the difference and sum of the two partial terms.
285 NE10_CPX_SUB (scratch_in[1], scratch[5], scratch[6]);
286 NE10_CPX_ADD (scratch_in[4], scratch[5], scratch[6]);
// scratch[11]: same form as scratch[5] with the roles of A and B exchanged.
288 scratch[11].r = scratch[0].r
289 + NE10_S_MUL (scratch[7].r, TW_5B_F32.r)
290 + NE10_S_MUL (scratch[8].r, TW_5A_F32.r);
291 scratch[11].i = scratch[0].i
292 + NE10_S_MUL (scratch[7].i, TW_5B_F32.r)
293 + NE10_S_MUL (scratch[8].i, TW_5A_F32.r);
// scratch[12]: same form as scratch[6] with A and B exchanged; note the signs on the
// scratch[10] terms are flipped relative to scratch[6].
295 scratch[12].r = -NE10_S_MUL (scratch[10].i, TW_5B_F32.i)
296 + NE10_S_MUL (scratch[9].i, TW_5A_F32.i);
297 scratch[12].i = NE10_S_MUL (scratch[10].r, TW_5B_F32.i)
298 - NE10_S_MUL (scratch[9].r, TW_5A_F32.i);
// Outputs 2 and 3 are the sum and difference of the second pair of partial terms.
300 NE10_CPX_ADD (scratch_in[2], scratch[11], scratch[12]);
301 NE10_CPX_SUB (scratch_in[3], scratch[11], scratch[12]);
// Store the five results.
303 Fout[0] = scratch_in[0];
304 Fout[1] = scratch_in[1];
305 Fout[2] = scratch_in[2];
306 Fout[3] = scratch_in[3];
307 Fout[4] = scratch_in[4];