Project Ne10
An Open Optimized Software Library Project for the ARM Architecture
NE10_rfft_float32.neonintrinsic.c
/*
 * Copyright 2014-15 ARM Limited and Contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of ARM Limited nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY ARM LIMITED AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL ARM LIMITED AND CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* License of Kiss FFT */
/*
Copyright (c) 2003-2010, Mark Borgerding

All rights reserved.

Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/*
 * NE10 Library : dsp/NE10_rfft_float32.neonintrinsic.c
 */

#include <arm_neon.h>

#include "NE10_types.h"
#include "NE10_macros.h"
#include "NE10_fft.h"
#include "NE10_dsp.h"
#include "NE10_fft.neonintrinsic.h"

NE10_INLINE void ne10_radix8x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8(float32x4_t,q_in);
    NE10_DECLARE_8(float32x4_t,q_out);

    const float32x4_t *Fin_neon = (float32x4_t*) Fin; // 8 x fstride
    float32x4_t *Fout_neon = (float32x4_t*) Fout; // fstride x 8

    for (f_count = fstride; f_count > 0; f_count --)
    {
        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = fstride
        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);

        // print q_in0 ~ q_in7
        // NE10_PRINT_Qx8_VECTOR(q_in);

        // do r2c fft, size = 8
        NE10_RADIX8x4_R2C_NEON_KERNEL(q_out,q_in);

        // print q_out0 ~ q_out7
        // NE10_PRINT_Qx8_VECTOR(q_out);

        // store q_out0 ~ q_out7 to Fout_neon, by step = 1
        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,1);

        Fin_neon = Fin_neon - fstride * 8 + 1;
        Fout_neon += 8; // next column
    }
}

NE10_INLINE void ne10_radix8x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    NE10_DECLARE_8(float32x4_t,q_in);
    NE10_DECLARE_8(float32x4_t,q_out);

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    for (f_count = fstride; f_count > 0; f_count --)
    {
        // from Fin_neon load 8 float32x4_t into q_in0 ~ q_in7, by step = 1
        NE10_RADIX8x4_R2C_NEON_LOAD(Fin_neon,q_in,1);

        // NE10_PRINT_Qx8_VECTOR(q_in);

        NE10_RADIX8x4_C2R_NEON_KERNEL(q_out,q_in);

        // NE10_PRINT_Qx8_VECTOR(q_out);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
        q_out4 = vmulq_f32(q_out4,one_by_N_neon);
        q_out5 = vmulq_f32(q_out5,one_by_N_neon);
        q_out6 = vmulq_f32(q_out6,one_by_N_neon);
        q_out7 = vmulq_f32(q_out7,one_by_N_neon);
#endif

        // store
        NE10_RADIX8x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);

        Fout_neon ++;
    }
}
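
/*
 * Editorial note on NE10_DSP_RFFT_SCALING: the nfft passed to these kernels
 * is radix * fstride of the factored transform rather than the length of the
 * original real sequence (see the "not the real nfft" comment further down),
 * so the 0.25f / nfft factor appears to realise the usual 1/N normalisation
 * of the inverse. Note that 0.25 / nfft is evaluated in double precision and
 * then narrowed to float32.
 */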

NE10_INLINE void ne10_radix4x4_r2c_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    for (f_count = 0; f_count < fstride; f_count ++)
    {
        NE10_DECLARE_4(float32x4_t,q_in);
        NE10_DECLARE_4(float32x4_t,q_out);

        // load
        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,fstride);

        NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in)

        // store
        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,1);

        Fin_neon = Fin_neon - 4*fstride + 1;
        Fout_neon += 4;
    }
}

NE10_INLINE void ne10_radix4x4_c2r_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft)
{
    ne10_int32_t f_count;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;

    const ne10_float32_t one_by_N = 0.25 / nfft;
    const float32x4_t one_by_N_neon = vdupq_n_f32(one_by_N);

    for (f_count = 0; f_count < fstride; f_count ++)
    {
        NE10_DECLARE_4(float32x4_t,q_in);
        NE10_DECLARE_4(float32x4_t,q_out);

        // load
        NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,1);

        // NE10_PRINT_Qx4_VECTOR(q_in);

        NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in)

        // NE10_PRINT_Qx4_VECTOR(q_out);

#ifdef NE10_DSP_RFFT_SCALING
        q_out0 = vmulq_f32(q_out0,one_by_N_neon);
        q_out1 = vmulq_f32(q_out1,one_by_N_neon);
        q_out2 = vmulq_f32(q_out2,one_by_N_neon);
        q_out3 = vmulq_f32(q_out3,one_by_N_neon);
#endif

        // store
        NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,fstride);

        Fout_neon ++;
    }
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);

    NE10_RADIX4x4_R2C_NEON_KERNEL(q_out,q_in);

    // store
    vst1q_f32( (ne10_float32_t*) (Fout_neon                          ), q_out0);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) - 1    ), q_out1);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)        ), q_out2);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 2 * (out_step << 1) - 1), q_out3);
}

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon                          ) );
    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) - 1    ) );
    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)        ) );
    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2 * (out_step << 1) - 1) );

    // NE10_PRINT_Qx4_VECTOR(q_in);

    NE10_RADIX4x4_C2R_NEON_KERNEL(q_out,q_in);

    // NE10_PRINT_Qx4_VECTOR(q_out);

    // store
    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step>>1) -1;
    float32x4_t *Fout_b = Fout_neon + (((out_step<<1)-1)<<1) - 2; // reversed

    NE10_DECLARE_3(float32x4x2_t,q2_tw);
    NE10_DECLARE_4(float32x4x2_t,q2_in);
    NE10_DECLARE_4(float32x4x2_t,q2_out);

    for (m_count = loop_count; m_count > 0; m_count -- )
    {
#ifndef NE10_INLINE_ASM_OPT
        // load
        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step    ) );
        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 0*in_step + 1) );

        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step    ) );
        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1*in_step + 1) );

        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step    ) );
        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 2*in_step + 1) );

        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step    ) );
        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) (Fin_neon + 3*in_step + 1) );

        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

        // R2C TW KERNEL
        NE10_RADIX4x4_R2C_TW_MUL_NEON (q2_out, q2_in, q2_tw);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        const ne10_float32_t *ptr_inr = ((const ne10_float32_t *) Fin_neon);
        const ne10_float32_t *ptr_ini = ((const ne10_float32_t *) Fin_neon) + 4;
        asm volatile (
            "ld1 {%[q2_out0r].4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {%[q2_out0i].4s}, [%[ptr_ini]] \n\t"
            "ld1 {v10.4s, v11.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v12.4s, v13.4s}, [%[ptr_inr]], %[offset_in] \n\t"
            "ld1 {v14.4s, v15.4s}, [%[ptr_inr]] \n\t"
            "ld1 {v0.1d, v1.1d, v2.1d}, [%[ptr_tw]] \n\t"

            "fmul %[q2_out1r].4s, v10.4s, v0.4s[0] \n\t" // RR
            "fmul %[q2_out1i].4s, v10.4s, v0.4s[1] \n\t" // RI
            "fmls %[q2_out1r].4s, v11.4s, v0.4s[1] \n\t" // RR - II
            "fmla %[q2_out1i].4s, v11.4s, v0.4s[0] \n\t" // RI + IR

            "fmul %[q2_out2r].4s, v12.4s, v1.4s[0] \n\t" // RR
            "fmul %[q2_out2i].4s, v12.4s, v1.4s[1] \n\t" // RI
            "fmls %[q2_out2r].4s, v13.4s, v1.4s[1] \n\t" // RR - II
            "fmla %[q2_out2i].4s, v13.4s, v1.4s[0] \n\t" // RI + IR

            "fmul %[q2_out3r].4s, v14.4s, v2.4s[0] \n\t" // RR
            "fmul %[q2_out3i].4s, v14.4s, v2.4s[1] \n\t" // RI
            "fmls %[q2_out3r].4s, v15.4s, v2.4s[1] \n\t" // RR - II
            "fmla %[q2_out3i].4s, v15.4s, v2.4s[0] \n\t" // RI + IR
            : [q2_out0r]"+w"(q2_out0.val[0]),
              [q2_out0i]"+w"(q2_out0.val[1]),
              [q2_out1r]"+w"(q2_out1.val[0]),
              [q2_out1i]"+w"(q2_out1.val[1]),
              [q2_out2r]"+w"(q2_out2.val[0]),
              [q2_out2i]"+w"(q2_out2.val[1]),
              [q2_out3r]"+w"(q2_out3.val[0]),
              [q2_out3i]"+w"(q2_out3.val[1]),
              [ptr_inr]"+r"(ptr_inr),
              [ptr_ini]"+r"(ptr_ini)
            : [offset_in]"r"(in_step * 16),
              [ptr_tw]"r"(twiddles)
            : "memory", "v0", "v1", "v2",
              "v10", "v11", "v12", "v13", "v14", "v15"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S1 (q2_in, q2_out);
        NE10_RADIX4x4_R2C_TW_NEON_KERNEL_S2 (q2_out, q2_in);

        // store
        vst1q_f32( (ne10_float32_t*) ( Fout_neon    ), q2_out0.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_neon + 1), q2_out0.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1)    ), q2_out1.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_neon + (out_step << 1) + 1), q2_out1.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_b    ), q2_out2.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_b + 1), q2_out2.val[1] );

        vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1)    ), q2_out3.val[0] );
        vst1q_f32( (ne10_float32_t*) ( Fout_b - (out_step << 1) + 1), q2_out3.val[1] );

        // update pointers
        Fin_neon += 2;
        Fout_neon += 2;
        Fout_b -= 2;
        twiddles += 3;
    }
}
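
/*
 * Editorial note: the AArch64 inline assembly above is a hand-scheduled
 * version of NE10_RADIX4x4_R2C_TW_MUL_NEON. Per the RR/RI/II/IR comments,
 * each fmul/fmls/fmla quartet is one complex multiply by a twiddle factor.
 * A hypothetical intrinsics equivalent for a single product (names
 * illustrative) would be:
 *
 *     // out = in * tw, with real/imag planes in val[0]/val[1]
 *     out.val[0] = vmulq_f32 (in.val[0], tw.val[0]);              // RR
 *     out.val[1] = vmulq_f32 (in.val[0], tw.val[1]);              // RI
 *     out.val[0] = vmlsq_f32 (out.val[0], in.val[1], tw.val[1]);  // RR - II
 *     out.val[1] = vmlaq_f32 (out.val[1], in.val[1], tw.val[0]);  // RI + IR
 */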

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t m_count;
    ne10_int32_t loop_count = (out_step>>1) -1;
    const float32x4_t *Fin_b = Fin_neon + (((out_step<<1)-1)<<1) - 2; // reversed

    NE10_DECLARE_3(float32x4x2_t,q2_tw);
    NE10_DECLARE_4(float32x4x2_t,q2_in);
    NE10_DECLARE_4(float32x4x2_t,q2_out);

    for (m_count = loop_count; m_count > 0; m_count -- )
    {
        // load
        q2_in0.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon    ) );
        q2_in0.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + 1) );

        q2_in1.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1)    ) );
        q2_in1.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_neon + (out_step << 1) + 1) );

        q2_in2.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b    ) );
        q2_in2.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b + 1) );

        q2_in3.val[0] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1)    ) );
        q2_in3.val[1] = vld1q_f32( (ne10_float32_t*) ( Fin_b - (out_step << 1) + 1) );

        q2_tw0.val[0] = vdupq_n_f32(twiddles[0].r);
        q2_tw0.val[1] = vdupq_n_f32(twiddles[0].i);

        q2_tw1.val[0] = vdupq_n_f32(twiddles[1].r);
        q2_tw1.val[1] = vdupq_n_f32(twiddles[1].i);

        q2_tw2.val[0] = vdupq_n_f32(twiddles[2].r);
        q2_tw2.val[1] = vdupq_n_f32(twiddles[2].i);

        // NE10_PRINT_Q2x4_VECTOR(q2_in);

        // C2R TW KERNEL
        NE10_RADIX4x4_C2R_TW_NEON_KERNEL(q2_out,q2_in,q2_tw);

        // NE10_PRINT_Q2x4_VECTOR(q2_out);

        // store
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step    ), q2_out0.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 0*in_step + 1), q2_out0.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step    ), q2_out1.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 1*in_step + 1), q2_out1.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step    ), q2_out2.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 2*in_step + 1), q2_out2.val[1] );

        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step    ), q2_out3.val[0] );
        vst1q_f32( (ne10_float32_t*) (Fout_neon + 3*in_step + 1), q2_out3.val[1] );

        // update pointers
        Fin_neon += 2;
        Fout_neon += 2;
        Fin_b -= 2;
        twiddles += 3;
    }
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    NE10_RADIX4x4_R2C_NEON_LOAD(Fin_neon,q_in,in_step);

    NE10_RADIX4x4_R2C_TW_NEON_KERNEL_LAST(q_out,q_in);

    // store
    vst1q_f32( (ne10_float32_t*) (Fout_neon                       ), q_out0);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + 1                   ), q_out1);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1)     ), q_out2);
    vst1q_f32( (ne10_float32_t*) (Fout_neon + (out_step << 1) + 1 ), q_out3);
}

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (float32x4_t *Fout_neon,
        const float32x4_t *Fin_neon,
        const ne10_int32_t out_step,
        const ne10_int32_t in_step,
        const ne10_fft_cpx_float32_t *twiddles)
{
    NE10_DECLARE_4(float32x4_t,q_in);
    NE10_DECLARE_4(float32x4_t,q_out);

    // load
    q_in0 = vld1q_f32( (ne10_float32_t*) (Fin_neon                       ) );
    q_in1 = vld1q_f32( (ne10_float32_t*) (Fin_neon + 1                   ) );
    q_in2 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1)     ) );
    q_in3 = vld1q_f32( (ne10_float32_t*) (Fin_neon + (out_step << 1) + 1 ) );

    // NE10_PRINT_Qx4_VECTOR(q_in);

    NE10_RADIX4x4_C2R_TW_NEON_KERNEL_LAST(q_out,q_in);

    // NE10_PRINT_Qx4_VECTOR(q_out);

    // store
    NE10_RADIX4x4_R2C_NEON_STORE(Fout_neon,q_out,in_step);
}

NE10_INLINE void ne10_radix4x4_r2c_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;
    const ne10_fft_cpx_float32_t *tw;

    for (f_count = fstride; f_count; f_count --)
    {
        tw = twiddles + 3;

        // first butterfly
        ne10_radix4x4_r2c_with_twiddles_first_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fin_neon ++;
        Fout_neon ++;

        // other butterflies
        // Twiddle tables are transposed to avoid large-stride memory accesses.
        ne10_radix4x4_r2c_with_twiddles_other_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_neon, Fout_neon
        Fin_neon += 2 * ( (out_step >> 1) - 1);
        Fout_neon += 2 * ( (out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_r2c_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
        Fin_neon ++;
        Fout_neon ++;

        Fout_neon = Fout_neon + 3 * out_step;
    } // f_count
}
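
/*
 * Editorial note: each radix-4x4 stage with twiddles is split into three
 * cases. The first butterfly takes no twiddle table (the leading twiddle is
 * (1.0, 0.0), so the multiply is skipped, hence the NULL argument), the
 * middle butterflies multiply by the transposed twiddle tables, and the last
 * butterfly uses the dedicated ..._KERNEL_LAST form. The inverse path below
 * mirrors the same split.
 */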

NE10_INLINE void ne10_radix4x4_c2r_with_twiddles_neon (ne10_fft_cpx_float32_t *Fout,
        const ne10_fft_cpx_float32_t *Fin,
        const ne10_int32_t fstride,
        const ne10_int32_t mstride,
        const ne10_int32_t nfft,
        const ne10_fft_cpx_float32_t *twiddles)
{
    ne10_int32_t f_count;
    const ne10_int32_t in_step = nfft >> 2;
    const ne10_int32_t out_step = mstride;

    const float32x4_t *Fin_neon = (float32x4_t*) Fin;
    float32x4_t *Fout_neon = (float32x4_t*) Fout;
    const ne10_fft_cpx_float32_t *tw;

    for (f_count = fstride; f_count; f_count --)
    {
        tw = twiddles + 3;

        // first butterfly
        ne10_radix4x4_c2r_with_twiddles_first_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);

        Fin_neon ++;
        Fout_neon ++;

        // other butterflies
        // Twiddle tables are transposed to avoid large-stride memory accesses.
        ne10_radix4x4_c2r_with_twiddles_other_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, tw);

        // update Fin_neon, Fout_neon
        Fin_neon += 2 * ( (out_step >> 1) - 1);
        Fout_neon += 2 * ( (out_step >> 1) - 1);

        // last butterfly
        ne10_radix4x4_c2r_with_twiddles_last_butterfly_neon (Fout_neon, Fin_neon, out_step, in_step, NULL);
        Fin_neon ++;
        Fout_neon ++;

        Fin_neon = Fin_neon + 3 * out_step;
    } // f_count
}

NE10_INLINE void ne10_mixed_radix_r2c_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
        const ne10_fft_cpx_float32_t * Fin,
        const ne10_int32_t * factors,
        const ne10_fft_cpx_float32_t * twiddles,
        ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // PRINT_STAGE_INFO;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];
    mstride = factors[ (stage_count << 1) - 1 ];
    radix = factors[ stage_count << 1 ];
    nfft = radix * fstride; // not the real nfft

    // PRINT_STAGE_INFO;

    if (radix == 2)
    {
        // combine one radix-4 and one radix-2 into one radix-8
        mstride <<= 2;
        fstride >>= 2;
        twiddles += 6;
        stage_count --;
    }

    if (stage_count % 2 == 1) // swap, since the last stage is performed outside this function
    {
        ne10_swap_ptr (buffer, Fout);
    }

    // the first stage
    if (radix == 2) // length of FFT is 2^n (n is odd)
    {
        ne10_radix8x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        ne10_radix4x4_r2c_neon (Fout, Fin, fstride, mstride, nfft);
    }
    // end of first stage

    // other stages
    for (; fstride > 1;)
    {
        fstride >>= 2;
        ne10_swap_ptr (buffer, Fout);

        ne10_radix4x4_r2c_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);
        twiddles += 3 * mstride;
        mstride <<= 2;
    } // other stages
}
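
/*
 * Editorial note on the factors array, inferred from the reads above:
 *
 *     factors[0]                       stage count
 *     factors[1]                       fstride of the first stage
 *     factors[(stage_count << 1) - 1]  mstride of the last stage
 *     factors[stage_count << 1]        radix of the first stage (2 or 4)
 *
 * A leading radix-2 factor is folded with a radix-4 stage into one radix-8
 * stage (mstride <<= 2, fstride >>= 2), which is why stage_count is
 * decremented in that branch.
 */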

NE10_INLINE void ne10_mixed_radix_c2r_butterfly_float32_neon (ne10_fft_cpx_float32_t * Fout,
        const ne10_fft_cpx_float32_t * Fin,
        const ne10_int32_t * factors,
        const ne10_fft_cpx_float32_t * twiddles,
        ne10_fft_cpx_float32_t * buffer)
{
    ne10_int32_t fstride, mstride, nfft;
    ne10_int32_t radix;
    ne10_int32_t stage_count;

    // PRINT_STAGE_INFO;

    // init fstride, mstride, radix, nfft
    stage_count = factors[0];
    fstride = factors[1];

    mstride = factors[ (stage_count << 1) - 1 ];
    radix = factors[ stage_count << 1 ];
    nfft = radix * fstride; // not the real nfft

    // fstride, mstride for the last stage
    fstride = 1;
    mstride = nfft >> 2;

    if (radix == 2)
    {
        // combine one radix-4 and one radix-2 into one radix-8
        stage_count --;
    }

    if (stage_count % 2 == 0)
    {
        ne10_swap_ptr(Fout,buffer);
    }

    // all stages but the first
    for (; stage_count > 1;)
    {
        twiddles -= 3 * mstride;

        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix4x4_c2r_with_twiddles_neon (Fout, buffer, fstride, mstride, nfft, twiddles);

        fstride <<= 2;
        mstride >>= 2;
        stage_count --;
        ne10_swap_ptr (buffer, Fout);
    }

    // the forward transform's first stage, applied last in the inverse
    if (radix == 2) // length of FFT is 2^n (n is odd)
    {
        mstride >>= 1;

        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix8x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
    }
    else if (radix == 4) // length of FFT is 2^n (n is even)
    {
        // PRINT_STAGE_INFO;
        // PRINT_POINTERS_INFO(Fin,Fout,buffer,twiddles);
        ne10_radix4x4_c2r_neon (Fout, buffer, fstride, mstride, nfft);
    }
}
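
/*
 * Editorial note: the inverse butterfly walks the forward stages in reverse.
 * Each iteration first rewinds the twiddle pointer by 3 * mstride (the same
 * amount the forward path advanced it per stage), and the radix-8 or radix-4
 * stage that opened the forward transform is applied last.
 */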

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // b0
    {
        ne10_float32_t q_4r_out[4];
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;

        NE10_FFT_R2C_4R_RCR(q_4r_out,p_src_r);

        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[3];
        dst += (nfft>>2);
        dst[0].r = q_4r_out[1];
        dst[0].i = q_4r_out[2];
        dst -= (nfft>>2);
    }

    // b2
    {
        const ne10_float32_t *p_src_r = (const ne10_float32_t*) (src);
        p_src_r += nfft;
        p_src_r -= 4;

        ne10_float32_t q_4r_out[4];

        NE10_FFT_R2C_4R_CC(q_4r_out,p_src_r);

        dst += (nfft>>3);
        dst[0].r = q_4r_out[0];
        dst[0].i = q_4r_out[1];
        dst += (nfft>>2);
        dst[0].r = q_4r_out[2];
        dst[0].i = q_4r_out[3];
        dst -= (nfft>>3);
        dst -= (nfft>>2);
    }

    // b1
    ne10_fft_cpx_float32_t cc_out[4];
    ne10_fft_cpx_float32_t cc_in [4];
    const ne10_float32_t *p_src_r = (const ne10_float32_t*) src;
    p_src_r += 4;

    cc_out[0].r = *(p_src_r ++);
    cc_out[1].r = *(p_src_r ++);
    cc_out[2].r = *(p_src_r ++);
    cc_out[3].r = *(p_src_r ++);

    cc_out[0].i = *(p_src_r ++);
    cc_out[1].i = *(p_src_r ++);
    cc_out[2].i = *(p_src_r ++);
    cc_out[3].i = *(p_src_r ++);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    // twiddles[0] = ( 1.0, 0.0);
    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
    cc_in[0] = cc_out[0];
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[1],cc_out[1],twiddles[0]);
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[2],cc_out[2],twiddles[0]);
    twiddles ++;

    NE10_CPX_MUL_F32(cc_in[3],cc_out[3],twiddles[0]);

    // NE10_PRINT_Q2_VECTOR(cc_in);

    NE10_FFT_R2C_CC_CC(cc_out,cc_in);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    dst[1] = cc_out[0];
    dst += (nfft>>2);
    dst[ 1] = cc_out[1];
    dst[-1] = cc_out[3];
    dst += (nfft>>2);
    dst[-1] = cc_out[2];
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // b0
    {
        ne10_float32_t q_4r_in[4];
        ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;

        q_4r_in[0] = src[0].r;
        q_4r_in[3] = src[0].i;
        src += (nfft>>2);
        q_4r_in[1] = src[0].r;
        q_4r_in[2] = src[0].i;
        src -= (nfft>>2);

        NE10_FFT_C2R_RCR_4R(p_dst_r,q_4r_in);
    }

    // b2
    {
        // float32x4_t q_in;
        ne10_float32_t *p_dst_r = (ne10_float32_t*) (dst);
        p_dst_r += nfft;
        p_dst_r -= 4;

        ne10_float32_t q_4r_in[4];
        src += (nfft>>3);
        q_4r_in[0] = src[0].r;
        q_4r_in[1] = src[0].i;
        src += (nfft>>2);
        q_4r_in[2] = src[0].r;
        q_4r_in[3] = src[0].i;
        src -= (nfft>>3);
        src -= (nfft>>2);

        NE10_FFT_C2R_CC_4R(p_dst_r,q_4r_in);
    }

    // b1
    ne10_fft_cpx_float32_t cc_out[4];
    ne10_fft_cpx_float32_t cc_in [4];
    ne10_float32_t *p_dst_r = (ne10_float32_t*) dst;
    p_dst_r += 4;

    // load
    cc_out[0] = src[1];
    src += (nfft>>2);
    cc_out[2] = src[ 1];
    cc_out[3] = src[-1];
    src += (nfft>>2);
    cc_out[1] = src[-1];

    // NE10_PRINT_Q2_VECTOR(cc_out);

    NE10_FFT_C2R_CC_CC(cc_in,cc_out);

    // NE10_PRINT_Q2_VECTOR(cc_in);

    // twiddles[0] = ( 1.0, 0.0);
    // NE10_CPX_MUL_F32(cc_in[0],cc_out[0],twiddles[0]);
    cc_out[0] = cc_in[0];
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[1],cc_in[1],twiddles[0]);
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[2],cc_in[2],twiddles[0]);
    twiddles ++;

    NE10_CPX_CONJ_MUL_F32(cc_out[3],cc_in[3],twiddles[0]);

    // NE10_PRINT_Q2_VECTOR(cc_out);

    *(p_dst_r ++) = cc_out[0].r;
    *(p_dst_r ++) = cc_out[1].r;
    *(p_dst_r ++) = cc_out[2].r;
    *(p_dst_r ++) = cc_out[3].r;

    *(p_dst_r ++) = cc_out[0].i;
    *(p_dst_r ++) = cc_out[1].i;
    *(p_dst_r ++) = cc_out[2].i;
    *(p_dst_r ++) = cc_out[3].i;
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    // assert ( nfft % 4 == 0 );
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 12;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    /* INPUT & OUTPUT
     * 0R 1R 2R 3R Q0
     * 0I 1I 2I 3I Q1
     * 4R 5R 6R 7R Q2
     * 4I 5I 6I 7I Q3
     */

    q_in0[0] = *(fin_r++);
    q_in0[1] = *(fin_r++);
    q_in0[2] = *(fin_r++);
    q_in0[3] = *(fin_r++);
    q_in1[0] = *(fin_r++);
    q_in1[1] = *(fin_r++);
    q_in1[2] = *(fin_r++);
    q_in1[3] = *(fin_r++);
    q_in2[0] = *(fin_r++);
    q_in2[1] = *(fin_r++);
    q_in2[2] = *(fin_r++);
    q_in2[3] = *(fin_r++);
    q_in3[0] = *(fin_r++);
    q_in3[1] = *(fin_r++);
    q_in3[2] = *(fin_r++);
    q_in3[3] = *(fin_r++);

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0+8];
    q2_tw1[0][1] = tw[2+8];
    q2_tw1[0][2] = tw[4+8];
    q2_tw1[0][3] = tw[6+8];
    q2_tw1[1][0] = tw[1+8];
    q2_tw1[1][1] = tw[3+8];
    q2_tw1[1][2] = tw[5+8];
    q2_tw1[1][3] = tw[7+8];

    // TW: in->out
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    //----------------------------------------------------------//
    // first 2 lines
    // R = R*R - I*I
    q_out0[1] = q_in0[1] * q2_tw0[0][1] - q_in1[1] * q2_tw0[1][1];
    // I = R*I + I*R
    q_out1[1] = q_in0[1] * q2_tw0[1][1] + q_in1[1] * q2_tw0[0][1];

    // R = R*R - I*I
    q_out0[2] = q_in0[2] * q2_tw0[0][2] - q_in1[2] * q2_tw0[1][2];
    // I = R*I + I*R
    q_out1[2] = q_in0[2] * q2_tw0[1][2] + q_in1[2] * q2_tw0[0][2];

    // R = R*R - I*I
    q_out0[3] = q_in0[3] * q2_tw0[0][3] - q_in1[3] * q2_tw0[1][3];
    // I = R*I + I*R
    q_out1[3] = q_in0[3] * q2_tw0[1][3] + q_in1[3] * q2_tw0[0][3];

    //---------------------------------------------------------//
    // second 2 lines
    // R = R*R - I*I
    q_out2[1] = q_in2[1] * q2_tw1[0][1] - q_in3[1] * q2_tw1[1][1];
    // I = R*I + I*R
    q_out3[1] = q_in2[1] * q2_tw1[1][1] + q_in3[1] * q2_tw1[0][1];

    // R = R*R - I*I
    q_out2[2] = q_in2[2] * q2_tw1[0][2] - q_in3[2] * q2_tw1[1][2];
    // I = R*I + I*R
    q_out3[2] = q_in2[2] * q2_tw1[1][2] + q_in3[2] * q2_tw1[0][2];

    // R = R*R - I*I
    q_out2[3] = q_in2[3] * q2_tw1[0][3] - q_in3[3] * q2_tw1[1][3];
    // I = R*I + I*R
    q_out3[3] = q_in2[3] * q2_tw1[1][3] + q_in3[3] * q2_tw1[0][3];

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // BUTTERFLY - radix 4x2
    // STAGE
    // q_out -> q_in
    q_in0[0] = q_out0[0] + q_out0[2];
    q_in1[0] = q_out1[0] + q_out1[2];

    q_in0[1] = q_out0[0] - q_out0[2];
    q_in1[1] = q_out1[0] - q_out1[2];

    q_in0[2] = q_out0[1] + q_out0[3];
    q_in1[2] = q_out1[1] + q_out1[3];

    q_in0[3] = q_out0[1] - q_out0[3];
    q_in1[3] = q_out1[1] - q_out1[3];

    q_in2[0] = q_out2[0] + q_out2[2];
    q_in3[0] = q_out3[0] + q_out3[2];

    q_in2[1] = q_out2[0] - q_out2[2];
    q_in3[1] = q_out3[0] - q_out3[2];

    q_in2[2] = q_out2[1] + q_out2[3];
    q_in3[2] = q_out3[1] + q_out3[3];

    q_in2[3] = q_out2[1] - q_out2[3];
    q_in3[3] = q_out3[1] - q_out3[3];

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    // STAGE
    // q_in -> q_out
    // and transpose
    q_out0[0] =   q_in0[0] + q_in0[2];
    q_out0[1] =   q_in1[0] + q_in1[2];

    q_out2[2] =   q_in0[0] - q_in0[2];
    q_out2[3] = - q_in1[0] + q_in1[2]; // CONJ

    q_out3[2] =   q_in0[1] - q_in1[3];
    q_out3[3] = - q_in1[1] - q_in0[3]; // CONJ

    q_out1[0] =   q_in0[1] + q_in1[3];
    q_out1[1] =   q_in1[1] - q_in0[3];

    q_out0[2] =   q_in2[0] + q_in2[2];
    q_out0[3] =   q_in3[0] + q_in3[2];

    q_out2[0] =   q_in2[0] - q_in2[2];
    q_out2[1] = - q_in3[0] + q_in3[2]; // CONJ

    q_out3[0] =   q_in2[1] - q_in3[3];
    q_out3[1] = - q_in3[1] - q_in2[3]; // CONJ

    q_out1[2] =   q_in2[1] + q_in3[3];
    q_out1[3] =   q_in3[1] - q_in2[3];

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // STORE
    fout_r += 4;
    fout_r[0] = q_out0[0];
    fout_r[1] = q_out0[1];
    fout_r[2] = q_out0[2];
    fout_r[3] = q_out0[3];

    fout_r += (nfft>>1);
    fout_r[0] = q_out1[0];
    fout_r[1] = q_out1[1];
    fout_r[2] = q_out1[2];
    fout_r[3] = q_out1[3];

    fout_r -= 10;
    fout_r[0] = q_out3[0];
    fout_r[1] = q_out3[1];
    fout_r[2] = q_out3[2];
    fout_r[3] = q_out3[3];

    fout_r += (nfft>>1);
    fout_r[0] = q_out2[0];
    fout_r[1] = q_out2[1];
    fout_r[2] = q_out2[2];
    fout_r[3] = q_out2[3];
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 12;
    const ne10_float32_t *tw = (const ne10_float32_t*) twiddles + 8;

    ne10_float32_t q_in0[4], q_out0[4],
                   q_in1[4], q_out1[4],
                   q_in2[4], q_out2[4],
                   q_in3[4], q_out3[4];

    ne10_float32_t q2_tw0[2][4],
                   q2_tw1[2][4];

    /* INPUT & OUTPUT
     * 0R 1R 2R 3R Q0
     * 0I 1I 2I 3I Q1
     * 4R 5R 6R 7R Q2
     * 4I 5I 6I 7I Q3
     */

    // load
    fin_r += 4;
    q_in0[0] = fin_r[0];
    q_in0[1] = fin_r[1];
    q_in0[2] = fin_r[2];
    q_in0[3] = fin_r[3];

    fin_r += (nfft>>1);
    q_in1[0] = fin_r[0];
    q_in1[1] = fin_r[1];
    q_in1[2] = fin_r[2];
    q_in1[3] = fin_r[3];

    fin_r -= 10;
    q_in3[0] = fin_r[0];
    q_in3[1] = fin_r[1];
    q_in3[2] = fin_r[2];
    q_in3[3] = fin_r[3];

    fin_r += (nfft>>1);
    q_in2[0] = fin_r[0];
    q_in2[1] = fin_r[1];
    q_in2[2] = fin_r[2];
    q_in2[3] = fin_r[3];

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);

    // OUTPUT
    // INPUT
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
    q_out ## I1 [I2] = ( q_in ## K1 [K2] + q_in ## S1 [S2] ); \
    q_out ## J1 [J2] = ( q_in ## K1 [K2] - q_in ## S1 [S2] ); \
} while(0);
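
// Editorial note: NE10_INV_BUTTERFLY_TMP(I1,I2, J1,J2, K1,K2, S1,S2) is a
// sum/difference butterfly. For example, the first invocation below,
// NE10_INV_BUTTERFLY_TMP(0,0, 0,2, 0,0, 2,2), expands to:
//
//     q_out0[0] = ( q_in0[0] + q_in2[2] );
//     q_out0[2] = ( q_in0[0] - q_in2[2] );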

    // STAGE
    // q_in -> q_out
    // and transpose
    NE10_INV_BUTTERFLY_TMP( 0,0, 0,2,
            0,0, 2,2);

    NE10_INV_BUTTERFLY_TMP( 1,2, 1,0,
            0,1, 2,3);

    NE10_INV_BUTTERFLY_TMP( 0,1, 1,3,
            1,0, 3,2);

    q_in3[3] *= - 1.0f;
    NE10_INV_BUTTERFLY_TMP( 1,1, 0,3,
            3,3, 1,1);

    NE10_INV_BUTTERFLY_TMP( 2,0, 2,2,
            0,2, 2,0);

    NE10_INV_BUTTERFLY_TMP( 3,2, 3,0,
            0,3, 2,1);

    NE10_INV_BUTTERFLY_TMP( 2,1, 3,3,
            1,2, 3,0);

    q_in3[1] *= - 1.0f;
    NE10_INV_BUTTERFLY_TMP( 3,1, 2,3,
            3,1, 1,3);
#undef NE10_INV_BUTTERFLY_TMP

    // NE10_PRINT_Q_VECTOR(q_out0);
    // NE10_PRINT_Q_VECTOR(q_out1);
    // NE10_PRINT_Q_VECTOR(q_out2);
    // NE10_PRINT_Q_VECTOR(q_out3);

    // BUTTERFLY - radix 4x2
    // STAGE
    // q_out -> q_in

    // OUTPUT
    // INPUT
#define NE10_INV_BUTTERFLY_TMP(I1,I2,J1,J2,K1,K2,S1,S2) do { \
    q_in ## I1 [I2] = ( q_out ## K1 [K2] + q_out ## S1 [S2] ); \
    q_in ## J1 [J2] = ( q_out ## K1 [K2] - q_out ## S1 [S2] ); \
} while(0);

    NE10_INV_BUTTERFLY_TMP(0,0, 0,2,
            0,0, 0,1);

    NE10_INV_BUTTERFLY_TMP(1,0, 1,2,
            1,0, 1,1);

    NE10_INV_BUTTERFLY_TMP(0,1, 0,3,
            0,2, 0,3);

    NE10_INV_BUTTERFLY_TMP(1,1, 1,3,
            1,2, 1,3);

    NE10_INV_BUTTERFLY_TMP(2,0, 2,2,
            2,0, 2,1);

    NE10_INV_BUTTERFLY_TMP(3,0, 3,2,
            3,0, 3,1);

    NE10_INV_BUTTERFLY_TMP(2,1, 2,3,
            2,2, 2,3);

    NE10_INV_BUTTERFLY_TMP(3,1, 3,3,
            3,2, 3,3);

    // NE10_PRINT_Q_VECTOR(q_in0);
    // NE10_PRINT_Q_VECTOR(q_in1);
    // NE10_PRINT_Q_VECTOR(q_in2);
    // NE10_PRINT_Q_VECTOR(q_in3);
#undef NE10_INV_BUTTERFLY_TMP

    // load tw
    q2_tw0[0][0] = tw[0];
    q2_tw0[0][1] = tw[2];
    q2_tw0[0][2] = tw[4];
    q2_tw0[0][3] = tw[6];
    q2_tw0[1][0] = tw[1];
    q2_tw0[1][1] = tw[3];
    q2_tw0[1][2] = tw[5];
    q2_tw0[1][3] = tw[7];

    q2_tw1[0][0] = tw[0+8];
    q2_tw1[0][1] = tw[2+8];
    q2_tw1[0][2] = tw[4+8];
    q2_tw1[0][3] = tw[6+8];
    q2_tw1[1][0] = tw[1+8];
    q2_tw1[1][1] = tw[3+8];
    q2_tw1[1][2] = tw[5+8];
    q2_tw1[1][3] = tw[7+8];

    // TW: in->out
    q_out0[0] = q_in0[0];
    q_out1[0] = q_in1[0];
    q_out2[0] = q_in2[0];
    q_out3[0] = q_in3[0];

    //----------------------------------------------------------//
    // first 2 lines
    // R = R*R + I*I
    q_out0[1] = q_in0[1] * q2_tw0[0][1] + q_in1[1] * q2_tw0[1][1];
    // I = R*I - I*R
    q_out1[1] = q_in0[1] * q2_tw0[1][1] - q_in1[1] * q2_tw0[0][1];

    // R = R*R + I*I
    q_out0[2] = q_in0[2] * q2_tw0[0][2] + q_in1[2] * q2_tw0[1][2];
    // I = R*I - I*R
    q_out1[2] = q_in0[2] * q2_tw0[1][2] - q_in1[2] * q2_tw0[0][2];

    // R = R*R + I*I
    q_out0[3] = q_in0[3] * q2_tw0[0][3] + q_in1[3] * q2_tw0[1][3];
    // I = R*I - I*R
    q_out1[3] = q_in0[3] * q2_tw0[1][3] - q_in1[3] * q2_tw0[0][3];

    //----------------------------------------------------------//
    // second 2 lines
    // R = R*R + I*I
    q_out2[1] = q_in2[1] * q2_tw1[0][1] + q_in3[1] * q2_tw1[1][1];
    // I = R*I - I*R
    q_out3[1] = q_in2[1] * q2_tw1[1][1] - q_in3[1] * q2_tw1[0][1];

    // R = R*R + I*I
    q_out2[2] = q_in2[2] * q2_tw1[0][2] + q_in3[2] * q2_tw1[1][2];
    // I = R*I - I*R
    q_out3[2] = q_in2[2] * q2_tw1[1][2] - q_in3[2] * q2_tw1[0][2];

    // R = R*R + I*I
    q_out2[3] = q_in2[3] * q2_tw1[0][3] + q_in3[3] * q2_tw1[1][3];
    // I = R*I - I*R
    q_out3[3] = q_in2[3] * q2_tw1[1][3] - q_in3[3] * q2_tw1[0][3];

    // STORE
    *(fout_r++) =   q_out0[0];
    *(fout_r++) =   q_out0[1];
    *(fout_r++) =   q_out0[2];
    *(fout_r++) =   q_out0[3];
    *(fout_r++) =   q_out1[0];
    *(fout_r++) = - q_out1[1];
    *(fout_r++) = - q_out1[2];
    *(fout_r++) = - q_out1[3];
    *(fout_r++) =   q_out2[0];
    *(fout_r++) =   q_out2[1];
    *(fout_r++) =   q_out2[2];
    *(fout_r++) =   q_out2[3];
    *(fout_r++) =   q_out3[0];
    *(fout_r++) = - q_out3[1];
    *(fout_r++) = - q_out3[2];
    *(fout_r++) = - q_out3[3];
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    const ne10_float32_t *fin_r = ((const ne10_float32_t*) src) + 12 + 16;
    ne10_float32_t *fout_r = (ne10_float32_t*) dst + 8;
    ne10_float32_t *fout_b = (ne10_float32_t*) dst - 14;
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;

    // Take 4 elements as a set.
    // The leading 8 sets are already transformed in the first and second butterflies.
    // This function transforms 8 sets per loop iteration.
    ne10_int32_t loop_count = ((nfft >> 2) - 8) >> 3;

    for (; loop_count > 0; loop_count--)
    {
        NE10_DECLARE_4 (float32x4x2_t, q2_in);  // 8Q
        NE10_DECLARE_3 (float32x4x2_t, q2_tw);  // 6Q
        NE10_DECLARE_4 (float32x4x2_t, q2_out); // 8Q

        /* INPUT
         * 0R 1R 2R 3R Q0
         * 0I 1I 2I 3I Q1
         * 4R 5R 6R 7R Q2
         * 4I 5I 6I 7I Q3
         * 8R 9R aR bR Q4
         * 8I 9I aI bI Q5
         * cR dR eR fR Q6
         * cI dI eI fI Q7
         */

        // transpose
        // q2_out -> q2_in
        /*
         * val[0]
         * 0R 4R 8R cR Q0
         * 1R 5R 9R dR Q2
         * 2R 6R aR eR Q4
         * 3R 7R bR fR Q6
         *
         * val[1]
         * 0I 4I 8I cI Q1
         * 1I 5I 9I dI Q3
         * 2I 6I aI eI Q5
         * 3I 7I bI fI Q7
         */

#ifndef NE10_INLINE_ASM_OPT
        q2_out0.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out0.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out1.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out1.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out2.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out2.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out3.val[0] = vld1q_f32 (fin_r);
        fin_r += 4;
        q2_out3.val[1] = vld1q_f32 (fin_r);
        fin_r += 4;

        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in, q2_out);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "ld1 {v0.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[0]
            "ld1 {v4.4s}, [%[fin_r]], 16 \n\t" // q2_in0.val[1]
            "ld1 {v1.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[0]
            "ld1 {v5.4s}, [%[fin_r]], 16 \n\t" // q2_in1.val[1]
            "ld1 {v2.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[0]
            "ld1 {v6.4s}, [%[fin_r]], 16 \n\t" // q2_in2.val[1]
            "ld1 {v3.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[0]
            "ld1 {v7.4s}, [%[fin_r]], 16 \n\t" // q2_in3.val[1]
            // NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);
            "trn1 v8.4s, v0.4s, v1.4s \n\t"
            "trn2 v9.4s, v0.4s, v1.4s \n\t"
            "trn1 v10.4s, v2.4s, v3.4s \n\t"
            "trn2 v11.4s, v2.4s, v3.4s \n\t"

            "trn1 %[q2_in0r].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1r].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2r].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3r].2d, v9.2d, v11.2d \n\t"

            "trn1 v8.4s, v4.4s, v5.4s \n\t"
            "trn2 v9.4s, v4.4s, v5.4s \n\t"
            "trn1 v10.4s, v6.4s, v7.4s \n\t"
            "trn2 v11.4s, v6.4s, v7.4s \n\t"

            "trn1 %[q2_in0i].2d, v8.2d, v10.2d \n\t"
            "trn1 %[q2_in1i].2d, v9.2d, v11.2d \n\t"
            "trn2 %[q2_in2i].2d, v8.2d, v10.2d \n\t"
            "trn2 %[q2_in3i].2d, v9.2d, v11.2d \n\t"

            : [q2_in0r]"+w"(q2_in0.val[0]),
              [q2_in0i]"+w"(q2_in0.val[1]),
              [q2_in1r]"+w"(q2_in1.val[0]),
              [q2_in1i]"+w"(q2_in1.val[1]),
              [q2_in2r]"+w"(q2_in2.val[0]),
              [q2_in2i]"+w"(q2_in2.val[1]),
              [q2_in3r]"+w"(q2_in3.val[0]),
              [q2_in3i]"+w"(q2_in3.val[1]),
              [fin_r]"+r"(fin_r)
            :
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
              "v8", "v9", "v10", "v11"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

#ifndef NE10_INLINE_ASM_OPT
        // load twiddles
        q2_tw0 = vld2q_f32 (tw);
        tw += 8;
        q2_tw1 = vld2q_f32 (tw);
        tw += 8;
        q2_tw2 = vld2q_f32 (tw);
        tw += 8;

        // tw
        // q2_in -> q2_out
        q2_out0 = q2_in0;
        NE10_CPX_MUL_NEON_F32 (q2_out1, q2_in1, q2_tw0);
        NE10_CPX_MUL_NEON_F32 (q2_out2, q2_in2, q2_tw1);
        NE10_CPX_MUL_NEON_F32 (q2_out3, q2_in3, q2_tw2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            // load twiddles
            "ld2 {v0.4s, v1.4s}, [%[tw0]] \n\t" // q2_tw0
            "ld2 {v2.4s, v3.4s}, [%[tw1]] \n\t" // q2_tw1
            "ld2 {v4.4s, v5.4s}, [%[tw2]] \n\t" // q2_tw2
            // tw
            // q2_in -> q2_out
            // NE10_CPX_MUL_NEON_F32(q2_out1,q2_in1,q2_tw0);
            "fmul %[q2_out1r].4s, v0.4s, %[q2_in1r].4s \n\t" // RR
            "fmul %[q2_out1i].4s, v0.4s, %[q2_in1i].4s \n\t" // RI
            "fmls %[q2_out1r].4s, v1.4s, %[q2_in1i].4s \n\t" // RR - II
            "fmla %[q2_out1i].4s, v1.4s, %[q2_in1r].4s \n\t" // RI + IR
            // NE10_CPX_MUL_NEON_F32(q2_out2,q2_in2,q2_tw1);
            "fmul %[q2_out2r].4s, v2.4s, %[q2_in2r].4s \n\t" // RR
            "fmul %[q2_out2i].4s, v2.4s, %[q2_in2i].4s \n\t" // RI
            "fmls %[q2_out2r].4s, v3.4s, %[q2_in2i].4s \n\t" // RR - II
            "fmla %[q2_out2i].4s, v3.4s, %[q2_in2r].4s \n\t" // RI + IR
            // NE10_CPX_MUL_NEON_F32(q2_out3,q2_in3,q2_tw2);
            "fmul %[q2_out3r].4s, v4.4s, %[q2_in3r].4s \n\t" // RR
            "fmul %[q2_out3i].4s, v4.4s, %[q2_in3i].4s \n\t" // RI
            "fmls %[q2_out3r].4s, v5.4s, %[q2_in3i].4s \n\t" // RR - II
            "fmla %[q2_out3i].4s, v5.4s, %[q2_in3r].4s \n\t" // RI + IR
            : [q2_out1r]"+w"(q2_out1.val[0]),
              [q2_out1i]"+w"(q2_out1.val[1]),
              [q2_out2r]"+w"(q2_out2.val[0]),
              [q2_out2i]"+w"(q2_out2.val[1]),
              [q2_out3r]"+w"(q2_out3.val[0]),
              [q2_out3i]"+w"(q2_out3.val[1])
            : [tw0]"r"(tw),
              [tw1]"r"(tw + 8),
              [tw2]"r"(tw + 16),
              [q2_in1r]"w"(q2_in1.val[0]),
              [q2_in1i]"w"(q2_in1.val[1]),
              [q2_in2r]"w"(q2_in2.val[0]),
              [q2_in2i]"w"(q2_in2.val[1]),
              [q2_in3r]"w"(q2_in3.val[0]),
              [q2_in3i]"w"(q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5"
        );
        q2_out0 = q2_in0;
        tw += 24;
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        // butterfly
        // out -> in
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in1.val[0] = vsubq_f32 (q2_out0.val[0], q2_out2.val[0]);
        q2_in1.val[1] = vsubq_f32 (q2_out0.val[1], q2_out2.val[1]);
        q2_in2.val[0] = vaddq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in2.val[1] = vaddq_f32 (q2_out1.val[1], q2_out3.val[1]);
        q2_in3.val[0] = vsubq_f32 (q2_out1.val[0], q2_out3.val[0]);
        q2_in3.val[1] = vsubq_f32 (q2_out1.val[1], q2_out3.val[1]);

        // in -> out
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out3.val[1] = vaddq_f32 (q2_in1.val[1], q2_in3.val[0]);

        q2_out3.val[1] = vnegq_f32 (q2_out3.val[1]);
        q2_out2.val[1] = vnegq_f32 (q2_out2.val[1]);

#ifndef NE10_INLINE_ASM_OPT
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[1]);
        q2_out1.val[1] = vsubq_f32 (q2_in1.val[1], q2_in3.val[0]);

        // reverse -- CONJ
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out2.val[1]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[0]);
        NE10_REVERSE_FLOAT32X4 (q2_out3.val[1]);

        // store
        vst2q_f32 (fout_r, q2_out0);
        vst2q_f32 (fout_r + (nfft >> 1), q2_out1);
        vst2q_f32 (fout_b + (nfft >> 1), q2_out3);
        vst2q_f32 (fout_b + nfft, q2_out2);
#else // NE10_INLINE_ASM_OPT
#ifndef __aarch64__
#error Currently, inline assembly optimizations are only available on AArch64.
#else // __aarch64__
        asm volatile (
            "fadd v0.4s, %[q2_in0r].4s, %[q2_in2r].4s \n\t"
            "fadd v1.4s, %[q2_in0i].4s, %[q2_in2i].4s \n\t"
            "fadd v2.4s, %[q2_in1r].4s, %[q2_in3i].4s \n\t"
            "fsub v3.4s, %[q2_in1i].4s, %[q2_in3r].4s \n\t"
            // reverse -- CONJ
            "rev64 %[q2_in2r].4s, %[q2_out2r].4s \n\t"
            "rev64 %[q2_in2i].4s, %[q2_out2i].4s \n\t"
            "rev64 %[q2_in3r].4s, %[q2_out3r].4s \n\t"
            "rev64 %[q2_in3i].4s, %[q2_out3i].4s \n\t"
            "ext v4.16b, %[q2_in2r].16b, %[q2_in2r].16b, #8 \n\t"
            "ext v5.16b, %[q2_in2i].16b, %[q2_in2i].16b, #8 \n\t"
            "ext v6.16b, %[q2_in3r].16b, %[q2_in3r].16b, #8 \n\t"
            "ext v7.16b, %[q2_in3i].16b, %[q2_in3i].16b, #8 \n\t"
            // store
            "st2 {v0.4s, v1.4s}, [%[fout0]] \n\t"
            "st2 {v2.4s, v3.4s}, [%[fout1]] \n\t"
            "st2 {v4.4s, v5.4s}, [%[fout2]] \n\t"
            "st2 {v6.4s, v7.4s}, [%[fout3]] \n\t"
            :
            : [fout0]"r"(fout_r),
              [fout1]"r"(fout_r + (nfft>>1)),
              [fout2]"r"(fout_b + nfft),
              [fout3]"r"(fout_b + (nfft>>1)),
              [q2_out2r]"w"(q2_out2.val[0]),
              [q2_out2i]"w"(q2_out2.val[1]),
              [q2_out3r]"w"(q2_out3.val[0]),
              [q2_out3i]"w"(q2_out3.val[1]),
              [q2_in0r]"w"(q2_in0.val[0]),
              [q2_in0i]"w"(q2_in0.val[1]),
              [q2_in1r]"w"(q2_in1.val[0]),
              [q2_in1i]"w"(q2_in1.val[1]),
              [q2_in2r]"w"(q2_in2.val[0]),
              [q2_in2i]"w"(q2_in2.val[1]),
              [q2_in3r]"w"(q2_in3.val[0]),
              [q2_in3i]"w"(q2_in3.val[1])
            : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
        );
#endif // __aarch64__
#endif // NE10_INLINE_ASM_OPT

        fout_r += 8;
        fout_b -= 8;
    }
}

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_float32_t *fout_r = ((ne10_float32_t*) dst) + 12 + 16;
    const ne10_float32_t *fin_r = (const ne10_float32_t*) src + 8;
    const ne10_float32_t *fin_b = (const ne10_float32_t*) src - 14;
    const ne10_float32_t *tw = ((const ne10_float32_t*) twiddles) + 8 + 16;
    ne10_int32_t loop_count = ((nfft>>2)-8)>>3;

    for ( ; loop_count>0; loop_count -- )
    {
        NE10_DECLARE_4(float32x4x2_t,q2_in);  // 8Q
        NE10_DECLARE_3(float32x4x2_t,q2_tw);  // 6Q
        NE10_DECLARE_4(float32x4x2_t,q2_out); // 8Q

        /* INPUT
         * 0R 1R 2R 3R Q0
         * 0I 1I 2I 3I Q1
         * 4R 5R 6R 7R Q2
         * 4I 5I 6I 7I Q3
         * 8R 9R aR bR Q4
         * 8I 9I aI bI Q5
         * cR dR eR fR Q6
         * cI dI eI fI Q7
         */

        q2_in0 = vld2q_f32(fin_r);
        q2_in1 = vld2q_f32(fin_r + (nfft>>1));
        fin_r += 8;

        q2_in3 = vld2q_f32(fin_b + (nfft>>1));
        q2_in2 = vld2q_f32(fin_b + nfft);
        fin_b -= 8;

        q2_tw0 = vld2q_f32(tw);
        tw += 8;
        q2_tw1 = vld2q_f32(tw);
        tw += 8;
        q2_tw2 = vld2q_f32(tw);
        tw += 8;

        // reverse -- CONJ
        NE10_REVERSE_FLOAT32X4( q2_in3.val[0] );
        NE10_REVERSE_FLOAT32X4( q2_in3.val[1] );
        NE10_REVERSE_FLOAT32X4( q2_in2.val[0] );
        NE10_REVERSE_FLOAT32X4( q2_in2.val[1] );

        q2_in2.val[1] = vnegq_f32( q2_in2.val[1] );
        q2_in3.val[1] = vnegq_f32( q2_in3.val[1] );

        // in -> out
        q2_out0.val[0] = vaddq_f32 (q2_in0.val[0], q2_in2.val[0]);
        q2_out2.val[0] = vsubq_f32 (q2_in0.val[0], q2_in2.val[0]);

        q2_out0.val[1] = vaddq_f32 (q2_in0.val[1], q2_in2.val[1]);
        q2_out2.val[1] = vsubq_f32 (q2_in0.val[1], q2_in2.val[1]);

        q2_out1.val[0] = vaddq_f32 (q2_in1.val[0], q2_in3.val[0]);
        q2_out3.val[1] = vsubq_f32 (q2_in1.val[0], q2_in3.val[0]);

        q2_out1.val[1] = vaddq_f32 (q2_in3.val[1], q2_in1.val[1]);
        q2_out3.val[0] = vsubq_f32 (q2_in3.val[1], q2_in1.val[1]);

        // out -> in
        q2_in0.val[0] = vaddq_f32 (q2_out0.val[0], q2_out1.val[0]);
        q2_in2.val[0] = vsubq_f32 (q2_out0.val[0], q2_out1.val[0]);

        q2_in0.val[1] = vaddq_f32 (q2_out0.val[1], q2_out1.val[1]);
        q2_in2.val[1] = vsubq_f32 (q2_out0.val[1], q2_out1.val[1]);

        q2_in1.val[0] = vaddq_f32 (q2_out2.val[0], q2_out3.val[0]);
        q2_in3.val[0] = vsubq_f32 (q2_out2.val[0], q2_out3.val[0]);

        q2_in1.val[1] = vaddq_f32 (q2_out2.val[1], q2_out3.val[1]);
        q2_in3.val[1] = vsubq_f32 (q2_out2.val[1], q2_out3.val[1]);

        // tw
        // q2_in -> q2_out
        q2_out0 = q2_in0;
        NE10_CPX_MUL_INV_NEON_F32(q2_out1,q2_in1,q2_tw0);
        NE10_CPX_MUL_INV_NEON_F32(q2_out2,q2_in2,q2_tw1);
        NE10_CPX_MUL_INV_NEON_F32(q2_out3,q2_in3,q2_tw2);

        // transpose
        // q2_out -> q2_in
        NE10_RADIX4X4C_TRANSPOSE_NEON (q2_in,q2_out);

        // store
        vst1q_f32(fout_r, q2_in0.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in0.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in1.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in1.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in2.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in2.val[1]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in3.val[0]);
        fout_r += 4;
        vst1q_f32(fout_r, q2_in3.val[1]);
        fout_r += 4;
    }
}

NE10_INLINE void ne10_radix4_r2c_with_twiddles_last_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_r2c_with_twiddles_last_stage_first_butterfly(dst,src,twiddles,nfft);

    if (nfft==16)
    {
        return;
    }

    ne10_radix4_r2c_with_twiddles_last_stage_second_butterfly(dst,src,twiddles,nfft);

    if (nfft==32)
    {
        return;
    }

    ne10_radix4_r2c_with_twiddles_last_stage_other_butterfly(dst,src,twiddles,nfft);
}
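
/*
 * Editorial note: the early returns above match the coverage of each
 * butterfly. The first butterfly handles the leading sets, the second
 * handles the next ones, and the vectorized "other" loop runs
 * ((nfft >> 2) - 8) >> 3 iterations, which only becomes positive once
 * nfft > 32; hence nfft == 16 and nfft == 32 return early.
 */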

NE10_INLINE void ne10_radix4_c2r_with_twiddles_first_stage (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        const ne10_fft_cpx_float32_t *twiddles,
        const ne10_int32_t nfft)
{
    ne10_radix4_c2r_with_twiddles_first_stage_first_butterfly(dst,src,twiddles,nfft);

    if (nfft==16)
    {
        return;
    }

    ne10_radix4_c2r_with_twiddles_first_stage_second_butterfly(dst,src,twiddles,nfft);

    if (nfft==32)
    {
        return;
    }

    ne10_radix4_c2r_with_twiddles_first_stage_other_butterfly(dst,src,twiddles,nfft);
}

/**
 * Mixed radix-2/4 FFT (real to complex) of float (32-bit) data.
 */
void ne10_fft_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *fout,
                                   ne10_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
    ne10_float32_t *fout_r = (ne10_float32_t*) fout;

    switch (cfg->nfft)
    {
    case 8:
        ne10_radix8_r2c_c ( (CPLX*) fout_r, (const CPLX*) fin, 1, 1, 8);
        fout[0].r = fout[0].i;
        break;
    default:
        ne10_mixed_radix_r2c_butterfly_float32_neon (fout, (CPLX*) fin, cfg->r_factors_neon, cfg->r_twiddles_neon, tmpbuf);
        ne10_radix4_r2c_with_twiddles_last_stage (fout, tmpbuf, cfg->r_super_twiddles_neon, cfg->nfft);
        fout[cfg->nfft / 2].r = fout[0].i;
        break;
    }
    fout[0].i = fout[cfg->nfft / 2].i = 0.0f;
}
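
/*
 * Editorial note: through the last r2c stage the value destined for the
 * Nyquist bin rides along in fout[0].i; the wrapper above moves it to
 * fout[nfft/2].r and then zeroes both imaginary parts, since for real input
 * the DC and Nyquist bins are purely real.
 */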

/**
 * Mixed radix-2/4 IFFT (complex to real) of float (32-bit) data.
 */
void ne10_fft_c2r_1d_float32_neon (ne10_float32_t *fout,
                                   ne10_fft_cpx_float32_t *fin,
                                   ne10_fft_r2c_cfg_float32_t cfg)
{
    typedef ne10_float32_t REAL;
    typedef ne10_fft_cpx_float32_t CPLX;

    ne10_fft_cpx_float32_t * tmpbuf = cfg->buffer;
    ne10_fft_cpx_float32_t * fout_c;
    ne10_int32_t stage_count;
    ne10_int32_t radix;

    switch (cfg->nfft)
    {
    case 8:
        fin[0].i = fin[0].r;
        fin[0].r = 0.0f;
        ne10_radix8_c2r_c ( (CPLX*) fout, (const CPLX*) &fin[0].i, 1, 1, 8);
        fin[0].r = fin[0].i;
        break;
    default:
        stage_count = cfg->r_factors_neon[0];
        radix = cfg->r_factors_neon[ stage_count << 1 ];
        if (radix == 2)
        {
            stage_count --;
        }
        fin[0].i = fin[cfg->nfft>>1].r;
        fout_c = (stage_count % 2 == 1) ? tmpbuf : (CPLX*) fout;
        ne10_radix4_c2r_with_twiddles_first_stage ( (CPLX*) fout_c, fin, cfg->r_super_twiddles_neon, cfg->nfft);
        ne10_mixed_radix_c2r_butterfly_float32_neon ( (CPLX*) fout, (CPLX*) NULL, cfg->r_factors_neon, cfg->r_twiddles_neon_backward, tmpbuf);
        break;
    }
    fin[0].i = 0.0f;
}