/* -*- c++ -*- */
/*
 * Copyright 2012, 2013, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
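/*!
 * \b Overview
 *
 * Rotates a complex vector by a progressively advancing phase: each input
 * sample is multiplied by a running phase value, and the running phase is
 * multiplied by phase_inc after every sample, i.e. out[n] = in[n] * phase *
 * phase_inc^n.  The running phase is written back through the phase pointer
 * so the rotation stays continuous across successive calls.  Both phase and
 * phase_inc are expected to lie on the unit circle.
 *
 * A minimal usage sketch (assuming the usual VOLK dispatcher generated for
 * this kernel and <volk/volk.h>; the buffer handling below is illustrative
 * only):
 *
 * \code
 *   int N = 1024;
 *   lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, volk_get_alignment());
 *   // rotate by pi/4 radians per sample, starting at zero phase
 *   lv_32fc_t phase_inc = lv_cmake(cosf(0.25f * 3.14159265f), sinf(0.25f * 3.14159265f));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
 *   // ... fill in[] ...
 *   volk_32fc_s32fc_x2_rotator_32fc(out, in, phase_inc, &phase, N);
 *   volk_free(in);
 *   volk_free(out);
 * \endcode
 */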
#ifndef INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H
#define INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H


#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <volk/volk_complex.h>
#define ROTATOR_RELOAD 512
#define ROTATOR_RELOAD_2 (ROTATOR_RELOAD / 2)
#define ROTATOR_RELOAD_4 (ROTATOR_RELOAD / 4)
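/*
 * The kernels multiply the running phase by phase_inc once per sample, so
 * floating point rounding slowly pulls its magnitude off the unit circle.
 * Every ROTATOR_RELOAD samples (ROTATOR_RELOAD_2 two-sample SSE iterations,
 * or ROTATOR_RELOAD_4 four-sample NEON/AVX iterations) the phase is rescaled
 * back to unit magnitude.
 */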


#ifdef LV_HAVE_GENERIC

static inline void volk_32fc_s32fc_x2_rotator_32fc_generic(lv_32fc_t* outVector,
                                                           const lv_32fc_t* inVector,
                                                           const lv_32fc_t phase_inc,
                                                           lv_32fc_t* phase,
                                                           unsigned int num_points)
{
    unsigned int i = 0;
    int j = 0;
    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD; ++j) {
            *outVector++ = *inVector++ * (*phase);
            (*phase) *= phase_inc;
        }
        // Renormalize so rounding error does not let the phase magnitude drift
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
    for (i = 0; i < num_points % ROTATOR_RELOAD; ++i) {
        *outVector++ = *inVector++ * (*phase);
        (*phase) *= phase_inc;
    }
    if (i) {
        // Make sure we normalize the phase on every call
        (*phase) /= hypotf(lv_creal(*phase), lv_cimag(*phase));
    }
}

#endif /* LV_HAVE_GENERIC */
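/*
 * Sketch of how a caller might derive phase_inc and phase for a frequency
 * shift of freq Hz at sample rate fs (illustrative only, not part of this
 * kernel):
 *
 *   const float w = 2.0f * 3.14159265358979f * freq / fs;
 *   lv_32fc_t phase_inc = lv_cmake(cosf(w), sinf(w));
 *   lv_32fc_t phase = lv_cmake(1.0f, 0.0f);
 */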


#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

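/* Processes four complex samples per iteration: vld2q_f32 de-interleaves the
 * real and imaginary parts into separate lanes, the complex multiplies use
 * the helpers from volk_neon_intrinsics.h, and after every ROTATOR_RELOAD_4
 * vector iterations the running phase is renormalized with a reciprocal
 * square root.  The final num_points % 4 samples are handled scalar. */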
static inline void volk_32fc_s32fc_x2_rotator_32fc_neon(lv_32fc_t* outVector,
                                                        const lv_32fc_t* inVector,
                                                        const lv_32fc_t phase_inc,
                                                        lv_32fc_t* phase,
                                                        unsigned int num_points)

{
    lv_32fc_t* outputVectorPtr = outVector;
    const lv_32fc_t* inputVectorPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phasePtr[4] = { (*phase), (*phase), (*phase), (*phase) };
    float32x4x2_t input_vec;
    float32x4x2_t output_vec;

    unsigned int i = 0, j = 0;
    // const unsigned int quarter_points = num_points / 4;

    for (i = 0; i < 4; ++i) {
        phasePtr[i] *= incr;
        incr *= (phase_inc);
    }

    // Note that incr now holds phase_inc^4 from the loop above
    const lv_32fc_t incrPtr[4] = { incr, incr, incr, incr };
    const float32x4x2_t incr_vec = vld2q_f32((float*)incrPtr);
    float32x4x2_t phase_vec = vld2q_f32((float*)phasePtr);

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; j++) {
            input_vec = vld2q_f32((float*)inputVectorPtr);
            // Prefetch the next samples, speeds things up
            __VOLK_PREFETCH(inputVectorPtr + 4);
            // Rotate
            output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
            // Increase phase
            phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
            // Store output
            vst2q_f32((float*)outputVectorPtr, output_vec);

            outputVectorPtr += 4;
            inputVectorPtr += 4;
        }
        // Normalize phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; i++) {
        input_vec = vld2q_f32((float*)inputVectorPtr);
        // Prefetch the next samples, speeds things up
        __VOLK_PREFETCH(inputVectorPtr + 4);
        // Rotate
        output_vec = _vmultiply_complexq_f32(input_vec, phase_vec);
        // Increase phase
        phase_vec = _vmultiply_complexq_f32(phase_vec, incr_vec);
        // Store output
        vst2q_f32((float*)outputVectorPtr, output_vec);

        outputVectorPtr += 4;
        inputVectorPtr += 4;
    }
    // if (i) is true, the loop above ran at least once
    if (i) {
        // Normalize phase so its magnitude doesn't grow because of
        // floating point rounding error
        const float32x4_t mag_squared = _vmagnitudesquaredq_f32(phase_vec);
        const float32x4_t inv_mag = _vinvsqrtq_f32(mag_squared);
        // Multiply complex with real
        phase_vec.val[0] = vmulq_f32(phase_vec.val[0], inv_mag);
        phase_vec.val[1] = vmulq_f32(phase_vec.val[1], inv_mag);
    }
    // Store current phase
    vst2q_f32((float*)phasePtr, phase_vec);

    // Deal with the rest
    for (i = 0; i < num_points % 4; i++) {
        *outputVectorPtr++ = *inputVectorPtr++ * phasePtr[0];
        phasePtr[0] *= (phase_inc);
    }

    // For continuous phase next time we need to call this function
    (*phase) = phasePtr[0];
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

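/* Both SSE4.1 variants work on two complex samples per __m128 register.  The
 * complex product (a + bi)(c + di) = (ac - bd) + (ad + bc)i is built with the
 * moveldup/movehdup/shuffle/addsub pattern: yl holds the duplicated real
 * parts of the phase, yh the duplicated imaginary parts, the 0xB1 shuffle
 * swaps the real/imaginary halves of the input, and _mm_addsub_ps combines
 * the partial products.  The same pattern advances the phase by inc_Val
 * (phase_inc squared).  After each ROTATOR_RELOAD_2 block the squared
 * magnitudes are summed with _mm_hadd_ps, duplicated per complex lane and
 * used to rescale the phase back to unit magnitude. */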
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    // Stagger the two phase copies: phase_Ptr = { phase, phase * phase_inc }
    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    // incr is now phase_inc^2, the per-iteration phase advance
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_load_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            // z = input * phase, phase_Val = phase * phase_inc^2
            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_store_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        // Rescale the phase back onto the unit circle
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_load_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_store_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    // Handle the final odd sample, if any
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 for aligned */


#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_sse4_1(lv_32fc_t* outVector,
                                                            const lv_32fc_t* inVector,
                                                            const lv_32fc_t phase_inc,
                                                            lv_32fc_t* phase,
                                                            unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[2] = { (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 2; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    /*printf("%f, %f\n", lv_creal(phase_Ptr[0]), lv_cimag(phase_Ptr[0]));
    printf("%f, %f\n", lv_creal(phase_Ptr[1]), lv_cimag(phase_Ptr[1]));
    printf("incr: %f, %f\n", lv_creal(incr), lv_cimag(incr));*/
    __m128 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm_set_ps(lv_cimag(incr), lv_creal(incr), lv_cimag(incr), lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_2; ++j) {

            aVal = _mm_loadu_ps((float*)aPtr);

            yl = _mm_moveldup_ps(phase_Val);
            yh = _mm_movehdup_ps(phase_Val);
            ylp = _mm_moveldup_ps(inc_Val);
            yhp = _mm_movehdup_ps(inc_Val);

            tmp1 = _mm_mul_ps(aVal, yl);
            tmp1p = _mm_mul_ps(phase_Val, ylp);

            aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm_mul_ps(aVal, yh);
            tmp2p = _mm_mul_ps(phase_Val, yhp);

            z = _mm_addsub_ps(tmp1, tmp2);
            phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

            _mm_storeu_ps((float*)cPtr, z);

            aPtr += 2;
            cPtr += 2;
        }
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 2; ++i) {
        aVal = _mm_loadu_ps((float*)aPtr);

        yl = _mm_moveldup_ps(phase_Val);
        yh = _mm_movehdup_ps(phase_Val);
        ylp = _mm_moveldup_ps(inc_Val);
        yhp = _mm_movehdup_ps(inc_Val);

        tmp1 = _mm_mul_ps(aVal, yl);

        tmp1p = _mm_mul_ps(phase_Val, ylp);

        aVal = _mm_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm_mul_ps(aVal, yh);
        tmp2p = _mm_mul_ps(phase_Val, yhp);

        z = _mm_addsub_ps(tmp1, tmp2);
        phase_Val = _mm_addsub_ps(tmp1p, tmp2p);

        _mm_storeu_ps((float*)cPtr, z);

        aPtr += 2;
        cPtr += 2;
    }
    if (i) {
        tmp1 = _mm_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm_hadd_ps(tmp1, tmp1);
        tmp1 = _mm_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm_sqrt_ps(tmp1);
        phase_Val = _mm_div_ps(phase_Val, tmp2);
    }

    _mm_storeu_ps((float*)phase_Ptr, phase_Val);
    if (num_points & 1) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_SSE4_1 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

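/* The AVX variants process four complex samples per __m256 register and
 * delegate the complex multiply and the phase renormalization to the
 * _mm256_complexmul_ps / _mm256_normalize_ps helpers from
 * volk_avx_intrinsics.h.  The remaining num_points % 4 samples are handed to
 * the generic kernel so the stored phase stays continuous. */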
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Let the generic kernel rotate the remaining num_points % 4 samples
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx(lv_32fc_t* outVector,
                                                         const lv_32fc_t* inVector,
                                                         const lv_32fc_t phase_inc,
                                                         lv_32fc_t* phase,
                                                         unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = lv_cmake(1.0f, 0.0f);
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, z;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);

    const __m256 inc_Val = _mm256_set_ps(lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr),
                                         lv_cimag(incr),
                                         lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); ++i) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            z = _mm256_complexmul_ps(aVal, phase_Val);
            phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        z = _mm256_complexmul_ps(aVal, phase_Val);
        phase_Val = _mm256_complexmul_ps(phase_Val, inc_Val);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        phase_Val = _mm256_normalize_ps(phase_Val);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    (*phase) = phase_Ptr[0];
    // Let the generic kernel rotate the remaining num_points % 4 samples
    volk_32fc_s32fc_x2_rotator_32fc_generic(cPtr, aPtr, phase_inc, phase, num_points % 4);
}

#endif /* LV_HAVE_AVX */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

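/* Same structure as the plain AVX kernels, but the multiply by the duplicated
 * real parts and the add/subtract of the cross terms are fused with
 * _mm256_fmaddsub_ps, saving a separate multiply per complex product. */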
static inline void volk_32fc_s32fc_x2_rotator_32fc_a_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    __VOLK_ATTR_ALIGNED(32)
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_load_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_load_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_store_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_load_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_store_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_store_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */

#if LV_HAVE_AVX && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32fc_s32fc_x2_rotator_32fc_u_avx_fma(lv_32fc_t* outVector,
                                                             const lv_32fc_t* inVector,
                                                             const lv_32fc_t phase_inc,
                                                             lv_32fc_t* phase,
                                                             unsigned int num_points)
{
    lv_32fc_t* cPtr = outVector;
    const lv_32fc_t* aPtr = inVector;
    lv_32fc_t incr = 1;
    lv_32fc_t phase_Ptr[4] = { (*phase), (*phase), (*phase), (*phase) };

    unsigned int i, j = 0;

    for (i = 0; i < 4; ++i) {
        phase_Ptr[i] *= incr;
        incr *= (phase_inc);
    }

    __m256 aVal, phase_Val, inc_Val, yl, yh, tmp1, tmp2, z, ylp, yhp, tmp1p, tmp2p;

    phase_Val = _mm256_loadu_ps((float*)phase_Ptr);
    inc_Val = _mm256_set_ps(lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr),
                            lv_cimag(incr),
                            lv_creal(incr));

    for (i = 0; i < (unsigned int)(num_points / ROTATOR_RELOAD); i++) {
        for (j = 0; j < ROTATOR_RELOAD_4; ++j) {

            aVal = _mm256_loadu_ps((float*)aPtr);

            yl = _mm256_moveldup_ps(phase_Val);
            yh = _mm256_movehdup_ps(phase_Val);
            ylp = _mm256_moveldup_ps(inc_Val);
            yhp = _mm256_movehdup_ps(inc_Val);

            tmp1 = aVal;
            tmp1p = phase_Val;

            aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
            phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
            tmp2 = _mm256_mul_ps(aVal, yh);
            tmp2p = _mm256_mul_ps(phase_Val, yhp);

            z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
            phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

            _mm256_storeu_ps((float*)cPtr, z);

            aPtr += 4;
            cPtr += 4;
        }
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }
    for (i = 0; i < (num_points % ROTATOR_RELOAD) / 4; ++i) {
        aVal = _mm256_loadu_ps((float*)aPtr);

        yl = _mm256_moveldup_ps(phase_Val);
        yh = _mm256_movehdup_ps(phase_Val);
        ylp = _mm256_moveldup_ps(inc_Val);
        yhp = _mm256_movehdup_ps(inc_Val);

        tmp1 = aVal;
        tmp1p = phase_Val;

        aVal = _mm256_shuffle_ps(aVal, aVal, 0xB1);
        phase_Val = _mm256_shuffle_ps(phase_Val, phase_Val, 0xB1);
        tmp2 = _mm256_mul_ps(aVal, yh);
        tmp2p = _mm256_mul_ps(phase_Val, yhp);

        z = _mm256_fmaddsub_ps(tmp1, yl, tmp2);
        phase_Val = _mm256_fmaddsub_ps(tmp1p, ylp, tmp2p);

        _mm256_storeu_ps((float*)cPtr, z);

        aPtr += 4;
        cPtr += 4;
    }
    if (i) {
        tmp1 = _mm256_mul_ps(phase_Val, phase_Val);
        tmp2 = _mm256_hadd_ps(tmp1, tmp1);
        tmp1 = _mm256_shuffle_ps(tmp2, tmp2, 0xD8);
        tmp2 = _mm256_sqrt_ps(tmp1);
        phase_Val = _mm256_div_ps(phase_Val, tmp2);
    }

    _mm256_storeu_ps((float*)phase_Ptr, phase_Val);
    for (i = 0; i < num_points % 4; ++i) {
        *cPtr++ = *aPtr++ * phase_Ptr[0];
        phase_Ptr[0] *= (phase_inc);
    }

    (*phase) = phase_Ptr[0];
}

#endif /* LV_HAVE_AVX && LV_HAVE_FMA */

#endif /* INCLUDED_volk_32fc_s32fc_rotator_32fc_a_H */