Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16i_32fc_dot_prod_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
58#ifndef INCLUDED_volk_16i_32fc_dot_prod_32fc_H
59#define INCLUDED_volk_16i_32fc_dot_prod_32fc_H
60
61#include <stdio.h>
62#include <volk/volk_common.h>
63
64
65#ifdef LV_HAVE_GENERIC
66
68 const short* input,
69 const lv_32fc_t* taps,
70 unsigned int num_points)
71{
72
73 static const int N_UNROLL = 4;
74
75 lv_32fc_t acc0 = 0;
76 lv_32fc_t acc1 = 0;
77 lv_32fc_t acc2 = 0;
78 lv_32fc_t acc3 = 0;
79
80 unsigned i = 0;
81 unsigned n = (num_points / N_UNROLL) * N_UNROLL;
82
83 for (i = 0; i < n; i += N_UNROLL) {
84 acc0 += taps[i + 0] * (float)input[i + 0];
85 acc1 += taps[i + 1] * (float)input[i + 1];
86 acc2 += taps[i + 2] * (float)input[i + 2];
87 acc3 += taps[i + 3] * (float)input[i + 3];
88 }
89
90 for (; i < num_points; i++) {
91 acc0 += taps[i] * (float)input[i];
92 }
93
94 *result = acc0 + acc1 + acc2 + acc3;
95}
96
97#endif /*LV_HAVE_GENERIC*/
98
99#ifdef LV_HAVE_NEON
100#include <arm_neon.h>
102 const short* input,
103 const lv_32fc_t* taps,
104 unsigned int num_points)
105{
106
107 unsigned ii;
108 unsigned quarter_points = num_points / 4;
109 lv_32fc_t* tapsPtr = (lv_32fc_t*)taps;
110 short* inputPtr = (short*)input;
111 lv_32fc_t accumulator_vec[4];
112
113 float32x4x2_t tapsVal, accumulator_val;
114 int16x4_t input16;
115 int32x4_t input32;
116 float32x4_t input_float, prod_re, prod_im;
117
118 accumulator_val.val[0] = vdupq_n_f32(0.0);
119 accumulator_val.val[1] = vdupq_n_f32(0.0);
120
121 for (ii = 0; ii < quarter_points; ++ii) {
122 tapsVal = vld2q_f32((float*)tapsPtr);
123 input16 = vld1_s16(inputPtr);
124 // widen 16-bit int to 32-bit int
125 input32 = vmovl_s16(input16);
126 // convert 32-bit int to float with scale
127 input_float = vcvtq_f32_s32(input32);
128
129 prod_re = vmulq_f32(input_float, tapsVal.val[0]);
130 prod_im = vmulq_f32(input_float, tapsVal.val[1]);
131
132 accumulator_val.val[0] = vaddq_f32(prod_re, accumulator_val.val[0]);
133 accumulator_val.val[1] = vaddq_f32(prod_im, accumulator_val.val[1]);
134
135 tapsPtr += 4;
136 inputPtr += 4;
137 }
138 vst2q_f32((float*)accumulator_vec, accumulator_val);
139 accumulator_vec[0] += accumulator_vec[1];
140 accumulator_vec[2] += accumulator_vec[3];
141 accumulator_vec[0] += accumulator_vec[2];
142
143 for (ii = quarter_points * 4; ii < num_points; ++ii) {
144 accumulator_vec[0] += *(tapsPtr++) * (float)(*(inputPtr++));
145 }
146
147 *result = accumulator_vec[0];
148}
149
150#endif /*LV_HAVE_NEON*/
151
152#if LV_HAVE_SSE && LV_HAVE_MMX
153
154static inline void volk_16i_32fc_dot_prod_32fc_u_sse(lv_32fc_t* result,
155 const short* input,
156 const lv_32fc_t* taps,
157 unsigned int num_points)
158{
159
160 unsigned int number = 0;
161 const unsigned int sixteenthPoints = num_points / 8;
162
163 float res[2];
164 float *realpt = &res[0], *imagpt = &res[1];
165 const short* aPtr = input;
166 const float* bPtr = (float*)taps;
167
168 __m64 m0, m1;
169 __m128 f0, f1, f2, f3;
170 __m128 a0Val, a1Val, a2Val, a3Val;
171 __m128 b0Val, b1Val, b2Val, b3Val;
172 __m128 c0Val, c1Val, c2Val, c3Val;
173
174 __m128 dotProdVal0 = _mm_setzero_ps();
175 __m128 dotProdVal1 = _mm_setzero_ps();
176 __m128 dotProdVal2 = _mm_setzero_ps();
177 __m128 dotProdVal3 = _mm_setzero_ps();
178
179 for (; number < sixteenthPoints; number++) {
180
181 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
182 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
183 f0 = _mm_cvtpi16_ps(m0);
184 f1 = _mm_cvtpi16_ps(m0);
185 f2 = _mm_cvtpi16_ps(m1);
186 f3 = _mm_cvtpi16_ps(m1);
187
188 a0Val = _mm_unpacklo_ps(f0, f1);
189 a1Val = _mm_unpackhi_ps(f0, f1);
190 a2Val = _mm_unpacklo_ps(f2, f3);
191 a3Val = _mm_unpackhi_ps(f2, f3);
192
193 b0Val = _mm_loadu_ps(bPtr);
194 b1Val = _mm_loadu_ps(bPtr + 4);
195 b2Val = _mm_loadu_ps(bPtr + 8);
196 b3Val = _mm_loadu_ps(bPtr + 12);
197
198 c0Val = _mm_mul_ps(a0Val, b0Val);
199 c1Val = _mm_mul_ps(a1Val, b1Val);
200 c2Val = _mm_mul_ps(a2Val, b2Val);
201 c3Val = _mm_mul_ps(a3Val, b3Val);
202
203 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
204 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
205 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
206 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
207
208 aPtr += 8;
209 bPtr += 16;
210 }
211
212 _mm_empty(); // clear the mmx technology state
213
214 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
215 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
216 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
217
218 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
219
220 _mm_store_ps(dotProductVector,
221 dotProdVal0); // Store the results back into the dot product vector
222
223 *realpt = dotProductVector[0];
224 *imagpt = dotProductVector[1];
225 *realpt += dotProductVector[2];
226 *imagpt += dotProductVector[3];
227
228 number = sixteenthPoints * 8;
229 for (; number < num_points; number++) {
230 *realpt += ((*aPtr) * (*bPtr++));
231 *imagpt += ((*aPtr++) * (*bPtr++));
232 }
233
234 *result = *(lv_32fc_t*)(&res[0]);
235}
236
237#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
238
239
240#if LV_HAVE_AVX2 && LV_HAVE_FMA
241
242static inline void volk_16i_32fc_dot_prod_32fc_u_avx2_fma(lv_32fc_t* result,
243 const short* input,
244 const lv_32fc_t* taps,
245 unsigned int num_points)
246{
247
248 unsigned int number = 0;
249 const unsigned int sixteenthPoints = num_points / 16;
250
251 float res[2];
252 float *realpt = &res[0], *imagpt = &res[1];
253 const short* aPtr = input;
254 const float* bPtr = (float*)taps;
255
256 __m128i m0, m1;
257 __m256i f0, f1;
258 __m256 g0, g1, h0, h1, h2, h3;
259 __m256 a0Val, a1Val, a2Val, a3Val;
260 __m256 b0Val, b1Val, b2Val, b3Val;
261
262 __m256 dotProdVal0 = _mm256_setzero_ps();
263 __m256 dotProdVal1 = _mm256_setzero_ps();
264 __m256 dotProdVal2 = _mm256_setzero_ps();
265 __m256 dotProdVal3 = _mm256_setzero_ps();
266
267 for (; number < sixteenthPoints; number++) {
268
269 m0 = _mm_loadu_si128((__m128i const*)aPtr);
270 m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
271
272 f0 = _mm256_cvtepi16_epi32(m0);
273 g0 = _mm256_cvtepi32_ps(f0);
274 f1 = _mm256_cvtepi16_epi32(m1);
275 g1 = _mm256_cvtepi32_ps(f1);
276
277 h0 = _mm256_unpacklo_ps(g0, g0);
278 h1 = _mm256_unpackhi_ps(g0, g0);
279 h2 = _mm256_unpacklo_ps(g1, g1);
280 h3 = _mm256_unpackhi_ps(g1, g1);
281
282 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
283 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
284 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
285 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
286
287 b0Val = _mm256_loadu_ps(bPtr);
288 b1Val = _mm256_loadu_ps(bPtr + 8);
289 b2Val = _mm256_loadu_ps(bPtr + 16);
290 b3Val = _mm256_loadu_ps(bPtr + 24);
291
292 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
293 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
294 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
295 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
296
297 aPtr += 16;
298 bPtr += 32;
299 }
300
301 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
302 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
303 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
304
305 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
306
307 _mm256_store_ps(dotProductVector,
308 dotProdVal0); // Store the results back into the dot product vector
309
310 *realpt = dotProductVector[0];
311 *imagpt = dotProductVector[1];
312 *realpt += dotProductVector[2];
313 *imagpt += dotProductVector[3];
314 *realpt += dotProductVector[4];
315 *imagpt += dotProductVector[5];
316 *realpt += dotProductVector[6];
317 *imagpt += dotProductVector[7];
318
319 number = sixteenthPoints * 16;
320 for (; number < num_points; number++) {
321 *realpt += ((*aPtr) * (*bPtr++));
322 *imagpt += ((*aPtr++) * (*bPtr++));
323 }
324
325 *result = *(lv_32fc_t*)(&res[0]);
326}
327
328#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
329
330
331#ifdef LV_HAVE_AVX2
332
333static inline void volk_16i_32fc_dot_prod_32fc_u_avx2(lv_32fc_t* result,
334 const short* input,
335 const lv_32fc_t* taps,
336 unsigned int num_points)
337{
338
339 unsigned int number = 0;
340 const unsigned int sixteenthPoints = num_points / 16;
341
342 float res[2];
343 float *realpt = &res[0], *imagpt = &res[1];
344 const short* aPtr = input;
345 const float* bPtr = (float*)taps;
346
347 __m128i m0, m1;
348 __m256i f0, f1;
349 __m256 g0, g1, h0, h1, h2, h3;
350 __m256 a0Val, a1Val, a2Val, a3Val;
351 __m256 b0Val, b1Val, b2Val, b3Val;
352 __m256 c0Val, c1Val, c2Val, c3Val;
353
354 __m256 dotProdVal0 = _mm256_setzero_ps();
355 __m256 dotProdVal1 = _mm256_setzero_ps();
356 __m256 dotProdVal2 = _mm256_setzero_ps();
357 __m256 dotProdVal3 = _mm256_setzero_ps();
358
359 for (; number < sixteenthPoints; number++) {
360
361 m0 = _mm_loadu_si128((__m128i const*)aPtr);
362 m1 = _mm_loadu_si128((__m128i const*)(aPtr + 8));
363
364 f0 = _mm256_cvtepi16_epi32(m0);
365 g0 = _mm256_cvtepi32_ps(f0);
366 f1 = _mm256_cvtepi16_epi32(m1);
367 g1 = _mm256_cvtepi32_ps(f1);
368
369 h0 = _mm256_unpacklo_ps(g0, g0);
370 h1 = _mm256_unpackhi_ps(g0, g0);
371 h2 = _mm256_unpacklo_ps(g1, g1);
372 h3 = _mm256_unpackhi_ps(g1, g1);
373
374 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
375 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
376 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
377 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
378
379 b0Val = _mm256_loadu_ps(bPtr);
380 b1Val = _mm256_loadu_ps(bPtr + 8);
381 b2Val = _mm256_loadu_ps(bPtr + 16);
382 b3Val = _mm256_loadu_ps(bPtr + 24);
383
384 c0Val = _mm256_mul_ps(a0Val, b0Val);
385 c1Val = _mm256_mul_ps(a1Val, b1Val);
386 c2Val = _mm256_mul_ps(a2Val, b2Val);
387 c3Val = _mm256_mul_ps(a3Val, b3Val);
388
389 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
390 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
391 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
392 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
393
394 aPtr += 16;
395 bPtr += 32;
396 }
397
398 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
399 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
400 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
401
402 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
403
404 _mm256_store_ps(dotProductVector,
405 dotProdVal0); // Store the results back into the dot product vector
406
407 *realpt = dotProductVector[0];
408 *imagpt = dotProductVector[1];
409 *realpt += dotProductVector[2];
410 *imagpt += dotProductVector[3];
411 *realpt += dotProductVector[4];
412 *imagpt += dotProductVector[5];
413 *realpt += dotProductVector[6];
414 *imagpt += dotProductVector[7];
415
416 number = sixteenthPoints * 16;
417 for (; number < num_points; number++) {
418 *realpt += ((*aPtr) * (*bPtr++));
419 *imagpt += ((*aPtr++) * (*bPtr++));
420 }
421
422 *result = *(lv_32fc_t*)(&res[0]);
423}
424
425#endif /*LV_HAVE_AVX2*/
426
427
428#if LV_HAVE_SSE && LV_HAVE_MMX
429
430
431static inline void volk_16i_32fc_dot_prod_32fc_a_sse(lv_32fc_t* result,
432 const short* input,
433 const lv_32fc_t* taps,
434 unsigned int num_points)
435{
436
437 unsigned int number = 0;
438 const unsigned int sixteenthPoints = num_points / 8;
439
440 float res[2];
441 float *realpt = &res[0], *imagpt = &res[1];
442 const short* aPtr = input;
443 const float* bPtr = (float*)taps;
444
445 __m64 m0, m1;
446 __m128 f0, f1, f2, f3;
447 __m128 a0Val, a1Val, a2Val, a3Val;
448 __m128 b0Val, b1Val, b2Val, b3Val;
449 __m128 c0Val, c1Val, c2Val, c3Val;
450
451 __m128 dotProdVal0 = _mm_setzero_ps();
452 __m128 dotProdVal1 = _mm_setzero_ps();
453 __m128 dotProdVal2 = _mm_setzero_ps();
454 __m128 dotProdVal3 = _mm_setzero_ps();
455
456 for (; number < sixteenthPoints; number++) {
457
458 m0 = _mm_set_pi16(*(aPtr + 3), *(aPtr + 2), *(aPtr + 1), *(aPtr + 0));
459 m1 = _mm_set_pi16(*(aPtr + 7), *(aPtr + 6), *(aPtr + 5), *(aPtr + 4));
460 f0 = _mm_cvtpi16_ps(m0);
461 f1 = _mm_cvtpi16_ps(m0);
462 f2 = _mm_cvtpi16_ps(m1);
463 f3 = _mm_cvtpi16_ps(m1);
464
465 a0Val = _mm_unpacklo_ps(f0, f1);
466 a1Val = _mm_unpackhi_ps(f0, f1);
467 a2Val = _mm_unpacklo_ps(f2, f3);
468 a3Val = _mm_unpackhi_ps(f2, f3);
469
470 b0Val = _mm_load_ps(bPtr);
471 b1Val = _mm_load_ps(bPtr + 4);
472 b2Val = _mm_load_ps(bPtr + 8);
473 b3Val = _mm_load_ps(bPtr + 12);
474
475 c0Val = _mm_mul_ps(a0Val, b0Val);
476 c1Val = _mm_mul_ps(a1Val, b1Val);
477 c2Val = _mm_mul_ps(a2Val, b2Val);
478 c3Val = _mm_mul_ps(a3Val, b3Val);
479
480 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
481 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
482 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
483 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
484
485 aPtr += 8;
486 bPtr += 16;
487 }
488
489 _mm_empty(); // clear the mmx technology state
490
491 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
492 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
493 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
494
495 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
496
497 _mm_store_ps(dotProductVector,
498 dotProdVal0); // Store the results back into the dot product vector
499
500 *realpt = dotProductVector[0];
501 *imagpt = dotProductVector[1];
502 *realpt += dotProductVector[2];
503 *imagpt += dotProductVector[3];
504
505 number = sixteenthPoints * 8;
506 for (; number < num_points; number++) {
507 *realpt += ((*aPtr) * (*bPtr++));
508 *imagpt += ((*aPtr++) * (*bPtr++));
509 }
510
511 *result = *(lv_32fc_t*)(&res[0]);
512}
513
514#endif /*LV_HAVE_SSE && LV_HAVE_MMX*/
515
516#ifdef LV_HAVE_AVX2
517
518static inline void volk_16i_32fc_dot_prod_32fc_a_avx2(lv_32fc_t* result,
519 const short* input,
520 const lv_32fc_t* taps,
521 unsigned int num_points)
522{
523
524 unsigned int number = 0;
525 const unsigned int sixteenthPoints = num_points / 16;
526
527 float res[2];
528 float *realpt = &res[0], *imagpt = &res[1];
529 const short* aPtr = input;
530 const float* bPtr = (float*)taps;
531
532 __m128i m0, m1;
533 __m256i f0, f1;
534 __m256 g0, g1, h0, h1, h2, h3;
535 __m256 a0Val, a1Val, a2Val, a3Val;
536 __m256 b0Val, b1Val, b2Val, b3Val;
537 __m256 c0Val, c1Val, c2Val, c3Val;
538
539 __m256 dotProdVal0 = _mm256_setzero_ps();
540 __m256 dotProdVal1 = _mm256_setzero_ps();
541 __m256 dotProdVal2 = _mm256_setzero_ps();
542 __m256 dotProdVal3 = _mm256_setzero_ps();
543
544 for (; number < sixteenthPoints; number++) {
545
546 m0 = _mm_load_si128((__m128i const*)aPtr);
547 m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
548
549 f0 = _mm256_cvtepi16_epi32(m0);
550 g0 = _mm256_cvtepi32_ps(f0);
551 f1 = _mm256_cvtepi16_epi32(m1);
552 g1 = _mm256_cvtepi32_ps(f1);
553
554 h0 = _mm256_unpacklo_ps(g0, g0);
555 h1 = _mm256_unpackhi_ps(g0, g0);
556 h2 = _mm256_unpacklo_ps(g1, g1);
557 h3 = _mm256_unpackhi_ps(g1, g1);
558
559 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
560 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
561 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
562 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
563
564 b0Val = _mm256_load_ps(bPtr);
565 b1Val = _mm256_load_ps(bPtr + 8);
566 b2Val = _mm256_load_ps(bPtr + 16);
567 b3Val = _mm256_load_ps(bPtr + 24);
568
569 c0Val = _mm256_mul_ps(a0Val, b0Val);
570 c1Val = _mm256_mul_ps(a1Val, b1Val);
571 c2Val = _mm256_mul_ps(a2Val, b2Val);
572 c3Val = _mm256_mul_ps(a3Val, b3Val);
573
574 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
575 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
576 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
577 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
578
579 aPtr += 16;
580 bPtr += 32;
581 }
582
583 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
584 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
585 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
586
587 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
588
589 _mm256_store_ps(dotProductVector,
590 dotProdVal0); // Store the results back into the dot product vector
591
592 *realpt = dotProductVector[0];
593 *imagpt = dotProductVector[1];
594 *realpt += dotProductVector[2];
595 *imagpt += dotProductVector[3];
596 *realpt += dotProductVector[4];
597 *imagpt += dotProductVector[5];
598 *realpt += dotProductVector[6];
599 *imagpt += dotProductVector[7];
600
601 number = sixteenthPoints * 16;
602 for (; number < num_points; number++) {
603 *realpt += ((*aPtr) * (*bPtr++));
604 *imagpt += ((*aPtr++) * (*bPtr++));
605 }
606
607 *result = *(lv_32fc_t*)(&res[0]);
608}
609
610
611#endif /*LV_HAVE_AVX2*/
612
613#if LV_HAVE_AVX2 && LV_HAVE_FMA
614
615static inline void volk_16i_32fc_dot_prod_32fc_a_avx2_fma(lv_32fc_t* result,
616 const short* input,
617 const lv_32fc_t* taps,
618 unsigned int num_points)
619{
620
621 unsigned int number = 0;
622 const unsigned int sixteenthPoints = num_points / 16;
623
624 float res[2];
625 float *realpt = &res[0], *imagpt = &res[1];
626 const short* aPtr = input;
627 const float* bPtr = (float*)taps;
628
629 __m128i m0, m1;
630 __m256i f0, f1;
631 __m256 g0, g1, h0, h1, h2, h3;
632 __m256 a0Val, a1Val, a2Val, a3Val;
633 __m256 b0Val, b1Val, b2Val, b3Val;
634
635 __m256 dotProdVal0 = _mm256_setzero_ps();
636 __m256 dotProdVal1 = _mm256_setzero_ps();
637 __m256 dotProdVal2 = _mm256_setzero_ps();
638 __m256 dotProdVal3 = _mm256_setzero_ps();
639
640 for (; number < sixteenthPoints; number++) {
641
642 m0 = _mm_load_si128((__m128i const*)aPtr);
643 m1 = _mm_load_si128((__m128i const*)(aPtr + 8));
644
645 f0 = _mm256_cvtepi16_epi32(m0);
646 g0 = _mm256_cvtepi32_ps(f0);
647 f1 = _mm256_cvtepi16_epi32(m1);
648 g1 = _mm256_cvtepi32_ps(f1);
649
650 h0 = _mm256_unpacklo_ps(g0, g0);
651 h1 = _mm256_unpackhi_ps(g0, g0);
652 h2 = _mm256_unpacklo_ps(g1, g1);
653 h3 = _mm256_unpackhi_ps(g1, g1);
654
655 a0Val = _mm256_permute2f128_ps(h0, h1, 0x20);
656 a1Val = _mm256_permute2f128_ps(h0, h1, 0x31);
657 a2Val = _mm256_permute2f128_ps(h2, h3, 0x20);
658 a3Val = _mm256_permute2f128_ps(h2, h3, 0x31);
659
660 b0Val = _mm256_load_ps(bPtr);
661 b1Val = _mm256_load_ps(bPtr + 8);
662 b2Val = _mm256_load_ps(bPtr + 16);
663 b3Val = _mm256_load_ps(bPtr + 24);
664
665 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
666 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
667 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
668 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
669
670 aPtr += 16;
671 bPtr += 32;
672 }
673
674 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
675 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
676 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
677
678 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
679
680 _mm256_store_ps(dotProductVector,
681 dotProdVal0); // Store the results back into the dot product vector
682
683 *realpt = dotProductVector[0];
684 *imagpt = dotProductVector[1];
685 *realpt += dotProductVector[2];
686 *imagpt += dotProductVector[3];
687 *realpt += dotProductVector[4];
688 *imagpt += dotProductVector[5];
689 *realpt += dotProductVector[6];
690 *imagpt += dotProductVector[7];
691
692 number = sixteenthPoints * 16;
693 for (; number < num_points; number++) {
694 *realpt += ((*aPtr) * (*bPtr++));
695 *imagpt += ((*aPtr++) * (*bPtr++));
696 }
697
698 *result = *(lv_32fc_t*)(&res[0]);
699}
700
701
702#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
703
704
705#endif /*INCLUDED_volk_16i_32fc_dot_prod_32fc_H*/
static void volk_16i_32fc_dot_prod_32fc_neon(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:101
static void volk_16i_32fc_dot_prod_32fc_generic(lv_32fc_t *result, const short *input, const lv_32fc_t *taps, unsigned int num_points)
Definition: volk_16i_32fc_dot_prod_32fc.h:67
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25