Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_x2_dot_prod_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
58#ifndef INCLUDED_volk_32f_x2_dot_prod_16i_H
59#define INCLUDED_volk_32f_x2_dot_prod_16i_H
60
61#include <stdio.h>
62#include <volk/volk_common.h>
63
64
65#ifdef LV_HAVE_GENERIC
66
67
/*
 * Portable scalar kernel: dot product of two float vectors, stored as a
 * 16-bit integer.
 *
 * result     - receives the float sum converted with an (int16_t) cast;
 *              no rounding or range clamping is applied before the cast
 * input      - first operand, num_points floats
 * taps       - second operand, num_points floats
 * num_points - number of float pairs to multiply and accumulate
 */
static inline void volk_32f_x2_dot_prod_16i_generic(int16_t* result,
                                                    const float* input,
                                                    const float* taps,
                                                    unsigned int num_points)
{
    float acc = 0;
    unsigned int idx;

    /* Accumulate one product at a time, in increasing index order. */
    for (idx = 0; idx < num_points; idx++) {
        acc += input[idx] * taps[idx];
    }

    *result = (int16_t)acc;
}
85
86#endif /*LV_HAVE_GENERIC*/
87
88
89#ifdef LV_HAVE_SSE
90
91static inline void volk_32f_x2_dot_prod_16i_a_sse(int16_t* result,
92 const float* input,
93 const float* taps,
94 unsigned int num_points)
95{
96
97 unsigned int number = 0;
98 const unsigned int sixteenthPoints = num_points / 16;
99
100 float dotProduct = 0;
101 const float* aPtr = input;
102 const float* bPtr = taps;
103
104 __m128 a0Val, a1Val, a2Val, a3Val;
105 __m128 b0Val, b1Val, b2Val, b3Val;
106 __m128 c0Val, c1Val, c2Val, c3Val;
107
108 __m128 dotProdVal0 = _mm_setzero_ps();
109 __m128 dotProdVal1 = _mm_setzero_ps();
110 __m128 dotProdVal2 = _mm_setzero_ps();
111 __m128 dotProdVal3 = _mm_setzero_ps();
112
113 for (; number < sixteenthPoints; number++) {
114
115 a0Val = _mm_load_ps(aPtr);
116 a1Val = _mm_load_ps(aPtr + 4);
117 a2Val = _mm_load_ps(aPtr + 8);
118 a3Val = _mm_load_ps(aPtr + 12);
119 b0Val = _mm_load_ps(bPtr);
120 b1Val = _mm_load_ps(bPtr + 4);
121 b2Val = _mm_load_ps(bPtr + 8);
122 b3Val = _mm_load_ps(bPtr + 12);
123
124 c0Val = _mm_mul_ps(a0Val, b0Val);
125 c1Val = _mm_mul_ps(a1Val, b1Val);
126 c2Val = _mm_mul_ps(a2Val, b2Val);
127 c3Val = _mm_mul_ps(a3Val, b3Val);
128
129 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
130 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
131 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
132 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
133
134 aPtr += 16;
135 bPtr += 16;
136 }
137
138 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
139 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
140 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
141
142 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
143
144 _mm_store_ps(dotProductVector,
145 dotProdVal0); // Store the results back into the dot product vector
146
147 dotProduct = dotProductVector[0];
148 dotProduct += dotProductVector[1];
149 dotProduct += dotProductVector[2];
150 dotProduct += dotProductVector[3];
151
152 number = sixteenthPoints * 16;
153 for (; number < num_points; number++) {
154 dotProduct += ((*aPtr++) * (*bPtr++));
155 }
156
157 *result = (short)dotProduct;
158}
159
160#endif /*LV_HAVE_SSE*/
161
162
163#if LV_HAVE_AVX2 && LV_HAVE_FMA
164
165static inline void volk_32f_x2_dot_prod_16i_a_avx2_fma(int16_t* result,
166 const float* input,
167 const float* taps,
168 unsigned int num_points)
169{
170
171 unsigned int number = 0;
172 const unsigned int thirtysecondPoints = num_points / 32;
173
174 float dotProduct = 0;
175 const float* aPtr = input;
176 const float* bPtr = taps;
177
178 __m256 a0Val, a1Val, a2Val, a3Val;
179 __m256 b0Val, b1Val, b2Val, b3Val;
180
181 __m256 dotProdVal0 = _mm256_setzero_ps();
182 __m256 dotProdVal1 = _mm256_setzero_ps();
183 __m256 dotProdVal2 = _mm256_setzero_ps();
184 __m256 dotProdVal3 = _mm256_setzero_ps();
185
186 for (; number < thirtysecondPoints; number++) {
187
188 a0Val = _mm256_load_ps(aPtr);
189 a1Val = _mm256_load_ps(aPtr + 8);
190 a2Val = _mm256_load_ps(aPtr + 16);
191 a3Val = _mm256_load_ps(aPtr + 24);
192 b0Val = _mm256_load_ps(bPtr);
193 b1Val = _mm256_load_ps(bPtr + 8);
194 b2Val = _mm256_load_ps(bPtr + 16);
195 b3Val = _mm256_load_ps(bPtr + 24);
196
197 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
198 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
199 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
200 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
201
202 aPtr += 32;
203 bPtr += 32;
204 }
205
206 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
207 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
208 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
209
210 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
211
212 _mm256_store_ps(dotProductVector,
213 dotProdVal0); // Store the results back into the dot product vector
214
215 dotProduct = dotProductVector[0];
216 dotProduct += dotProductVector[1];
217 dotProduct += dotProductVector[2];
218 dotProduct += dotProductVector[3];
219 dotProduct += dotProductVector[4];
220 dotProduct += dotProductVector[5];
221 dotProduct += dotProductVector[6];
222 dotProduct += dotProductVector[7];
223
224 number = thirtysecondPoints * 32;
225 for (; number < num_points; number++) {
226 dotProduct += ((*aPtr++) * (*bPtr++));
227 }
228
229 *result = (short)dotProduct;
230}
231
232#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
233
234
235#ifdef LV_HAVE_AVX
236
237static inline void volk_32f_x2_dot_prod_16i_a_avx(int16_t* result,
238 const float* input,
239 const float* taps,
240 unsigned int num_points)
241{
242
243 unsigned int number = 0;
244 const unsigned int thirtysecondPoints = num_points / 32;
245
246 float dotProduct = 0;
247 const float* aPtr = input;
248 const float* bPtr = taps;
249
250 __m256 a0Val, a1Val, a2Val, a3Val;
251 __m256 b0Val, b1Val, b2Val, b3Val;
252 __m256 c0Val, c1Val, c2Val, c3Val;
253
254 __m256 dotProdVal0 = _mm256_setzero_ps();
255 __m256 dotProdVal1 = _mm256_setzero_ps();
256 __m256 dotProdVal2 = _mm256_setzero_ps();
257 __m256 dotProdVal3 = _mm256_setzero_ps();
258
259 for (; number < thirtysecondPoints; number++) {
260
261 a0Val = _mm256_load_ps(aPtr);
262 a1Val = _mm256_load_ps(aPtr + 8);
263 a2Val = _mm256_load_ps(aPtr + 16);
264 a3Val = _mm256_load_ps(aPtr + 24);
265 b0Val = _mm256_load_ps(bPtr);
266 b1Val = _mm256_load_ps(bPtr + 8);
267 b2Val = _mm256_load_ps(bPtr + 16);
268 b3Val = _mm256_load_ps(bPtr + 24);
269
270 c0Val = _mm256_mul_ps(a0Val, b0Val);
271 c1Val = _mm256_mul_ps(a1Val, b1Val);
272 c2Val = _mm256_mul_ps(a2Val, b2Val);
273 c3Val = _mm256_mul_ps(a3Val, b3Val);
274
275 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
276 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
277 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
278 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
279
280 aPtr += 32;
281 bPtr += 32;
282 }
283
284 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
285 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
286 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
287
288 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
289
290 _mm256_store_ps(dotProductVector,
291 dotProdVal0); // Store the results back into the dot product vector
292
293 dotProduct = dotProductVector[0];
294 dotProduct += dotProductVector[1];
295 dotProduct += dotProductVector[2];
296 dotProduct += dotProductVector[3];
297 dotProduct += dotProductVector[4];
298 dotProduct += dotProductVector[5];
299 dotProduct += dotProductVector[6];
300 dotProduct += dotProductVector[7];
301
302 number = thirtysecondPoints * 32;
303 for (; number < num_points; number++) {
304 dotProduct += ((*aPtr++) * (*bPtr++));
305 }
306
307 *result = (short)dotProduct;
308}
309
310#endif /*LV_HAVE_AVX*/
311
312#ifdef LV_HAVE_AVX512F
313
314static inline void volk_32f_x2_dot_prod_16i_a_avx512f(int16_t* result,
315 const float* input,
316 const float* taps,
317 unsigned int num_points)
318{
319
320 unsigned int number = 0;
321 const unsigned int sixtyfourthPoints = num_points / 64;
322
323 float dotProduct = 0;
324 const float* aPtr = input;
325 const float* bPtr = taps;
326
327 __m512 a0Val, a1Val, a2Val, a3Val;
328 __m512 b0Val, b1Val, b2Val, b3Val;
329
330 __m512 dotProdVal0 = _mm512_setzero_ps();
331 __m512 dotProdVal1 = _mm512_setzero_ps();
332 __m512 dotProdVal2 = _mm512_setzero_ps();
333 __m512 dotProdVal3 = _mm512_setzero_ps();
334
335 for (; number < sixtyfourthPoints; number++) {
336
337 a0Val = _mm512_load_ps(aPtr);
338 a1Val = _mm512_load_ps(aPtr + 16);
339 a2Val = _mm512_load_ps(aPtr + 32);
340 a3Val = _mm512_load_ps(aPtr + 48);
341 b0Val = _mm512_load_ps(bPtr);
342 b1Val = _mm512_load_ps(bPtr + 16);
343 b2Val = _mm512_load_ps(bPtr + 32);
344 b3Val = _mm512_load_ps(bPtr + 48);
345
346 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
347 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
348 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
349 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
350
351 aPtr += 64;
352 bPtr += 64;
353 }
354
355 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
356 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
357 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
358
359 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
360
361 _mm512_store_ps(dotProductVector,
362 dotProdVal0); // Store the results back into the dot product vector
363
364 dotProduct = dotProductVector[0];
365 dotProduct += dotProductVector[1];
366 dotProduct += dotProductVector[2];
367 dotProduct += dotProductVector[3];
368 dotProduct += dotProductVector[4];
369 dotProduct += dotProductVector[5];
370 dotProduct += dotProductVector[6];
371 dotProduct += dotProductVector[7];
372 dotProduct += dotProductVector[8];
373 dotProduct += dotProductVector[9];
374 dotProduct += dotProductVector[10];
375 dotProduct += dotProductVector[11];
376 dotProduct += dotProductVector[12];
377 dotProduct += dotProductVector[13];
378 dotProduct += dotProductVector[14];
379 dotProduct += dotProductVector[15];
380
381 number = sixtyfourthPoints * 64;
382 for (; number < num_points; number++) {
383 dotProduct += ((*aPtr++) * (*bPtr++));
384 }
385
386 *result = (short)dotProduct;
387}
388
389#endif /*LV_HAVE_AVX512F*/
390
391
392#ifdef LV_HAVE_SSE
393
394static inline void volk_32f_x2_dot_prod_16i_u_sse(int16_t* result,
395 const float* input,
396 const float* taps,
397 unsigned int num_points)
398{
399
400 unsigned int number = 0;
401 const unsigned int sixteenthPoints = num_points / 16;
402
403 float dotProduct = 0;
404 const float* aPtr = input;
405 const float* bPtr = taps;
406
407 __m128 a0Val, a1Val, a2Val, a3Val;
408 __m128 b0Val, b1Val, b2Val, b3Val;
409 __m128 c0Val, c1Val, c2Val, c3Val;
410
411 __m128 dotProdVal0 = _mm_setzero_ps();
412 __m128 dotProdVal1 = _mm_setzero_ps();
413 __m128 dotProdVal2 = _mm_setzero_ps();
414 __m128 dotProdVal3 = _mm_setzero_ps();
415
416 for (; number < sixteenthPoints; number++) {
417
418 a0Val = _mm_loadu_ps(aPtr);
419 a1Val = _mm_loadu_ps(aPtr + 4);
420 a2Val = _mm_loadu_ps(aPtr + 8);
421 a3Val = _mm_loadu_ps(aPtr + 12);
422 b0Val = _mm_loadu_ps(bPtr);
423 b1Val = _mm_loadu_ps(bPtr + 4);
424 b2Val = _mm_loadu_ps(bPtr + 8);
425 b3Val = _mm_loadu_ps(bPtr + 12);
426
427 c0Val = _mm_mul_ps(a0Val, b0Val);
428 c1Val = _mm_mul_ps(a1Val, b1Val);
429 c2Val = _mm_mul_ps(a2Val, b2Val);
430 c3Val = _mm_mul_ps(a3Val, b3Val);
431
432 dotProdVal0 = _mm_add_ps(c0Val, dotProdVal0);
433 dotProdVal1 = _mm_add_ps(c1Val, dotProdVal1);
434 dotProdVal2 = _mm_add_ps(c2Val, dotProdVal2);
435 dotProdVal3 = _mm_add_ps(c3Val, dotProdVal3);
436
437 aPtr += 16;
438 bPtr += 16;
439 }
440
441 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal1);
442 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal2);
443 dotProdVal0 = _mm_add_ps(dotProdVal0, dotProdVal3);
444
445 __VOLK_ATTR_ALIGNED(16) float dotProductVector[4];
446
447 _mm_store_ps(dotProductVector,
448 dotProdVal0); // Store the results back into the dot product vector
449
450 dotProduct = dotProductVector[0];
451 dotProduct += dotProductVector[1];
452 dotProduct += dotProductVector[2];
453 dotProduct += dotProductVector[3];
454
455 number = sixteenthPoints * 16;
456 for (; number < num_points; number++) {
457 dotProduct += ((*aPtr++) * (*bPtr++));
458 }
459
460 *result = (short)dotProduct;
461}
462
463#endif /*LV_HAVE_SSE*/
464
465
466#if LV_HAVE_AVX2 && LV_HAVE_FMA
467
468static inline void volk_32f_x2_dot_prod_16i_u_avx2_fma(int16_t* result,
469 const float* input,
470 const float* taps,
471 unsigned int num_points)
472{
473
474 unsigned int number = 0;
475 const unsigned int thirtysecondPoints = num_points / 32;
476
477 float dotProduct = 0;
478 const float* aPtr = input;
479 const float* bPtr = taps;
480
481 __m256 a0Val, a1Val, a2Val, a3Val;
482 __m256 b0Val, b1Val, b2Val, b3Val;
483
484 __m256 dotProdVal0 = _mm256_setzero_ps();
485 __m256 dotProdVal1 = _mm256_setzero_ps();
486 __m256 dotProdVal2 = _mm256_setzero_ps();
487 __m256 dotProdVal3 = _mm256_setzero_ps();
488
489 for (; number < thirtysecondPoints; number++) {
490
491 a0Val = _mm256_loadu_ps(aPtr);
492 a1Val = _mm256_loadu_ps(aPtr + 8);
493 a2Val = _mm256_loadu_ps(aPtr + 16);
494 a3Val = _mm256_loadu_ps(aPtr + 24);
495 b0Val = _mm256_loadu_ps(bPtr);
496 b1Val = _mm256_loadu_ps(bPtr + 8);
497 b2Val = _mm256_loadu_ps(bPtr + 16);
498 b3Val = _mm256_loadu_ps(bPtr + 24);
499
500 dotProdVal0 = _mm256_fmadd_ps(a0Val, b0Val, dotProdVal0);
501 dotProdVal1 = _mm256_fmadd_ps(a1Val, b1Val, dotProdVal1);
502 dotProdVal2 = _mm256_fmadd_ps(a2Val, b2Val, dotProdVal2);
503 dotProdVal3 = _mm256_fmadd_ps(a3Val, b3Val, dotProdVal3);
504
505 aPtr += 32;
506 bPtr += 32;
507 }
508
509 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
510 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
511 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
512
513 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
514
515 _mm256_store_ps(dotProductVector,
516 dotProdVal0); // Store the results back into the dot product vector
517
518 dotProduct = dotProductVector[0];
519 dotProduct += dotProductVector[1];
520 dotProduct += dotProductVector[2];
521 dotProduct += dotProductVector[3];
522 dotProduct += dotProductVector[4];
523 dotProduct += dotProductVector[5];
524 dotProduct += dotProductVector[6];
525 dotProduct += dotProductVector[7];
526
527 number = thirtysecondPoints * 32;
528 for (; number < num_points; number++) {
529 dotProduct += ((*aPtr++) * (*bPtr++));
530 }
531
532 *result = (short)dotProduct;
533}
534
535#endif /*LV_HAVE_AVX2 && LV_HAVE_FMA*/
536
537
538#ifdef LV_HAVE_AVX
539
540static inline void volk_32f_x2_dot_prod_16i_u_avx(int16_t* result,
541 const float* input,
542 const float* taps,
543 unsigned int num_points)
544{
545
546 unsigned int number = 0;
547 const unsigned int thirtysecondPoints = num_points / 32;
548
549 float dotProduct = 0;
550 const float* aPtr = input;
551 const float* bPtr = taps;
552
553 __m256 a0Val, a1Val, a2Val, a3Val;
554 __m256 b0Val, b1Val, b2Val, b3Val;
555 __m256 c0Val, c1Val, c2Val, c3Val;
556
557 __m256 dotProdVal0 = _mm256_setzero_ps();
558 __m256 dotProdVal1 = _mm256_setzero_ps();
559 __m256 dotProdVal2 = _mm256_setzero_ps();
560 __m256 dotProdVal3 = _mm256_setzero_ps();
561
562 for (; number < thirtysecondPoints; number++) {
563
564 a0Val = _mm256_loadu_ps(aPtr);
565 a1Val = _mm256_loadu_ps(aPtr + 8);
566 a2Val = _mm256_loadu_ps(aPtr + 16);
567 a3Val = _mm256_loadu_ps(aPtr + 24);
568 b0Val = _mm256_loadu_ps(bPtr);
569 b1Val = _mm256_loadu_ps(bPtr + 8);
570 b2Val = _mm256_loadu_ps(bPtr + 16);
571 b3Val = _mm256_loadu_ps(bPtr + 24);
572
573 c0Val = _mm256_mul_ps(a0Val, b0Val);
574 c1Val = _mm256_mul_ps(a1Val, b1Val);
575 c2Val = _mm256_mul_ps(a2Val, b2Val);
576 c3Val = _mm256_mul_ps(a3Val, b3Val);
577
578 dotProdVal0 = _mm256_add_ps(c0Val, dotProdVal0);
579 dotProdVal1 = _mm256_add_ps(c1Val, dotProdVal1);
580 dotProdVal2 = _mm256_add_ps(c2Val, dotProdVal2);
581 dotProdVal3 = _mm256_add_ps(c3Val, dotProdVal3);
582
583 aPtr += 32;
584 bPtr += 32;
585 }
586
587 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal1);
588 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal2);
589 dotProdVal0 = _mm256_add_ps(dotProdVal0, dotProdVal3);
590
591 __VOLK_ATTR_ALIGNED(32) float dotProductVector[8];
592
593 _mm256_store_ps(dotProductVector,
594 dotProdVal0); // Store the results back into the dot product vector
595
596 dotProduct = dotProductVector[0];
597 dotProduct += dotProductVector[1];
598 dotProduct += dotProductVector[2];
599 dotProduct += dotProductVector[3];
600 dotProduct += dotProductVector[4];
601 dotProduct += dotProductVector[5];
602 dotProduct += dotProductVector[6];
603 dotProduct += dotProductVector[7];
604
605 number = thirtysecondPoints * 32;
606 for (; number < num_points; number++) {
607 dotProduct += ((*aPtr++) * (*bPtr++));
608 }
609
610 *result = (short)dotProduct;
611}
612
613#endif /*LV_HAVE_AVX*/
614
615#ifdef LV_HAVE_AVX512F
616
617static inline void volk_32f_x2_dot_prod_16i_u_avx512f(int16_t* result,
618 const float* input,
619 const float* taps,
620 unsigned int num_points)
621{
622
623 unsigned int number = 0;
624 const unsigned int sixtyfourthPoints = num_points / 64;
625
626 float dotProduct = 0;
627 const float* aPtr = input;
628 const float* bPtr = taps;
629
630 __m512 a0Val, a1Val, a2Val, a3Val;
631 __m512 b0Val, b1Val, b2Val, b3Val;
632
633 __m512 dotProdVal0 = _mm512_setzero_ps();
634 __m512 dotProdVal1 = _mm512_setzero_ps();
635 __m512 dotProdVal2 = _mm512_setzero_ps();
636 __m512 dotProdVal3 = _mm512_setzero_ps();
637
638 for (; number < sixtyfourthPoints; number++) {
639
640 a0Val = _mm512_loadu_ps(aPtr);
641 a1Val = _mm512_loadu_ps(aPtr + 16);
642 a2Val = _mm512_loadu_ps(aPtr + 32);
643 a3Val = _mm512_loadu_ps(aPtr + 48);
644 b0Val = _mm512_loadu_ps(bPtr);
645 b1Val = _mm512_loadu_ps(bPtr + 16);
646 b2Val = _mm512_loadu_ps(bPtr + 32);
647 b3Val = _mm512_loadu_ps(bPtr + 48);
648
649 dotProdVal0 = _mm512_fmadd_ps(a0Val, b0Val, dotProdVal0);
650 dotProdVal1 = _mm512_fmadd_ps(a1Val, b1Val, dotProdVal1);
651 dotProdVal2 = _mm512_fmadd_ps(a2Val, b2Val, dotProdVal2);
652 dotProdVal3 = _mm512_fmadd_ps(a3Val, b3Val, dotProdVal3);
653
654 aPtr += 64;
655 bPtr += 64;
656 }
657
658 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal1);
659 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal2);
660 dotProdVal0 = _mm512_add_ps(dotProdVal0, dotProdVal3);
661
662 __VOLK_ATTR_ALIGNED(64) float dotProductVector[16];
663
664 _mm512_storeu_ps(dotProductVector,
665 dotProdVal0); // Store the results back into the dot product vector
666
667 dotProduct = dotProductVector[0];
668 dotProduct += dotProductVector[1];
669 dotProduct += dotProductVector[2];
670 dotProduct += dotProductVector[3];
671 dotProduct += dotProductVector[4];
672 dotProduct += dotProductVector[5];
673 dotProduct += dotProductVector[6];
674 dotProduct += dotProductVector[7];
675 dotProduct += dotProductVector[8];
676 dotProduct += dotProductVector[9];
677 dotProduct += dotProductVector[10];
678 dotProduct += dotProductVector[11];
679 dotProduct += dotProductVector[12];
680 dotProduct += dotProductVector[13];
681 dotProduct += dotProductVector[14];
682 dotProduct += dotProductVector[15];
683
684 number = sixtyfourthPoints * 64;
685 for (; number < num_points; number++) {
686 dotProduct += ((*aPtr++) * (*bPtr++));
687 }
688
689 *result = (short)dotProduct;
690}
691
692#endif /*LV_HAVE_AVX512F*/
693
694
695#endif /*INCLUDED_volk_32f_x2_dot_prod_16i_H*/
static void volk_32f_x2_dot_prod_16i_u_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:394
static void volk_32f_x2_dot_prod_16i_generic(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:68
static void volk_32f_x2_dot_prod_16i_a_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:237
static void volk_32f_x2_dot_prod_16i_u_avx(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:540
static void volk_32f_x2_dot_prod_16i_a_sse(int16_t *result, const float *input, const float *taps, unsigned int num_points)
Definition: volk_32f_x2_dot_prod_16i.h:91
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56