#ifndef INCLUDED_volk_16ic_x2_dot_prod_16ic_H
#define INCLUDED_volk_16ic_x2_dot_prod_16ic_H

#include <volk/saturation_arithmetic.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
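
/*
 * Computes the dot product of two vectors of 16-bit-integer complex samples
 * (lv_16sc_t), accumulating with 16-bit saturation arithmetic.  Every SIMD
 * branch below evaluates the complex product
 *   (a.r + j*a.i) * (b.r + j*b.i) = (a.r*b.r - a.i*b.i) + j*(a.r*b.i + a.i*b.r)
 * for several samples per iteration and folds the partial sums at the end.
 *
 * Illustrative dispatcher prototype (VOLK resolves it at runtime to one of
 * the implementations in this header):
 *
 *   void volk_16ic_x2_dot_prod_16ic(lv_16sc_t* result,
 *                                   const lv_16sc_t* in_a,
 *                                   const lv_16sc_t* in_b,
 *                                   unsigned int num_points);
 */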
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_dot_prod_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    result[0] = lv_cmake((int16_t)0, (int16_t)0);
    for (n = 0; n < num_points; n++) {
        // Complex multiply, then accumulate real and imaginary parts with saturation
        lv_16sc_t tmp = in_a[n] * in_b[n];
        result[0] = lv_cmake(sat_adds16i(lv_creal(result[0]), lv_creal(tmp)),
                             sat_adds16i(lv_cimag(result[0]), lv_cimag(tmp)));
    }
}

#endif /* LV_HAVE_GENERIC */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        // Byte masks that keep only the imaginary / real 16-bit lanes
        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // Load 4 complex samples per register: [a3.i, a3.r, ..., a0.i, a0.r]
            a = _mm_load_si128((__m128i*)_in_a);
            b = _mm_load_si128((__m128i*)_in_b);

            c = _mm_mullo_epi16(a, b);  // a.i*b.i and a.r*b.r, lane by lane

            c_sr = _mm_srli_si128(c, 2);     // bring a.i*b.i down onto the real lanes
            real = _mm_subs_epi16(c, c_sr);  // a.r*b.r - a.i*b.i

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl);  // a.i*b.r on the imaginary lanes
            imag2 = _mm_mullo_epi16(b, a_sl);  // b.i*a.r on the imaginary lanes

            imag = _mm_adds_epi16(imag1, imag2);  // saturated addition

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        a = _mm_or_si128(realcacc, imagcacc);

        _mm_store_si128((__m128i*)dotProductVector, a);  // store the partial sums

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int sse_iters = num_points / 4;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (sse_iters > 0) {
        __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t dotProductVector[4];

        realcacc = _mm_setzero_si128();
        imagcacc = _mm_setzero_si128();

        mask_imag = _mm_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < sse_iters; number++) {
            // Unaligned loads; otherwise identical to the aligned kernel above
            a = _mm_loadu_si128((__m128i*)_in_a);
            b = _mm_loadu_si128((__m128i*)_in_b);

            c = _mm_mullo_epi16(a, b);

            c_sr = _mm_srli_si128(c, 2);
            real = _mm_subs_epi16(c, c_sr);

            b_sl = _mm_slli_si128(b, 2);
            a_sl = _mm_slli_si128(a, 2);

            imag1 = _mm_mullo_epi16(a, b_sl);
            imag2 = _mm_mullo_epi16(b, a_sl);

            imag = _mm_adds_epi16(imag1, imag2);

            realcacc = _mm_adds_epi16(realcacc, real);
            imagcacc = _mm_adds_epi16(imagcacc, imag);

            _in_a += 4;
            _in_b += 4;
        }

        realcacc = _mm_and_si128(realcacc, mask_real);
        imagcacc = _mm_and_si128(imagcacc, mask_imag);

        result = _mm_or_si128(realcacc, imagcacc);

        _mm_storeu_si128((__m128i*)dotProductVector, result);

        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 4); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_SSE2 */
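
/*
 * The AVX2 kernels below apply the same real/imaginary masking scheme as the
 * SSE2 kernels, widened to 256-bit registers, so each iteration consumes
 * 8 complex samples instead of 4.
 */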
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        // Same lane masks as the SSE2 kernels, extended to 256 bits
        mask_imag = _mm256_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            a = _mm256_loadu_si256((__m256i*)_in_a);
            b = _mm256_loadu_si256((__m256i*)_in_b);

            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2);
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2);

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_storeu_si256((__m256i*)dotProductVector, result);

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_dot_prod_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

    const unsigned int avx_iters = num_points / 8;
    unsigned int number;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    if (avx_iters > 0) {
        __m256i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
            realcacc, imagcacc, result;
        __VOLK_ATTR_ALIGNED(32) lv_16sc_t dotProductVector[8];

        realcacc = _mm256_setzero_si256();
        imagcacc = _mm256_setzero_si256();

        mask_imag = _mm256_set_epi8(
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
            0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
        mask_real = _mm256_set_epi8(
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
            0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

        for (number = 0; number < avx_iters; number++) {
            // Aligned loads; otherwise identical to the unaligned AVX2 kernel above
            a = _mm256_load_si256((__m256i*)_in_a);
            b = _mm256_load_si256((__m256i*)_in_b);

            c = _mm256_mullo_epi16(a, b);

            c_sr = _mm256_srli_si256(c, 2);
            real = _mm256_subs_epi16(c, c_sr);

            b_sl = _mm256_slli_si256(b, 2);
            a_sl = _mm256_slli_si256(a, 2);

            imag1 = _mm256_mullo_epi16(a, b_sl);
            imag2 = _mm256_mullo_epi16(b, a_sl);

            imag = _mm256_adds_epi16(imag1, imag2);

            realcacc = _mm256_adds_epi16(realcacc, real);
            imagcacc = _mm256_adds_epi16(imagcacc, imag);

            _in_a += 8;
            _in_b += 8;
        }

        realcacc = _mm256_and_si256(realcacc, mask_real);
        imagcacc = _mm256_and_si256(imagcacc, mask_imag);

        result = _mm256_or_si256(realcacc, imagcacc);

        _mm256_store_si256((__m256i*)dotProductVector, result);

        for (number = 0; number < 8; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(dotProductVector[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(dotProductVector[number])));
        }
    }

    for (number = 0; number < (num_points % 8); ++number) {
        lv_16sc_t tmp = (*_in_a++) * (*_in_b++);
        dotProduct = lv_cmake(sat_adds16i(lv_creal(dotProduct), lv_creal(tmp)),
                              sat_adds16i(lv_cimag(dotProduct), lv_cimag(tmp)));
    }

    *_out = dotProduct;
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    *out = lv_cmake((int16_t)0, (int16_t)0);

    if (quarter_points > 0) {
        // vld2 de-interleaves: val[0] holds the real parts, val[1] the imaginary parts
        int16x4x2_t a_val, b_val, c_val, accumulator;
        int16x4x2_t tmp_real, tmp_imag;
        __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];
        lv_16sc_t dotProduct = lv_cmake((int16_t)0, (int16_t)0);

        accumulator.val[0] = vdup_n_s16(0);
        accumulator.val[1] = vdup_n_s16(0);

        for (number = 0; number < quarter_points; ++number) {
            a_val = vld2_s16((int16_t*)a_ptr);
            b_val = vld2_s16((int16_t*)b_ptr);

            // real part: a.r*b.r - a.i*b.i
            tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
            tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

            // imaginary part: a.r*b.i + a.i*b.r
            tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
            tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

            c_val.val[0] = vqsub_s16(tmp_real.val[0], tmp_real.val[1]);
            c_val.val[1] = vqadd_s16(tmp_imag.val[0], tmp_imag.val[1]);

            accumulator.val[0] = vqadd_s16(accumulator.val[0], c_val.val[0]);
            accumulator.val[1] = vqadd_s16(accumulator.val[1], c_val.val[1]);

            a_ptr += 4;
            b_ptr += 4;
        }

        vst2_s16((int16_t*)accum_result, accumulator);
        for (number = 0; number < 4; ++number) {
            dotProduct = lv_cmake(
                sat_adds16i(lv_creal(dotProduct), lv_creal(accum_result[number])),
                sat_adds16i(lv_cimag(dotProduct), lv_cimag(accum_result[number])));
        }
        *out = dotProduct;
    }

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
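
/*
 * The _vma variants below fold the complex multiply into NEON
 * multiply-accumulate/subtract instructions (vmla_s16 / vmls_s16); the
 * _optvma variant additionally keeps two accumulator pairs so consecutive
 * multiply-accumulates do not depend on each other's results.
 */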
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_vma(lv_16sc_t* out,
                                                       const lv_16sc_t* in_a,
                                                       const lv_16sc_t* in_b,
                                                       unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // vld2 de-interleaves: val[0] holds the real parts, val[1] the imaginary parts
    int16x4x2_t a_val, b_val, accumulator;
    int16x4x2_t tmp;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];

    accumulator.val[0] = vdup_n_s16(0);
    accumulator.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);

        tmp.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        tmp.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // multiply-subtract / multiply-accumulate complete the complex product
        tmp.val[0] = vmls_s16(tmp.val[0], a_val.val[1], b_val.val[1]);  // a.r*b.r - a.i*b.i
        tmp.val[1] = vmla_s16(tmp.val[1], a_val.val[0], b_val.val[1]);  // a.i*b.r + a.r*b.i

        accumulator.val[0] = vqadd_s16(accumulator.val[0], tmp.val[0]);
        accumulator.val[1] = vqadd_s16(accumulator.val[1], tmp.val[1]);

        a_ptr += 4;
        b_ptr += 4;
    }

    vst2_s16((int16_t*)accum_result, accumulator);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_dot_prod_16ic_neon_optvma(lv_16sc_t* out,
                                                          const lv_16sc_t* in_a,
                                                          const lv_16sc_t* in_b,
                                                          unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    unsigned int number;

    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    // vld2 de-interleaves: val[0] holds the real parts, val[1] the imaginary parts
    int16x4x2_t a_val, b_val, accumulator1, accumulator2;
    __VOLK_ATTR_ALIGNED(16) lv_16sc_t accum_result[4];

    accumulator1.val[0] = vdup_n_s16(0);
    accumulator1.val[1] = vdup_n_s16(0);
    accumulator2.val[0] = vdup_n_s16(0);
    accumulator2.val[1] = vdup_n_s16(0);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2_s16((int16_t*)a_ptr);
        b_val = vld2_s16((int16_t*)b_ptr);

        // two accumulator pairs break the inter-instruction data dependencies
        accumulator1.val[0] = vmla_s16(accumulator1.val[0], a_val.val[0], b_val.val[0]);
        accumulator2.val[0] = vmls_s16(accumulator2.val[0], a_val.val[1], b_val.val[1]);
        accumulator1.val[1] = vmla_s16(accumulator1.val[1], a_val.val[0], b_val.val[1]);
        accumulator2.val[1] = vmla_s16(accumulator2.val[1], a_val.val[1], b_val.val[0]);

        a_ptr += 4;
        b_ptr += 4;
    }

    accumulator1.val[0] = vqadd_s16(accumulator1.val[0], accumulator2.val[0]);
    accumulator1.val[1] = vqadd_s16(accumulator1.val[1], accumulator2.val[1]);

    vst2_s16((int16_t*)accum_result, accumulator1);
    *out = accum_result[0] + accum_result[1] + accum_result[2] + accum_result[3];

    // tail case
    for (number = quarter_points * 4; number < num_points; ++number) {
        *out += (*a_ptr++) * (*b_ptr++);
    }
}

#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_16ic_x2_dot_prod_16ic_H */