57#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
58#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_a_H
/*
 * AVX2 variant using aligned loads and stores.
 *
 * The vector loop computes a * conj(b) * (1 / scalar) for 8 complex points
 * per iteration, reading both inputs as interleaved (real, imag) signed
 * 8-bit values and writing interleaved (real, imag) floats. A scalar tail
 * loop then handles the remaining num_points % 8 points.
 *
 * NOTE(review): this listing appears to be an extraction of a rendered
 * source page. Every line carries a stray leading line number, and several
 * source lines are absent: nothing visible declares aVector, bVector or
 * scalar, declares or advances the a / b / c cursors, defines temp, or
 * closes the loops and the function. Restore those from the original file
 * before attempting to compile.
 */
68volk_8ic_x2_s32f_multiply_conjugate_32fc_a_avx2(
lv_32fc_t* cVector,
72 unsigned int num_points)
74 unsigned int number = 0;
// Count of full 8-point groups processed by the vector loop.
75 const unsigned int oneEigthPoints = num_points / 8;
77 __m256i x, y, realz, imagz;
78 __m256 ret, retlo, rethi;
// Multipliers for _mm256_sign_epi16. _mm256_set_epi16 lists lanes from the
// highest down, so even (real) 16-bit lanes get +1 and odd (imaginary)
// lanes get -1.
82 __m256i conjugateSign =
83 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Reciprocal is computed once so the loop multiplies instead of dividing.
// 1.0 is a double literal: the division happens in double, then narrows.
85 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
87 for (; number < oneEigthPoints; number++) {
// Aligned 16-byte loads (8 complex int8 points each), sign-extended to
// sixteen 16-bit lanes.
89 x = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)a));
90 y = _mm256_cvtepi8_epi16(_mm_load_si128((__m128i*)b));
// madd multiplies adjacent 16-bit pairs and adds them into one 32-bit lane:
// ar*br + ai*bi, the real part of a * conj(b).
93 realz = _mm256_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
96 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two members of every 16-bit pair: (br, -bi) -> (-bi, br).
99 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
100 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
103 imagz = _mm256_madd_epi16(x, y);
// Interleave (real, imag) 32-bit results, convert to float and scale.
// Unpack works within each 128-bit half, so retlo holds points 0,1,4,5 and
// rethi holds points 2,3,6,7.
106 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
109 retlo = _mm256_mul_ps(retlo, invScalar);
112 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
115 rethi = _mm256_mul_ps(rethi, invScalar);
// Low 128-bit halves of retlo and rethi -> points 0..3 in order.
// (0b... binary literals are a compiler extension prior to C23.)
117 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
118 _mm256_store_ps((
float*)c, ret);
// High 128-bit halves of retlo and rethi -> points 4..7 in order.
// NOTE(review): this store targets the same `c` expression as the one
// above and no advance of c, a or b is visible in this listing; confirm
// the pointer increments exist in the original source.
121 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
122 _mm256_store_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
129 number = oneEigthPoints * 8;
130 float* cFloatPtr = (
float*)&cVector[number];
131 int8_t* a8Ptr = (int8_t*)&aVector[number];
132 int8_t* b8Ptr = (int8_t*)&bVector[number];
133 for (; number < num_points; number++) {
// Each input point is consumed as two consecutive int8 values
// (real first, then imaginary).
134 float aReal = (float)*a8Ptr++;
135 float aImag = (float)*a8Ptr++;
137 float bReal = (float)*b8Ptr++;
138 float bImag = (float)*b8Ptr++;
// NOTE(review): `temp` is not defined in the visible lines; presumably it
// is a * conj(b) built from the four floats above. The tail divides by
// scalar whereas the vector loop multiplies by the reciprocal, so the two
// paths may differ in the last bit of rounding.
142 *cFloatPtr++ =
lv_creal(temp) / scalar;
143 *cFloatPtr++ =
lv_cimag(temp) / scalar;
150#include <smmintrin.h>
/*
 * SSE4.1 variant using aligned stores.
 *
 * The vector loop computes a * conj(b) * (1 / scalar) for 4 complex points
 * per iteration, reading both inputs as interleaved (real, imag) signed
 * 8-bit values and writing interleaved (real, imag) floats. A scalar tail
 * loop then handles the remaining num_points % 4 points.
 *
 * NOTE(review): this listing appears to be an extraction of a rendered
 * source page. Every line carries a stray leading line number, and several
 * source lines are absent: nothing visible declares aVector, bVector,
 * scalar or ret, declares or advances the a / b / c cursors, defines temp,
 * or closes the loops and the function. Restore those from the original
 * file before attempting to compile.
 */
153volk_8ic_x2_s32f_multiply_conjugate_32fc_a_sse4_1(
lv_32fc_t* cVector,
157 unsigned int num_points)
159 unsigned int number = 0;
// Count of full 4-point groups processed by the vector loop.
160 const unsigned int quarterPoints = num_points / 4;
162 __m128i x, y, realz, imagz;
// Multipliers for _mm_sign_epi16. _mm_set_epi16 lists lanes from the
// highest down, so even (real) 16-bit lanes get +1 and odd (imaginary)
// lanes get -1.
167 __m128i conjugateSign = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
// Reciprocal is computed once so the loop multiplies instead of dividing.
// 1.0 is a double literal: the division happens in double, then narrows.
169 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
171 for (; number < quarterPoints; number++) {
// Load 8 bytes (4 complex int8 points) from each input and sign-extend
// them to eight 16-bit lanes.
173 x = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)a));
174 y = _mm_cvtepi8_epi16(_mm_loadl_epi64((__m128i*)b));
// madd multiplies adjacent 16-bit pairs and adds them into one 32-bit lane:
// ar*br + ai*bi, the real part of a * conj(b).
177 realz = _mm_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
180 y = _mm_sign_epi16(y, conjugateSign);
// Swap the two members of every 16-bit pair: (br, -bi) -> (-bi, br).
183 y = _mm_shufflehi_epi16(_mm_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
184 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
187 imagz = _mm_madd_epi16(x, y);
// Points 0 and 1: interleave (real, imag), convert to float, scale, store.
190 ret = _mm_cvtepi32_ps(_mm_unpacklo_epi32(realz, imagz));
193 ret = _mm_mul_ps(ret, invScalar);
196 _mm_store_ps((
float*)c, ret);
// Points 2 and 3, same treatment.
// NOTE(review): this store targets the same `c` expression as the one
// above and no advance of c, a or b is visible in this listing; confirm
// the pointer increments exist in the original source.
200 ret = _mm_cvtepi32_ps(_mm_unpackhi_epi32(realz, imagz));
203 ret = _mm_mul_ps(ret, invScalar);
206 _mm_store_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
213 number = quarterPoints * 4;
214 float* cFloatPtr = (
float*)&cVector[number];
215 int8_t* a8Ptr = (int8_t*)&aVector[number];
216 int8_t* b8Ptr = (int8_t*)&bVector[number];
217 for (; number < num_points; number++) {
// Each input point is consumed as two consecutive int8 values
// (real first, then imaginary).
218 float aReal = (float)*a8Ptr++;
219 float aImag = (float)*a8Ptr++;
221 float bReal = (float)*b8Ptr++;
222 float bImag = (float)*b8Ptr++;
// NOTE(review): `temp` is not defined in the visible lines; presumably it
// is a * conj(b) built from the four floats above. The tail divides by
// scalar whereas the vector loop multiplies by the reciprocal, so the two
// paths may differ in the last bit of rounding.
226 *cFloatPtr++ =
lv_creal(temp) / scalar;
227 *cFloatPtr++ =
lv_cimag(temp) / scalar;
233#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar variant: walks num_points complex points one at a time,
 * reading each input point as two consecutive int8 values and writing two
 * floats (real, imag) per point, each scaled by 1 / scalar.
 *
 * NOTE(review): this listing appears to be an extraction of a rendered
 * source page. Every line carries a stray leading line number, and the
 * start of the signature (function name and the cVector / aVector /
 * bVector / scalar parameters), the definition of temp, and the closing
 * braces are not visible here. Restore them from the original file before
 * attempting to compile.
 */
240 unsigned int num_points)
242 unsigned int number = 0;
// The output is addressed as a flat float array: two floats per point.
243 float* cPtr = (
float*)cVector;
// Reciprocal is computed once so the loop multiplies instead of dividing.
// 1.0 is a double literal: the division happens in double, then narrows.
244 const float invScalar = 1.0 / scalar;
245 int8_t* a8Ptr = (int8_t*)aVector;
246 int8_t* b8Ptr = (int8_t*)bVector;
247 for (number = 0; number < num_points; number++) {
// Real component first, then imaginary, for each input.
248 float aReal = (float)*a8Ptr++;
249 float aImag = (float)*a8Ptr++;
251 float bReal = (float)*b8Ptr++;
252 float bImag = (float)*b8Ptr++;
// NOTE(review): `temp` is not defined in the visible lines; presumably it
// is a * conj(b) built from the four floats above.
256 *cPtr++ = (
lv_creal(temp) * invScalar);
257 *cPtr++ = (
lv_cimag(temp) * invScalar);
265#ifndef INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
266#define INCLUDED_volk_8ic_x2_s32f_multiply_conjugate_32fc_u_H
273#include <immintrin.h>
/*
 * AVX2 variant using unaligned loads and stores.
 *
 * The vector loop computes a * conj(b) * (1 / scalar) for 8 complex points
 * per iteration, reading both inputs as interleaved (real, imag) signed
 * 8-bit values and writing interleaved (real, imag) floats. A scalar tail
 * loop then handles the remaining num_points % 8 points.
 *
 * NOTE(review): this listing appears to be an extraction of a rendered
 * source page. Every line carries a stray leading line number, and several
 * source lines are absent: nothing visible declares aVector, bVector or
 * scalar, declares or advances the a / b / c cursors, defines temp, or
 * closes the loops and the function. Restore those from the original file
 * before attempting to compile.
 */
276volk_8ic_x2_s32f_multiply_conjugate_32fc_u_avx2(
lv_32fc_t* cVector,
280 unsigned int num_points)
282 unsigned int number = 0;
// Count of full 8-point groups processed by the vector loop.
283 const unsigned int oneEigthPoints = num_points / 8;
285 __m256i x, y, realz, imagz;
286 __m256 ret, retlo, rethi;
// Multipliers for _mm256_sign_epi16. _mm256_set_epi16 lists lanes from the
// highest down, so even (real) 16-bit lanes get +1 and odd (imaginary)
// lanes get -1.
290 __m256i conjugateSign =
291 _mm256_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1);
// Reciprocal is computed once so the loop multiplies instead of dividing.
// 1.0 is a double literal: the division happens in double, then narrows.
293 __m256 invScalar = _mm256_set1_ps(1.0 / scalar);
295 for (; number < oneEigthPoints; number++) {
// Unaligned 16-byte loads (8 complex int8 points each), sign-extended to
// sixteen 16-bit lanes.
297 x = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)a));
298 y = _mm256_cvtepi8_epi16(_mm_loadu_si128((__m128i*)b));
// madd multiplies adjacent 16-bit pairs and adds them into one 32-bit lane:
// ar*br + ai*bi, the real part of a * conj(b).
301 realz = _mm256_madd_epi16(x, y);
// Negate the imaginary lanes of b: (br, bi) -> (br, -bi).
304 y = _mm256_sign_epi16(y, conjugateSign);
// Swap the two members of every 16-bit pair: (br, -bi) -> (-bi, br).
307 y = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(y, _MM_SHUFFLE(2, 3, 0, 1)),
308 _MM_SHUFFLE(2, 3, 0, 1));
// ar*(-bi) + ai*br, the imaginary part of a * conj(b).
311 imagz = _mm256_madd_epi16(x, y);
// Interleave (real, imag) 32-bit results, convert to float and scale.
// Unpack works within each 128-bit half, so retlo holds points 0,1,4,5 and
// rethi holds points 2,3,6,7.
314 retlo = _mm256_cvtepi32_ps(_mm256_unpacklo_epi32(realz, imagz));
317 retlo = _mm256_mul_ps(retlo, invScalar);
320 rethi = _mm256_cvtepi32_ps(_mm256_unpackhi_epi32(realz, imagz));
323 rethi = _mm256_mul_ps(rethi, invScalar);
// Low 128-bit halves of retlo and rethi -> points 0..3 in order.
// (0b... binary literals are a compiler extension prior to C23.)
325 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00100000);
326 _mm256_storeu_ps((
float*)c, ret);
// High 128-bit halves of retlo and rethi -> points 4..7 in order.
// NOTE(review): this store targets the same `c` expression as the one
// above and no advance of c, a or b is visible in this listing; confirm
// the pointer increments exist in the original source.
329 ret = _mm256_permute2f128_ps(retlo, rethi, 0b00110001);
330 _mm256_storeu_ps((
float*)c, ret);
// Scalar tail: resume at the first point the vector loop did not cover.
337 number = oneEigthPoints * 8;
338 float* cFloatPtr = (
float*)&cVector[number];
339 int8_t* a8Ptr = (int8_t*)&aVector[number];
340 int8_t* b8Ptr = (int8_t*)&bVector[number];
341 for (; number < num_points; number++) {
// Each input point is consumed as two consecutive int8 values
// (real first, then imaginary).
342 float aReal = (float)*a8Ptr++;
343 float aImag = (float)*a8Ptr++;
345 float bReal = (float)*b8Ptr++;
346 float bImag = (float)*b8Ptr++;
// NOTE(review): `temp` is not defined in the visible lines; presumably it
// is a * conj(b) built from the four floats above. The tail divides by
// scalar whereas the vector loop multiplies by the reciprocal, so the two
// paths may differ in the last bit of rounding.
350 *cFloatPtr++ =
lv_creal(temp) / scalar;
351 *cFloatPtr++ =
lv_cimag(temp) / scalar;
static void volk_8ic_x2_s32f_multiply_conjugate_32fc_generic(lv_32fc_t *cVector, const lv_8sc_t *aVector, const lv_8sc_t *bVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_x2_s32f_multiply_conjugate_32fc.h:236
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_cmake(r, i)
Definition: volk_complex.h:68
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65