Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc.h
/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

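/*!
 * \b Overview
 *
 * Conjugates each element of the second input vector, multiplies it by a
 * complex scalar and adds the corresponding element of the first input
 * vector, i.e. c[i] = a[i] + conj(b[i]) * scalar for i in [0, num_points).
 *
 * A minimal usage sketch (buffer names and values are illustrative; it calls
 * the generic kernel defined below directly, whereas applications normally
 * use the volk_32fc_x2_s32fc_multiply_conjugate_add_32fc() dispatcher from
 * volk/volk.h):
 *
 * \code
 *   unsigned int N = 4;
 *   lv_32fc_t a[4], b[4], c[4];
 *   lv_32fc_t scalar = lv_cmake(0.0f, 1.0f); // multiply conj(b) by j
 *
 *   for (unsigned int i = 0; i < N; ++i) {
 *       a[i] = lv_cmake((float)i, 0.0f);
 *       b[i] = lv_cmake(0.0f, (float)i);
 *   }
 *
 *   volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(c, a, b, scalar, N);
 *
 *   for (unsigned int i = 0; i < N; ++i) {
 *       printf("c[%u] = %+.1f %+.1fj\n", i, lv_creal(c[i]), lv_cimag(c[i]));
 *   }
 * \endcode
 */
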
#ifndef INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H
#define INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H

#include <float.h>
#include <inttypes.h>
#include <stdio.h>
#include <volk/volk_complex.h>

#ifdef LV_HAVE_GENERIC

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const lv_32fc_t* bVector,
                                                       const lv_32fc_t scalar,
                                                       unsigned int num_points)
{
    const lv_32fc_t* aPtr = aVector;
    const lv_32fc_t* bPtr = bVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;

    // unrolled loop: process 8 points per iteration
    while (number >= 8) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
        number -= 8;
    }

    // clean up any remaining points
    while (number-- > 0) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_loadu_ps((float*)b);
        y = _mm256_loadu_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_storeu_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // clean up the remaining 1-3 points
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_u_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_loadu_ps((float*)b);
        y = _mm_loadu_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_storeu_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // clean up the last point if num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const lv_32fc_t* bVector,
                                                     const lv_32fc_t scalar,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    unsigned int i = 0;
    const unsigned int quarterPoints = num_points / 4;
    unsigned int isodd = num_points & 3;

    __m256 x, y, s, z;
    lv_32fc_t v_scalar[4] = { scalar, scalar, scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm256_loadu_ps((float*)v_scalar);

    for (; number < quarterPoints; number++) {
        x = _mm256_load_ps((float*)b);
        y = _mm256_load_ps((float*)a);
        z = _mm256_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm256_add_ps(y, z);
        _mm256_store_ps((float*)c, z);

        a += 4;
        b += 4;
        c += 4;
    }

    // clean up the remaining 1-3 points
    for (i = num_points - isodd; i < num_points; i++) {
        *c++ = (*a++) + lv_conj(*b++) * scalar;
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_a_sse3(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t* bVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int halfPoints = num_points / 2;

    __m128 x, y, s, z;
    lv_32fc_t v_scalar[2] = { scalar, scalar };

    const lv_32fc_t* a = aVector;
    const lv_32fc_t* b = bVector;
    lv_32fc_t* c = cVector;

    // Set up constant scalar vector
    s = _mm_loadu_ps((float*)v_scalar);

    for (; number < halfPoints; number++) {
        x = _mm_load_ps((float*)b);
        y = _mm_load_ps((float*)a);
        z = _mm_complexconjugatemul_ps(s, x); // scalar * conj(b)
        z = _mm_add_ps(y, z);
        _mm_store_ps((float*)c, z);

        a += 2;
        b += 2;
        c += 2;
    }

    // clean up the last point if num_points is odd
    if ((num_points % 2) != 0) {
        *c = *a + lv_conj(*b) * scalar;
    }
}
#endif /* LV_HAVE_SSE3 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void
volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const lv_32fc_t* bVector,
                                                    const lv_32fc_t scalar,
                                                    unsigned int num_points)
{
    const lv_32fc_t* bPtr = bVector;
    const lv_32fc_t* aPtr = aVector;
    lv_32fc_t* cPtr = cVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, b_val, c_val, scalar_val;
    float32x4x2_t tmp_val;

    // Broadcast the real and imaginary parts of the scalar
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);

    for (number = 0; number < quarter_points; ++number) {
        a_val = vld2q_f32((float*)aPtr);
        b_val = vld2q_f32((float*)bPtr);
        b_val.val[1] = vnegq_f32(b_val.val[1]); // conjugate b
        __VOLK_PREFETCH(aPtr + 8);
        __VOLK_PREFETCH(bPtr + 8);

        // complex multiply conj(b) * scalar
        tmp_val.val[1] = vmulq_f32(b_val.val[1], scalar_val.val[0]);
        tmp_val.val[0] = vmulq_f32(b_val.val[0], scalar_val.val[0]);

        tmp_val.val[1] = vmlaq_f32(tmp_val.val[1], b_val.val[0], scalar_val.val[1]);
        tmp_val.val[0] = vmlsq_f32(tmp_val.val[0], b_val.val[1], scalar_val.val[1]);

        // add a
        c_val.val[1] = vaddq_f32(a_val.val[1], tmp_val.val[1]);
        c_val.val[0] = vaddq_f32(a_val.val[0], tmp_val.val[0]);

        vst2q_f32((float*)cPtr, c_val);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // clean up any remaining points
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + lv_conj(*bPtr++) * scalar;
    }
}
#endif /* LV_HAVE_NEON */

#endif /* INCLUDED_volk_32fc_x2_s32fc_multiply_conjugate_add_32fc_H */
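
The _u_ AVX and SSE3 kernels above use unaligned loads and stores, while the _a_ variants assume buffers aligned to the machine's SIMD alignment. A short sketch of allocating such buffers with volk_malloc()/volk_get_alignment() from volk/volk.h and letting the dispatcher pick an aligned implementation (buffer names and the vector length are illustrative):

    #include <volk/volk.h>

    unsigned int N = 1024;
    size_t alignment = volk_get_alignment();
    lv_32fc_t* a = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* b = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t* c = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t) * N, alignment);
    lv_32fc_t scalar = lv_cmake(0.3f, -0.1f);

    /* ... fill a and b ... */
    volk_32fc_x2_s32fc_multiply_conjugate_add_32fc(c, a, b, scalar, N);

    volk_free(a);
    volk_free(b);
    volk_free(c);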