Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_s32fc_multiply_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
76#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
77#define INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H
78
79#include <float.h>
80#include <inttypes.h>
81#include <stdio.h>
82#include <volk/volk_complex.h>
83
84#if LV_HAVE_AVX && LV_HAVE_FMA
85#include <immintrin.h>
86
87static inline void volk_32fc_s32fc_multiply_32fc_u_avx_fma(lv_32fc_t* cVector,
88 const lv_32fc_t* aVector,
89 const lv_32fc_t scalar,
90 unsigned int num_points)
91{
92 unsigned int number = 0;
93 unsigned int i = 0;
94 const unsigned int quarterPoints = num_points / 4;
95 unsigned int isodd = num_points & 3;
96 __m256 x, yl, yh, z, tmp1, tmp2;
97 lv_32fc_t* c = cVector;
98 const lv_32fc_t* a = aVector;
99
100 // Set up constant scalar vector
101 yl = _mm256_set1_ps(lv_creal(scalar));
102 yh = _mm256_set1_ps(lv_cimag(scalar));
103
104 for (; number < quarterPoints; number++) {
105 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
106
107 tmp1 = x;
108
109 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
110
111 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
112
113 z = _mm256_fmaddsub_ps(
114 tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
115
116 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
117
118 a += 4;
119 c += 4;
120 }
121
122 for (i = num_points - isodd; i < num_points; i++) {
123 *c++ = (*a++) * scalar;
124 }
125}
126#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
127
128#ifdef LV_HAVE_AVX
129#include <immintrin.h>
130
132 const lv_32fc_t* aVector,
133 const lv_32fc_t scalar,
134 unsigned int num_points)
135{
136 unsigned int number = 0;
137 unsigned int i = 0;
138 const unsigned int quarterPoints = num_points / 4;
139 unsigned int isodd = num_points & 3;
140 __m256 x, yl, yh, z, tmp1, tmp2;
141 lv_32fc_t* c = cVector;
142 const lv_32fc_t* a = aVector;
143
144 // Set up constant scalar vector
145 yl = _mm256_set1_ps(lv_creal(scalar));
146 yh = _mm256_set1_ps(lv_cimag(scalar));
147
148 for (; number < quarterPoints; number++) {
149 x = _mm256_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
150
151 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
152
153 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
154
155 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
156
157 z = _mm256_addsub_ps(tmp1,
158 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
159
160 _mm256_storeu_ps((float*)c, z); // Store the results back into the C container
161
162 a += 4;
163 c += 4;
164 }
165
166 for (i = num_points - isodd; i < num_points; i++) {
167 *c++ = (*a++) * scalar;
168 }
169}
170#endif /* LV_HAVE_AVX */
171
172#ifdef LV_HAVE_SSE3
173#include <pmmintrin.h>
174
176 const lv_32fc_t* aVector,
177 const lv_32fc_t scalar,
178 unsigned int num_points)
179{
180 unsigned int number = 0;
181 const unsigned int halfPoints = num_points / 2;
182
183 __m128 x, yl, yh, z, tmp1, tmp2;
184 lv_32fc_t* c = cVector;
185 const lv_32fc_t* a = aVector;
186
187 // Set up constant scalar vector
188 yl = _mm_set_ps1(lv_creal(scalar));
189 yh = _mm_set_ps1(lv_cimag(scalar));
190
191 for (; number < halfPoints; number++) {
192
193 x = _mm_loadu_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
194
195 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
196
197 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
198
199 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
200
201 z = _mm_addsub_ps(tmp1,
202 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
203
204 _mm_storeu_ps((float*)c, z); // Store the results back into the C container
205
206 a += 2;
207 c += 2;
208 }
209
210 if ((num_points % 2) != 0) {
211 *c = (*a) * scalar;
212 }
213}
#endif /* LV_HAVE_SSE3 */
215
216#ifdef LV_HAVE_GENERIC
217
219 const lv_32fc_t* aVector,
220 const lv_32fc_t scalar,
221 unsigned int num_points)
222{
223 lv_32fc_t* cPtr = cVector;
224 const lv_32fc_t* aPtr = aVector;
225 unsigned int number = num_points;
226
227 // unwrap loop
228 while (number >= 8) {
229 *cPtr++ = (*aPtr++) * scalar;
230 *cPtr++ = (*aPtr++) * scalar;
231 *cPtr++ = (*aPtr++) * scalar;
232 *cPtr++ = (*aPtr++) * scalar;
233 *cPtr++ = (*aPtr++) * scalar;
234 *cPtr++ = (*aPtr++) * scalar;
235 *cPtr++ = (*aPtr++) * scalar;
236 *cPtr++ = (*aPtr++) * scalar;
237 number -= 8;
238 }
239
240 // clean up any remaining
241 while (number-- > 0)
242 *cPtr++ = *aPtr++ * scalar;
243}
244#endif /* LV_HAVE_GENERIC */
245
246
#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_u_H */
248#ifndef INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
249#define INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H
250
251#include <float.h>
252#include <inttypes.h>
253#include <stdio.h>
254#include <volk/volk_complex.h>
255
256#if LV_HAVE_AVX && LV_HAVE_FMA
257#include <immintrin.h>
258
259static inline void volk_32fc_s32fc_multiply_32fc_a_avx_fma(lv_32fc_t* cVector,
260 const lv_32fc_t* aVector,
261 const lv_32fc_t scalar,
262 unsigned int num_points)
263{
264 unsigned int number = 0;
265 unsigned int i = 0;
266 const unsigned int quarterPoints = num_points / 4;
267 unsigned int isodd = num_points & 3;
268 __m256 x, yl, yh, z, tmp1, tmp2;
269 lv_32fc_t* c = cVector;
270 const lv_32fc_t* a = aVector;
271
272 // Set up constant scalar vector
273 yl = _mm256_set1_ps(lv_creal(scalar));
274 yh = _mm256_set1_ps(lv_cimag(scalar));
275
276 for (; number < quarterPoints; number++) {
277 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
278
279 tmp1 = x;
280
281 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
282
283 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
284
285 z = _mm256_fmaddsub_ps(
286 tmp1, yl, tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
287
288 _mm256_store_ps((float*)c, z); // Store the results back into the C container
289
290 a += 4;
291 c += 4;
292 }
293
294 for (i = num_points - isodd; i < num_points; i++) {
295 *c++ = (*a++) * scalar;
296 }
297}
298#endif /* LV_HAVE_AVX && LV_HAVE_FMA */
299
300
301#ifdef LV_HAVE_AVX
302#include <immintrin.h>
303
305 const lv_32fc_t* aVector,
306 const lv_32fc_t scalar,
307 unsigned int num_points)
308{
309 unsigned int number = 0;
310 unsigned int i = 0;
311 const unsigned int quarterPoints = num_points / 4;
312 unsigned int isodd = num_points & 3;
313 __m256 x, yl, yh, z, tmp1, tmp2;
314 lv_32fc_t* c = cVector;
315 const lv_32fc_t* a = aVector;
316
317 // Set up constant scalar vector
318 yl = _mm256_set1_ps(lv_creal(scalar));
319 yh = _mm256_set1_ps(lv_cimag(scalar));
320
321 for (; number < quarterPoints; number++) {
322 x = _mm256_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
323
324 tmp1 = _mm256_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
325
326 x = _mm256_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
327
328 tmp2 = _mm256_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
329
330 z = _mm256_addsub_ps(tmp1,
331 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
332
333 _mm256_store_ps((float*)c, z); // Store the results back into the C container
334
335 a += 4;
336 c += 4;
337 }
338
339 for (i = num_points - isodd; i < num_points; i++) {
340 *c++ = (*a++) * scalar;
341 }
342}
343#endif /* LV_HAVE_AVX */
344
345#ifdef LV_HAVE_SSE3
346#include <pmmintrin.h>
347
349 const lv_32fc_t* aVector,
350 const lv_32fc_t scalar,
351 unsigned int num_points)
352{
353 unsigned int number = 0;
354 const unsigned int halfPoints = num_points / 2;
355
356 __m128 x, yl, yh, z, tmp1, tmp2;
357 lv_32fc_t* c = cVector;
358 const lv_32fc_t* a = aVector;
359
360 // Set up constant scalar vector
361 yl = _mm_set_ps1(lv_creal(scalar));
362 yh = _mm_set_ps1(lv_cimag(scalar));
363
364 for (; number < halfPoints; number++) {
365
366 x = _mm_load_ps((float*)a); // Load the ar + ai, br + bi as ar,ai,br,bi
367
368 tmp1 = _mm_mul_ps(x, yl); // tmp1 = ar*cr,ai*cr,br*dr,bi*dr
369
370 x = _mm_shuffle_ps(x, x, 0xB1); // Re-arrange x to be ai,ar,bi,br
371
372 tmp2 = _mm_mul_ps(x, yh); // tmp2 = ai*ci,ar*ci,bi*di,br*di
373
374 z = _mm_addsub_ps(tmp1,
375 tmp2); // ar*cr-ai*ci, ai*cr+ar*ci, br*dr-bi*di, bi*dr+br*di
376
377 _mm_store_ps((float*)c, z); // Store the results back into the C container
378
379 a += 2;
380 c += 2;
381 }
382
383 if ((num_points % 2) != 0) {
384 *c = (*a) * scalar;
385 }
386}
#endif /* LV_HAVE_SSE3 */
388
389#ifdef LV_HAVE_NEON
390#include <arm_neon.h>
391
/*!
 * \brief Multiplies each complex element of \p aVector by the complex
 *        \p scalar and writes the products to \p cVector.
 *        NEON implementation using de-interleaved loads (4 complex floats
 *        per iteration).
 *
 * \param cVector    output buffer of \p num_points complex floats
 * \param aVector    input buffer of \p num_points complex floats
 * \param scalar     complex scalar multiplier
 * \param num_points number of complex values to process
 */
static inline void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t* cVector,
                                                      const lv_32fc_t* aVector,
                                                      const lv_32fc_t scalar,
                                                      unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    unsigned int number = num_points;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t a_val, scalar_val;
    float32x4x2_t tmp_imag;

    // Duplicate the scalar's real (val[0]) and imaginary (val[1]) parts.
    scalar_val.val[0] = vld1q_dup_f32((const float*)&scalar);
    scalar_val.val[1] = vld1q_dup_f32(((const float*)&scalar) + 1);
    for (number = 0; number < quarter_points; ++number) {
        // vld2q de-interleaves: val[0] holds reals, val[1] holds imaginaries.
        a_val = vld2q_f32((float*)aPtr);
        tmp_imag.val[1] = vmulq_f32(a_val.val[1], scalar_val.val[0]);
        tmp_imag.val[0] = vmulq_f32(a_val.val[0], scalar_val.val[0]);

        // imag = ai*sr + ar*si ; real = ar*sr - ai*si
        tmp_imag.val[1] = vmlaq_f32(tmp_imag.val[1], a_val.val[0], scalar_val.val[1]);
        tmp_imag.val[0] = vmlsq_f32(tmp_imag.val[0], a_val.val[1], scalar_val.val[1]);

        // vst2q re-interleaves real/imag back into complex layout.
        vst2q_f32((float*)cPtr, tmp_imag);
        aPtr += 4;
        cPtr += 4;
    }

    // Finish the 0-3 leftover elements with plain complex arithmetic.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = *aPtr++ * scalar;
    }
}
424#endif /* LV_HAVE_NEON */
425
426#ifdef LV_HAVE_GENERIC
427
429 const lv_32fc_t* aVector,
430 const lv_32fc_t scalar,
431 unsigned int num_points)
432{
433 lv_32fc_t* cPtr = cVector;
434 const lv_32fc_t* aPtr = aVector;
435 unsigned int number = num_points;
436
437 // unwrap loop
438 while (number >= 8) {
439 *cPtr++ = (*aPtr++) * scalar;
440 *cPtr++ = (*aPtr++) * scalar;
441 *cPtr++ = (*aPtr++) * scalar;
442 *cPtr++ = (*aPtr++) * scalar;
443 *cPtr++ = (*aPtr++) * scalar;
444 *cPtr++ = (*aPtr++) * scalar;
445 *cPtr++ = (*aPtr++) * scalar;
446 *cPtr++ = (*aPtr++) * scalar;
447 number -= 8;
448 }
449
450 // clean up any remaining
451 while (number-- > 0)
452 *cPtr++ = *aPtr++ * scalar;
453}
454#endif /* LV_HAVE_GENERIC */
455
#endif /* INCLUDED_volk_32fc_s32fc_multiply_32fc_a_H */
static void volk_32fc_s32fc_multiply_32fc_a_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:428
static void volk_32fc_s32fc_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:218
static void volk_32fc_s32fc_multiply_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:131
static void volk_32fc_s32fc_multiply_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:175
static void volk_32fc_s32fc_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:304
static void volk_32fc_s32fc_multiply_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:348
static void volk_32fc_s32fc_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t scalar, unsigned int num_points)
Definition: volk_32fc_s32fc_multiply_32fc.h:392
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25