Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014, 2019 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
79#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
80#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H
81
82#include <volk/volk_complex.h>
83
84
85static inline void calculate_scaled_distances(float* target,
86 const lv_32fc_t symbol,
87 const lv_32fc_t* points,
88 const float scalar,
89 const unsigned int num_points)
90{
91 lv_32fc_t diff;
92 for (unsigned int i = 0; i < num_points; ++i) {
93 /*
94 * Calculate: |y - x|^2 * SNR_lin
95 * Compare C++: *target++ = scalar * std::norm(symbol - *constellation++);
96 */
97 diff = symbol - *points++;
98 *target++ =
99 scalar * (lv_creal(diff) * lv_creal(diff) + lv_cimag(diff) * lv_cimag(diff));
100 }
101}
102
103
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 kernel (aligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 points per iteration, then 4-, 2- and 1-point tails.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Main loop handles 64 bytes (8 complex floats) per iteration.
    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Permutation that gathers the hadd results so that each 128-bit lane
    // ends up holding the four distances in order.
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        // 4 points left: one AVX iteration.
        xmm_points0 = _mm256_load_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_store_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        // 2 points left: one SSE iteration.
        xmm9 = _mm_load_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // At most a single point remains; finish in plain C.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/
186
187
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * AVX kernel (aligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 points per iteration; the remainder is done in plain C.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_load_ps((float*)points);
        xmm_points1 = _mm256_load_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_store_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */
227
228
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * SSE3 kernel (aligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * First do 4 values in every loop iteration.
     * There may be up to 3 values left.
     * leftovers0 indicates if at least 2 more are available for SSE execution.
     * leftovers1 indicates if there is a single element left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_store_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_load_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/
288
#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>

/*!
 * SSE kernel (aligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_load_ps((float*)points);
        __m128 xmm_points1 = _mm_load_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_store_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE
315
#ifdef LV_HAVE_GENERIC
/*!
 * Generic (portable C) kernel: target[i] = scalar * |src0[0] - points[i]|^2.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float* target,
                                                      lv_32fc_t* src0,
                                                      lv_32fc_t* points,
                                                      float scalar,
                                                      unsigned int num_points)
{
    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, num_points);
}

#endif /*LV_HAVE_GENERIC*/
329
330
331#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_H*/
332
333#ifndef INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
334#define INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H
335
336#include <volk/volk_complex.h>
337
338
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * AVX2 kernel (unaligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 points per iteration, then 4-, 2- and 1-point tails.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx2(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 8;
    __m128 xmm9, xmm10;
    __m256 xmm4, xmm6;
    __m256 xmm_points0, xmm_points1, xmm_result;

    // Main loop handles 64 bytes (8 complex floats) per iteration.
    const unsigned int bound = num_bytes >> 6;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));
    const __m128 xmm128_symbol = _mm256_extractf128_ps(xmm_symbol, 1);

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);
    const __m128 xmm128_scalar = _mm256_extractf128_ps(xmm_scalar, 1);

    // Permutation that gathers the hadd results so that each 128-bit lane
    // ends up holding the four distances in order.
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    for (unsigned int i = 0; i < bound; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;
        __VOLK_PREFETCH(points);

        xmm_result = _mm256_scaled_norm_dist_ps_avx2(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    if (num_bytes >> 5 & 1) {
        // 4 points left: one AVX iteration.
        xmm_points0 = _mm256_loadu_ps((float*)points);

        xmm4 = _mm256_sub_ps(xmm_symbol, xmm_points0);

        points += 4;

        xmm6 = _mm256_mul_ps(xmm4, xmm4);

        xmm4 = _mm256_hadd_ps(xmm6, xmm6);
        xmm4 = _mm256_permutevar8x32_ps(xmm4, idx);

        xmm_result = _mm256_mul_ps(xmm4, xmm_scalar);

        xmm9 = _mm256_extractf128_ps(xmm_result, 1);
        _mm_storeu_ps(target, xmm9);
        target += 4;
    }

    if (num_bytes >> 4 & 1) {
        // 2 points left: one SSE iteration.
        xmm9 = _mm_loadu_ps((float*)points);

        xmm10 = _mm_sub_ps(xmm128_symbol, xmm9);

        points += 2;

        xmm9 = _mm_mul_ps(xmm10, xmm10);

        xmm10 = _mm_hadd_ps(xmm9, xmm9);

        xmm10 = _mm_mul_ps(xmm10, xmm128_scalar);

        _mm_storeh_pi((__m64*)target, xmm10);
        target += 2;
    }

    // At most a single point remains; finish in plain C.
    calculate_scaled_distances(target, src0[0], points, scalar, (num_bytes >> 3) & 1);
}

#endif /*LV_HAVE_AVX2*/
421
422
#ifdef LV_HAVE_AVX
#include <immintrin.h>
#include <volk/volk_avx_intrinsics.h>

/*!
 * AVX kernel (unaligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 * Processes 8 points per iteration; the remainder is done in plain C.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const int eightsPoints = num_points / 8;
    const int remainder = num_points - 8 * eightsPoints;

    __m256 xmm_points0, xmm_points1, xmm_result;

    // load complex value into all parts of the register.
    const __m256 xmm_symbol = _mm256_castpd_ps(_mm256_broadcast_sd((const double*)src0));

    // Load scalar into all 8 parts of the register
    const __m256 xmm_scalar = _mm256_broadcast_ss(&scalar);

    for (int i = 0; i < eightsPoints; ++i) {
        xmm_points0 = _mm256_loadu_ps((float*)points);
        xmm_points1 = _mm256_loadu_ps((float*)(points + 4));
        points += 8;

        xmm_result = _mm256_scaled_norm_dist_ps(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm256_storeu_ps(target, xmm_result);
        target += 8;
    }

    const lv_32fc_t symbol = *src0;
    calculate_scaled_distances(target, symbol, points, scalar, remainder);
}

#endif /* LV_HAVE_AVX */
462
463
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <volk/volk_sse3_intrinsics.h>

/*!
 * SSE3 kernel (unaligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float* target,
                                                     lv_32fc_t* src0,
                                                     lv_32fc_t* points,
                                                     float scalar,
                                                     unsigned int num_points)
{
    __m128 xmm_points0, xmm_points1, xmm_result;

    /*
     * First do 4 values in every loop iteration.
     * There may be up to 3 values left.
     * leftovers0 indicates if at least 2 more are available for SSE execution.
     * leftovers1 indicates if there is a single element left.
     */
    const int quarterPoints = num_points / 4;
    const int leftovers0 = (num_points / 2) - 2 * quarterPoints;
    const int leftovers1 = num_points % 2;

    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    // Load scalar into all 4 parts of the register
    const __m128 xmm_scalar = _mm_load1_ps(&scalar);

    for (int i = 0; i < quarterPoints; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __VOLK_PREFETCH(points);
        // calculate distances
        xmm_result = _mm_scaled_norm_dist_ps_sse3(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);

        _mm_storeu_ps(target, xmm_result);
        target += 4;
    }

    for (int i = 0; i < leftovers0; ++i) {
        xmm_points0 = _mm_loadu_ps((float*)points);
        points += 2;

        xmm_points0 = _mm_sub_ps(xmm_symbol, xmm_points0);
        xmm_points0 = _mm_mul_ps(xmm_points0, xmm_points0);
        xmm_points0 = _mm_hadd_ps(xmm_points0, xmm_points0);
        xmm_result = _mm_mul_ps(xmm_points0, xmm_scalar);

        _mm_storeh_pi((__m64*)target, xmm_result);
        target += 2;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, leftovers1);
}

#endif /*LV_HAVE_SSE3*/
523
#ifdef LV_HAVE_SSE
#include <volk/volk_sse_intrinsics.h>
#include <xmmintrin.h>

/*!
 * SSE kernel (unaligned loads/stores): target[i] = scalar * |src0[0] - points[i]|^2.
 */
static inline void
volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float* target,
                                                    lv_32fc_t* src0,
                                                    lv_32fc_t* points,
                                                    float scalar,
                                                    unsigned int num_points)
{
    const __m128 xmm_scalar = _mm_set1_ps(scalar);
    // load complex value into both parts of the register.
    const __m128 xmm_symbol = _mm_castpd_ps(_mm_load1_pd((const double*)src0));

    for (unsigned i = 0; i < num_points / 4; ++i) {
        __m128 xmm_points0 = _mm_loadu_ps((float*)points);
        __m128 xmm_points1 = _mm_loadu_ps((float*)(points + 2));
        points += 4;
        __m128 xmm_result = _mm_scaled_norm_dist_ps_sse(
            xmm_symbol, xmm_symbol, xmm_points0, xmm_points1, xmm_scalar);
        _mm_storeu_ps((float*)target, xmm_result);
        target += 4;
    }

    calculate_scaled_distances(target, src0[0], points, scalar, num_points % 4);
}
#endif // LV_HAVE_SSE
550
551#endif /*INCLUDED_volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_H*/
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:193
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_avx(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:428
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:528
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:234
static void calculate_scaled_distances(float *target, const lv_32fc_t symbol, const lv_32fc_t *points, const float scalar, const unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:85
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_a_sse(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:293
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_generic(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:318
static void volk_32fc_x2_s32f_square_dist_scalar_mult_32f_u_sse3(float *target, lv_32fc_t *src0, lv_32fc_t *points, float scalar, unsigned int num_points)
Definition: volk_32fc_x2_s32f_square_dist_scalar_mult_32f.h:469
static __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx2_intrinsics.h:105
static __m256 _mm256_scaled_norm_dist_ps(const __m256 symbols0, const __m256 symbols1, const __m256 points0, const __m256 points1, const __m256 scalar)
Definition: volk_avx_intrinsics.h:88
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25
static __m128 _mm_scaled_norm_dist_ps_sse3(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse3_intrinsics.h:63
static __m128 _mm_scaled_norm_dist_ps_sse(const __m128 symbols0, const __m128 symbols1, const __m128 points0, const __m128 points1, const __m128 scalar)
Definition: volk_sse_intrinsics.h:49