Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_index_max_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016, 2018-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
70#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
71#define INCLUDED_volk_32fc_index_max_32u_a_H
72
73#include <inttypes.h>
74#include <stdio.h>
75#include <volk/volk_common.h>
76#include <volk/volk_complex.h>
77
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Returns the index of the complex sample with the largest magnitude
 *        squared (|re|^2 + |im|^2). Aligned AVX2 implementation, variant 0.
 * \param target     Output: index of the maximum-magnitude sample.
 * \param src0       Input buffer of complex samples (must be 32-byte aligned).
 * \param num_points Number of complex samples in \p src0.
 */
static inline void volk_32fc_index_max_32u_a_avx2_variant_0(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (two 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
135
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Returns the index of the complex sample with the largest magnitude
 *        squared (|re|^2 + |im|^2). Aligned AVX2 implementation, variant 1
 *        (alternate instruction mix of vector_32fc_index_max_variant1()).
 * \param target     Output: index of the maximum-magnitude sample.
 * \param src0       Input buffer of complex samples (must be 32-byte aligned).
 * \param num_points Number of complex samples in \p src0.
 */
static inline void volk_32fc_index_max_32u_a_avx2_variant_1(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (two 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)src0);
        __m256 in1 = _mm256_load_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
193
194#ifdef LV_HAVE_SSE3
195#include <pmmintrin.h>
196#include <xmmintrin.h>
197
// Returns in target[0] the index of the complex sample with the largest
// magnitude squared (re^2 + im^2). Aligned SSE3 implementation: processes
// 4 samples per iteration, then a 2-sample chunk and a final scalar sample
// as needed. src0 is assumed 16-byte aligned (uses _mm_load_ps).
static inline void
volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    const uint32_t num_bytes = num_points * 8; // 8 bytes per lv_32fc_t sample

    // Scratch unions to read individual float/int lanes of SSE registers.
    union bit128 holderf;
    union bit128 holderi;
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;
    __m128 xmm1, xmm2, xmm3;
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10;

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    // Main loop consumes 32 bytes (4 complex samples) per iteration.
    int bound = num_bytes >> 5;
    int i = 0;

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);  // sample indices of the current chunk
    xmm9 = _mm_setzero_si128();         // per-lane indices of the maxima so far
    xmm10 = _mm_setr_epi32(4, 4, 4, 4); // index increment per iteration
    xmm3 = _mm_setzero_ps();            // per-lane running maxima of |z|^2

    for (; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)src0);
        xmm2 = _mm_load_ps((float*)&src0[2]);

        src0 += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // Horizontal add pairs re^2 + im^2 -> four magnitude-squared values.
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        // cmpeq marks lanes where the new value is the (new) maximum: they
        // take the new index (xmm8). cmplt marks lanes where the old maximum
        // survived: they keep the stored index (xmm9). The masked results are
        // added because the two masks select disjoint lanes.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Leftover 16-byte chunk (2 complex samples), if present.
    if (num_bytes >> 4 & 1) {
        xmm2 = _mm_load_ps((float*)src0);

        // Duplicate the low two candidate indices into all four lanes so the
        // two real samples are compared in lanes 0-1 (lanes 2-3 mirror them).
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        src0 += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_max_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2); // only 2 samples consumed here

        // Same index-select scheme as the main loop.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Final single sample, if num_points is odd.
    if (num_bytes >> 3 & 1) {
        sq_dist =
            lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        // Only lane 0 is updated by max_ss; other lanes keep their maxima.
        xmm3 = _mm_max_ss(xmm3, xmm2);

        // Here cmplt marks the lane where the NEW scalar won (old < new), so
        // the mask roles are swapped relative to the loops above.
        xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm8 = _mm_shuffle_epi32(xmm8, 0x00); // broadcast the scalar's index

        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // Spill the four per-lane maxima/indices and reduce them sequentially.
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
}
308
309#endif /*LV_HAVE_SSE3*/
310
311#ifdef LV_HAVE_GENERIC
312static inline void
313volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
314{
315 const uint32_t num_bytes = num_points * 8;
316
317 float sq_dist = 0.0;
318 float max = 0.0;
319 uint32_t index = 0;
320
321 uint32_t i = 0;
322
323 for (; i<num_bytes>> 3; ++i) {
324 sq_dist =
325 lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
326
327 if (sq_dist > max) {
328 index = i;
329 max = sq_dist;
330 }
331 }
332 target[0] = index;
333}
334
335#endif /*LV_HAVE_GENERIC*/
336
337#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
338
339#ifndef INCLUDED_volk_32fc_index_max_32u_u_H
340#define INCLUDED_volk_32fc_index_max_32u_u_H
341
342#include <inttypes.h>
343#include <stdio.h>
344#include <volk/volk_common.h>
345#include <volk/volk_complex.h>
346
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Returns the index of the complex sample with the largest magnitude
 *        squared (|re|^2 + |im|^2). Unaligned AVX2 implementation, variant 0.
 * \param target     Output: index of the maximum-magnitude sample.
 * \param src0       Input buffer of complex samples (no alignment required).
 * \param num_points Number of complex samples in \p src0.
 */
static inline void volk_32fc_index_max_32u_u_avx2_variant_0(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (two 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
404
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Returns the index of the complex sample with the largest magnitude
 *        squared (|re|^2 + |im|^2). Unaligned AVX2 implementation, variant 1
 *        (alternate instruction mix of vector_32fc_index_max_variant1()).
 * \param target     Output: index of the maximum-magnitude sample.
 * \param src0       Input buffer of complex samples (no alignment required).
 * \param num_points Number of complex samples in \p src0.
 */
static inline void volk_32fc_index_max_32u_u_avx2_variant_1(uint32_t* target,
                                                            lv_32fc_t* src0,
                                                            uint32_t num_points)
{
    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_max_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();

    // Process 8 complex samples (two 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)src0);
        __m256 in1 = _mm256_loadu_ps((float*)(src0 + 4));
        vector_32fc_index_max_variant1(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src0 += 8;
    }

    // determine maximum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float max_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t max_indices_buffer[8];
    _mm256_store_ps(max_values_buffer, max_values);
    _mm256_store_si256((__m256i*)max_indices_buffer, max_indices);

    float max = 0.f;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (max_values_buffer[i] > max) {
            max = max_values_buffer[i];
            index = max_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*src0) * lv_creal(*src0) + lv_cimag(*src0) * lv_cimag(*src0);
        if (abs_squared > max) {
            max = abs_squared;
            index = i;
        }
        ++src0;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
462
#ifdef LV_HAVE_NEON
#include <arm_neon.h>
#include <volk/volk_neon_intrinsics.h>

/*!
 * \brief Returns the index of the complex sample with the largest magnitude
 *        squared (|re|^2 + |im|^2). NEON implementation.
 * \param target     Output: index of the maximum-magnitude sample.
 *                   NOTE(review): left unwritten when num_points == 0, unlike
 *                   the generic implementation which writes 0.
 * \param src0       Input buffer of complex samples.
 * \param num_points Number of complex samples in \p src0.
 */
static inline void
volk_32fc_index_max_32u_neon(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
{
    unsigned int number = 0;
    const uint32_t quarter_points = num_points / 4;
    const lv_32fc_t* src0Ptr = src0;

    uint32_t indices[4] = { 0, 1, 2, 3 };
    const uint32x4_t vec_indices_incr = vdupq_n_u32(4);
    uint32x4_t vec_indices = vld1q_u32(indices);
    uint32x4_t vec_max_indices = vec_indices;

    if (num_points) {
        // Seed the running maxima with 0 (|z|^2 is never negative), matching
        // the other implementations. Seeding with *src0Ptr (the real part of
        // the first sample) could exceed that sample's own |z|^2 and corrupt
        // the index selection.
        float max = 0.0f;
        uint32_t index = 0;

        float32x4_t vec_max = vdupq_n_f32(0.0f);

        for (; number < quarter_points; number++) {
            // Load complex and compute magnitude squared
            const float32x4_t vec_mag2 =
                _vmagnitudesquaredq_f32(vld2q_f32((float*)src0Ptr));
            __VOLK_PREFETCH(src0Ptr += 4);
            // a > b?
            const uint32x4_t gt_mask = vcgtq_f32(vec_mag2, vec_max);
            vec_max = vbslq_f32(gt_mask, vec_mag2, vec_max);
            vec_max_indices = vbslq_u32(gt_mask, vec_indices, vec_max_indices);
            vec_indices = vaddq_u32(vec_indices, vec_indices_incr);
        }
        // Reduce the four per-lane maxima/indices sequentially.
        uint32_t tmp_max_indices[4];
        float tmp_max[4];
        vst1q_u32(tmp_max_indices, vec_max_indices);
        vst1q_f32(tmp_max, vec_max);

        for (int i = 0; i < 4; i++) {
            if (tmp_max[i] > max) {
                max = tmp_max[i];
                index = tmp_max_indices[i];
            }
        }

        // Deal with the rest
        for (number = quarter_points * 4; number < num_points; number++) {
            const float re = lv_creal(*src0Ptr);
            const float im = lv_cimag(*src0Ptr);
            const float sq_dist = re * re + im * im;
            if (sq_dist > max) {
                // Store the magnitude squared, not the complex sample itself
                // (the latter silently kept only the real part).
                max = sq_dist;
                index = number;
            }
            src0Ptr++;
        }
        *target = index;
    }
}

#endif /*LV_HAVE_NEON*/
523
524#endif /*INCLUDED_volk_32fc_index_max_32u_u_H*/
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_max_32u_generic(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:313
static void volk_32fc_index_max_32u_a_sse3(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:199
static void volk_32fc_index_max_32u_neon(uint32_t *target, lv_32fc_t *src0, uint32_t num_points)
Definition: volk_32fc_index_max_32u.h:468
static void vector_32fc_index_max_variant1(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:201
static void vector_32fc_index_max_variant0(__m256 in0, __m256 in1, __m256 *max_values, __m256i *max_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:139
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:86