Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_index_min_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * VOLK is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * VOLK is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
76#ifndef INCLUDED_volk_32fc_index_min_16u_a_H
77#define INCLUDED_volk_32fc_index_min_16u_a_H
78
#include <float.h>
#include <inttypes.h>
#include <limits.h>
#include <stdio.h>
#include <volk/volk_common.h>
#include <volk/volk_complex.h>
84
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (aligned AVX2 implementation, variant 0).
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats (must be 32-byte aligned).
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_a_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Process 8 complex points (2x 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
144
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (aligned AVX2 implementation, variant 1).
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats (must be 32-byte aligned).
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_a_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Process 8 complex points (2x 256-bit loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_load_ps((float*)source);
        __m256 in1 = _mm256_load_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
204
#ifdef LV_HAVE_SSE3
#include <pmmintrin.h>
#include <xmmintrin.h>

/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (aligned SSE3 implementation).
 *
 * Processes 4 complex points per iteration: squares real/imag parts and uses
 * a horizontal add to form |x|^2 for 4 points at once, then tracks per-lane
 * minima (xmm3) and their indices (xmm9) with compare masks.
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats (must be 16-byte aligned).
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_a_sse3(uint16_t* target,
                                                  const lv_32fc_t* source,
                                                  uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    union bit128 holderf; // scratch for the final per-lane minimum values
    union bit128 holderi; // scratch for the final per-lane minimum indices
    float sq_dist = 0.0;

    union bit128 xmm5, xmm4;          // compare masks, reinterpreted as integer lanes
    __m128 xmm1, xmm2, xmm3;          // xmm3 accumulates per-lane minimum |x|^2
    __m128i xmm8, xmm11, xmm12, xmm9, xmm10; // xmm8: running indices; xmm9: best indices

    xmm5.int_vec = _mm_setzero_si128();
    xmm4.int_vec = _mm_setzero_si128();
    holderf.int_vec = _mm_setzero_si128();
    holderi.int_vec = _mm_setzero_si128();

    xmm8 = _mm_setr_epi32(0, 1, 2, 3);   // indices of the 4 points in flight
    xmm9 = _mm_setzero_si128();          // best-so-far index per lane
    xmm10 = _mm_setr_epi32(4, 4, 4, 4);  // index increment per iteration
    xmm3 = _mm_set_ps1(FLT_MAX);         // best-so-far |x|^2 per lane

    int bound = num_points >> 2; // number of full 4-point iterations

    for (int i = 0; i < bound; ++i) {
        xmm1 = _mm_load_ps((float*)source);
        xmm2 = _mm_load_ps((float*)&source[2]);

        source += 4;

        xmm1 = _mm_mul_ps(xmm1, xmm1);
        xmm2 = _mm_mul_ps(xmm2, xmm2);

        // horizontal add pairs re^2+im^2 -> four |x|^2 values in xmm1
        xmm1 = _mm_hadd_ps(xmm1, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        // After the min, xmm1 > xmm3 marks lanes where the old minimum stands;
        // xmm1 == xmm3 marks lanes where the new value won (or tied).
        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        // Select new index (xmm8) where the new value won, keep old (xmm9) otherwise.
        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Tail: one remaining pair of complex points (2 points).
    if (num_points >> 1 & 1) {
        xmm2 = _mm_load_ps((float*)source);

        // Duplicate the low two indices so both halves of xmm8 carry them.
        xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
        xmm8 = bit128_p(&xmm1)->int_vec;

        xmm2 = _mm_mul_ps(xmm2, xmm2);

        source += 2;

        xmm1 = _mm_hadd_ps(xmm2, xmm2);

        xmm3 = _mm_min_ps(xmm1, xmm3);

        xmm10 = _mm_setr_epi32(2, 2, 2, 2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);

        xmm8 = _mm_add_epi32(xmm8, xmm10);
    }

    // Tail: one remaining single complex point, handled in scalar then merged.
    if (num_points & 1) {
        sq_dist = lv_creal(source[0]) * lv_creal(source[0]) +
                  lv_cimag(source[0]) * lv_cimag(source[0]);

        xmm2 = _mm_load1_ps(&sq_dist);

        xmm1 = xmm3;

        // Only lane 0 is updated here (min_ss); the other lanes keep xmm3.
        xmm3 = _mm_min_ss(xmm3, xmm2);

        xmm4.float_vec = _mm_cmpgt_ps(xmm1, xmm3);
        xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);

        // Broadcast the current scalar index into all lanes of xmm8.
        xmm8 = _mm_shuffle_epi32(xmm8, 0x00);

        // NOTE(review): mask roles are swapped relative to the loops above
        // (gt selects the new index here) — presumably intentional for the
        // min_ss single-lane case; verify against the max-kernel counterpart.
        xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
        xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);

        xmm9 = _mm_add_epi32(xmm11, xmm12);
    }

    // Reduce the four lanes to a single scalar minimum and its index.
    _mm_store_ps((float*)&(holderf.f), xmm3);
    _mm_store_si128(&(holderi.int_vec), xmm9);

    target[0] = holderi.i[0];
    sq_dist = holderf.f[0];
    target[0] = (holderf.f[1] < sq_dist) ? holderi.i[1] : target[0];
    sq_dist = (holderf.f[1] < sq_dist) ? holderf.f[1] : sq_dist;
    target[0] = (holderf.f[2] < sq_dist) ? holderi.i[2] : target[0];
    sq_dist = (holderf.f[2] < sq_dist) ? holderf.f[2] : sq_dist;
    target[0] = (holderf.f[3] < sq_dist) ? holderi.i[3] : target[0];
    sq_dist = (holderf.f[3] < sq_dist) ? holderf.f[3] : sq_dist;
}

#endif /*LV_HAVE_SSE3*/
321
#ifdef LV_HAVE_GENERIC
/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (portable scalar implementation).
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats.
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_generic(uint16_t* target,
                                                   const lv_32fc_t* source,
                                                   uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    uint16_t best_index = 0;
    float best_sq_mag = FLT_MAX;

    for (uint32_t i = 0; i < num_points; ++i) {
        const float re = lv_creal(source[i]);
        const float im = lv_cimag(source[i]);
        const float sq_mag = re * re + im * im;

        if (sq_mag < best_sq_mag) {
            best_sq_mag = sq_mag;
            best_index = i;
        }
    }

    target[0] = best_index;
}

#endif /*LV_HAVE_GENERIC*/
346
347#endif /*INCLUDED_volk_32fc_index_min_16u_a_H*/
348
349#ifndef INCLUDED_volk_32fc_index_min_16u_u_H
350#define INCLUDED_volk_32fc_index_min_16u_u_H
351
352#include <inttypes.h>
353#include <limits.h>
354#include <stdio.h>
355#include <volk/volk_common.h>
356#include <volk/volk_complex.h>
357
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (unaligned AVX2 implementation, variant 0).
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats (no alignment required).
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_u_avx2_variant_0(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Process 8 complex points (2x 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant0(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
417
#ifdef LV_HAVE_AVX2
#include <immintrin.h>
#include <volk/volk_avx2_intrinsics.h>

/*!
 * \brief Finds the index of the complex sample with the smallest squared
 * magnitude (unaligned AVX2 implementation, variant 1).
 *
 * \param target     Output: index of the point with minimum |x|^2.
 * \param source     Input vector of complex floats (no alignment required).
 * \param num_points Number of complex points to search.
 */
static inline void volk_32fc_index_min_16u_u_avx2_variant_1(uint16_t* target,
                                                            const lv_32fc_t* source,
                                                            uint32_t num_points)
{
    // The result index is only 16 bits wide, so clamp the search range.
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    const __m256i indices_increment = _mm256_set1_epi32(8);
    /*
     * At the start of each loop iteration current_indices holds the indices of
     * the complex numbers loaded from memory. Explanation for odd order is given
     * in implementation of vector_32fc_index_min_variant0().
     */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);

    __m256 min_values = _mm256_set1_ps(FLT_MAX);
    __m256i min_indices = _mm256_setzero_si256();

    // Process 8 complex points (2x 256-bit unaligned loads) per iteration.
    for (unsigned i = 0; i < num_points / 8u; ++i) {
        __m256 in0 = _mm256_loadu_ps((float*)source);
        __m256 in1 = _mm256_loadu_ps((float*)(source + 4));
        vector_32fc_index_min_variant1(
            in0, in1, &min_values, &min_indices, &current_indices, indices_increment);
        source += 8;
    }

    // determine minimum value and index in the result of the vectorized loop
    __VOLK_ATTR_ALIGNED(32) float min_values_buffer[8];
    __VOLK_ATTR_ALIGNED(32) uint32_t min_indices_buffer[8];
    _mm256_store_ps(min_values_buffer, min_values);
    _mm256_store_si256((__m256i*)min_indices_buffer, min_indices);

    float min = FLT_MAX;
    uint32_t index = 0;
    for (unsigned i = 0; i < 8; i++) {
        if (min_values_buffer[i] < min) {
            min = min_values_buffer[i];
            index = min_indices_buffer[i];
        }
    }

    // handle tail not processed by the vectorized loop
    for (unsigned i = num_points & (~7u); i < num_points; ++i) {
        const float abs_squared =
            lv_creal(*source) * lv_creal(*source) + lv_cimag(*source) * lv_cimag(*source);
        if (abs_squared < min) {
            min = abs_squared;
            index = i;
        }
        ++source;
    }

    *target = index;
}

#endif /*LV_HAVE_AVX2*/
477
478#endif /*INCLUDED_volk_32fc_index_min_16u_u_H*/
Definition: volk_common.h:111
float f[4]
Definition: volk_common.h:115
__m128i int_vec
Definition: volk_common.h:123
uint32_t i[4]
Definition: volk_common.h:114
__m128 float_vec
Definition: volk_common.h:119
static void volk_32fc_index_min_16u_generic(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:323
static void volk_32fc_index_min_16u_a_sse3(uint16_t *target, const lv_32fc_t *source, uint32_t num_points)
Definition: volk_32fc_index_min_16u.h:209
static void vector_32fc_index_min_variant0(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:251
static void vector_32fc_index_min_variant1(__m256 in0, __m256 in1, __m256 *min_values, __m256i *min_indices, __m256i *current_indices, __m256i indices_increment)
Definition: volk_avx2_intrinsics.h:313
#define bit128_p(x)
Definition: volk_common.h:142
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
float complex lv_32fc_t
Definition: volk_complex.h:65
for i
Definition: volk_config_fixed.tmpl.h:25