Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_index_min_32u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * VOLK is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * VOLK is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
65#ifndef INCLUDED_volk_32f_index_min_32u_a_H
66#define INCLUDED_volk_32f_index_min_32u_a_H
67
68#include <inttypes.h>
69#include <stdio.h>
70#include <volk/volk_common.h>
71
72#ifdef LV_HAVE_SSE4_1
73#include <smmintrin.h>
74
75static inline void volk_32f_index_min_32u_a_sse4_1(uint32_t* target,
76 const float* source,
77 uint32_t num_points)
78{
79 const uint32_t quarterPoints = num_points / 4;
80
81 float* inputPtr = (float*)source;
82
83 __m128 indexIncrementValues = _mm_set1_ps(4);
84 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
85
86 float min = source[0];
87 float index = 0;
88 __m128 minValues = _mm_set1_ps(min);
89 __m128 minValuesIndex = _mm_setzero_ps();
90 __m128 compareResults;
91 __m128 currentValues;
92
93 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
94 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
95
96 for (uint32_t number = 0; number < quarterPoints; number++) {
97
98 currentValues = _mm_load_ps(inputPtr);
99 inputPtr += 4;
100 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
101
102 compareResults = _mm_cmplt_ps(currentValues, minValues);
103
104 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
105 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
106 }
107
108 // Calculate the smallest value from the remaining 4 points
109 _mm_store_ps(minValuesBuffer, minValues);
110 _mm_store_ps(minIndexesBuffer, minValuesIndex);
111
112 for (uint32_t number = 0; number < 4; number++) {
113 if (minValuesBuffer[number] < min) {
114 index = minIndexesBuffer[number];
115 min = minValuesBuffer[number];
116 } else if (minValuesBuffer[number] == min) {
117 if (index > minIndexesBuffer[number])
118 index = minIndexesBuffer[number];
119 }
120 }
121
122 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
123 if (source[number] < min) {
124 index = number;
125 min = source[number];
126 }
127 }
128 target[0] = (uint32_t)index;
129}
130
131#endif /*LV_HAVE_SSE4_1*/
132
133
134#ifdef LV_HAVE_SSE
135
136#include <xmmintrin.h>
137
138static inline void
139volk_32f_index_min_32u_a_sse(uint32_t* target, const float* source, uint32_t num_points)
140{
141 const uint32_t quarterPoints = num_points / 4;
142
143 float* inputPtr = (float*)source;
144
145 __m128 indexIncrementValues = _mm_set1_ps(4);
146 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
147
148 float min = source[0];
149 float index = 0;
150 __m128 minValues = _mm_set1_ps(min);
151 __m128 minValuesIndex = _mm_setzero_ps();
152 __m128 compareResults;
153 __m128 currentValues;
154
155 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
156 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
157
158 for (uint32_t number = 0; number < quarterPoints; number++) {
159
160 currentValues = _mm_load_ps(inputPtr);
161 inputPtr += 4;
162 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
163
164 compareResults = _mm_cmplt_ps(currentValues, minValues);
165
166 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
167 _mm_andnot_ps(compareResults, minValuesIndex));
168
169 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
170 _mm_andnot_ps(compareResults, minValues));
171 }
172
173 // Calculate the smallest value from the remaining 4 points
174 _mm_store_ps(minValuesBuffer, minValues);
175 _mm_store_ps(minIndexesBuffer, minValuesIndex);
176
177 for (uint32_t number = 0; number < 4; number++) {
178 if (minValuesBuffer[number] < min) {
179 index = minIndexesBuffer[number];
180 min = minValuesBuffer[number];
181 } else if (minValuesBuffer[number] == min) {
182 if (index > minIndexesBuffer[number])
183 index = minIndexesBuffer[number];
184 }
185 }
186
187 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
188 if (source[number] < min) {
189 index = number;
190 min = source[number];
191 }
192 }
193 target[0] = (uint32_t)index;
194}
195
196#endif /*LV_HAVE_SSE*/
197
198
199#ifdef LV_HAVE_AVX
200#include <immintrin.h>
201
202static inline void
203volk_32f_index_min_32u_a_avx(uint32_t* target, const float* source, uint32_t num_points)
204{
205 const uint32_t quarterPoints = num_points / 8;
206
207 float* inputPtr = (float*)source;
208
209 __m256 indexIncrementValues = _mm256_set1_ps(8);
210 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
211
212 float min = source[0];
213 float index = 0;
214 __m256 minValues = _mm256_set1_ps(min);
215 __m256 minValuesIndex = _mm256_setzero_ps();
216 __m256 compareResults;
217 __m256 currentValues;
218
219 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
220 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
221
222 for (uint32_t number = 0; number < quarterPoints; number++) {
223 currentValues = _mm256_load_ps(inputPtr);
224 inputPtr += 8;
225 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
226 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
227 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
228 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
229 }
230
231 // Calculate the smallest value from the remaining 8 points
232 _mm256_store_ps(minValuesBuffer, minValues);
233 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
234
235 for (uint32_t number = 0; number < 8; number++) {
236 if (minValuesBuffer[number] < min) {
237 index = minIndexesBuffer[number];
238 min = minValuesBuffer[number];
239 } else if (minValuesBuffer[number] == min) {
240 if (index > minIndexesBuffer[number])
241 index = minIndexesBuffer[number];
242 }
243 }
244
245 for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
246 if (source[number] < min) {
247 index = number;
248 min = source[number];
249 }
250 }
251 target[0] = (uint32_t)index;
252}
253
254#endif /*LV_HAVE_AVX*/
255
256
257#ifdef LV_HAVE_NEON
258#include <arm_neon.h>
259
/*
 * Writes to target[0] the index of the smallest value in source (NEON kernel).
 *
 * target     - receives the index of the minimum; only target[0] is written.
 * source     - input samples, read four at a time with vld1q_f32.
 * num_points - number of samples in source.
 *
 * For num_points == 0 nothing is read from source and target[0] is set to 0.
 *
 * NOTE: indexes are generated as floats before being converted to integers,
 * so positions above 2^24 cannot all be represented exactly.
 */
static inline void
volk_32f_index_min_32u_neon(uint32_t* target, const float* source, uint32_t num_points)
{
    // Empty input: do not touch source[0]; report index 0.
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    const uint32_t quarterPoints = num_points / 4;

    const float* inputPtr = source;
    float32x4_t indexIncrementValues = vdupq_n_f32(4);
    // Lanes start at {-4, -3, -2, -1} so the first increment yields {0, 1, 2, 3}.
    float currentIndexes_float[4] = { -4.0f, -3.0f, -2.0f, -1.0f };
    float32x4_t currentIndexes = vld1q_f32(currentIndexes_float);

    // Seed every lane with the first sample.
    float min = source[0];
    float index = 0;
    float32x4_t minValues = vdupq_n_f32(min);
    uint32x4_t minValuesIndex = vmovq_n_u32(0);
    uint32x4_t compareResults;
    uint32x4_t currentIndexes_u;
    float32x4_t currentValues;

    __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
    __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];

    for (uint32_t number = 0; number < quarterPoints; number++) {
        currentValues = vld1q_f32(inputPtr);
        inputPtr += 4;
        currentIndexes = vaddq_f32(currentIndexes, indexIncrementValues);
        currentIndexes_u = vcvtq_u32_f32(currentIndexes);

        // Lane mask: set where the new sample is NOT smaller (keep old index).
        compareResults = vcgeq_f32(currentValues, minValues);

        // (mask & old index) | (new index & ~mask)
        minValuesIndex = vorrq_u32(vandq_u32(compareResults, minValuesIndex),
                                   vbicq_u32(currentIndexes_u, compareResults));
        minValues = vminq_f32(currentValues, minValues);
    }

    // Reduce the 4 per-lane minima to a single minimum and index.
    vst1q_f32(minValuesBuffer, minValues);
    vst1q_f32(minIndexesBuffer, vcvtq_f32_u32(minValuesIndex));
    for (uint32_t number = 0; number < 4; number++) {
        if (minValuesBuffer[number] < min) {
            index = minIndexesBuffer[number];
            min = minValuesBuffer[number];
        } else if (minValuesBuffer[number] == min) {
            // Compare against the stored lane value rather than subscripting
            // the vector register, which relies on a compiler extension.
            if (index > minIndexesBuffer[number])
                index = minIndexesBuffer[number];
        }
    }

    // Scalar pass over the samples that did not fill a whole vector.
    for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
        if (source[number] < min) {
            index = (float)number;
            min = source[number];
        }
    }
    target[0] = (uint32_t)index;
}
314
315#endif /*LV_HAVE_NEON*/
316
317
318#ifdef LV_HAVE_GENERIC
319
/*
 * Writes to target[0] the index of the smallest value in source
 * (portable scalar kernel).
 *
 * target     - receives the index of the minimum; only target[0] is written.
 * source     - input samples.
 * num_points - number of samples in source.
 *
 * The comparison is strict, so when several samples share the minimum value
 * the lowest index is reported. For num_points == 0 nothing is read from
 * source and target[0] is set to 0.
 */
static inline void
volk_32f_index_min_32u_generic(uint32_t* target, const float* source, uint32_t num_points)
{
    // Empty input: do not touch source[0]; report index 0.
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    float min = source[0];
    uint32_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        if (source[i] < min) {
            index = i;
            min = source[i];
        }
    }
    target[0] = index;
}
334
335#endif /*LV_HAVE_GENERIC*/
336
337
338#endif /*INCLUDED_volk_32f_index_min_32u_a_H*/
339
340
341#ifndef INCLUDED_volk_32f_index_min_32u_u_H
342#define INCLUDED_volk_32f_index_min_32u_u_H
343
344#include <inttypes.h>
345#include <stdio.h>
346#include <volk/volk_common.h>
347
348
349#ifdef LV_HAVE_AVX
350#include <immintrin.h>
351
352static inline void
353volk_32f_index_min_32u_u_avx(uint32_t* target, const float* source, uint32_t num_points)
354{
355 const uint32_t quarterPoints = num_points / 8;
356
357 float* inputPtr = (float*)source;
358
359 __m256 indexIncrementValues = _mm256_set1_ps(8);
360 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
361
362 float min = source[0];
363 float index = 0;
364 __m256 minValues = _mm256_set1_ps(min);
365 __m256 minValuesIndex = _mm256_setzero_ps();
366 __m256 compareResults;
367 __m256 currentValues;
368
369 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
370 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
371
372 for (uint32_t number = 0; number < quarterPoints; number++) {
373 currentValues = _mm256_loadu_ps(inputPtr);
374 inputPtr += 8;
375 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
376 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
377 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
378 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
379 }
380
381 // Calculate the smalles value from the remaining 8 points
382 _mm256_store_ps(minValuesBuffer, minValues);
383 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
384
385 for (uint32_t number = 0; number < 8; number++) {
386 if (minValuesBuffer[number] < min) {
387 index = minIndexesBuffer[number];
388 min = minValuesBuffer[number];
389 } else if (minValuesBuffer[number] == min) {
390 if (index > minIndexesBuffer[number])
391 index = minIndexesBuffer[number];
392 }
393 }
394
395 for (uint32_t number = quarterPoints * 8; number < num_points; number++) {
396 if (source[number] < min) {
397 index = number;
398 min = source[number];
399 }
400 }
401 target[0] = (uint32_t)index;
402}
403
404#endif /*LV_HAVE_AVX*/
405
406
407#ifdef LV_HAVE_SSE4_1
408#include <smmintrin.h>
409
410static inline void volk_32f_index_min_32u_u_sse4_1(uint32_t* target,
411 const float* source,
412 uint32_t num_points)
413{
414 const uint32_t quarterPoints = num_points / 4;
415
416 float* inputPtr = (float*)source;
417
418 __m128 indexIncrementValues = _mm_set1_ps(4);
419 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
420
421 float min = source[0];
422 float index = 0;
423 __m128 minValues = _mm_set1_ps(min);
424 __m128 minValuesIndex = _mm_setzero_ps();
425 __m128 compareResults;
426 __m128 currentValues;
427
428 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
429 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
430
431 for (uint32_t number = 0; number < quarterPoints; number++) {
432 currentValues = _mm_loadu_ps(inputPtr);
433 inputPtr += 4;
434 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
435 compareResults = _mm_cmplt_ps(currentValues, minValues);
436 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
437 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
438 }
439
440 // Calculate the smallest value from the remaining 4 points
441 _mm_store_ps(minValuesBuffer, minValues);
442 _mm_store_ps(minIndexesBuffer, minValuesIndex);
443
444 for (uint32_t number = 0; number < 4; number++) {
445 if (minValuesBuffer[number] < min) {
446 index = minIndexesBuffer[number];
447 min = minValuesBuffer[number];
448 } else if (minValuesBuffer[number] == min) {
449 if (index > minIndexesBuffer[number])
450 index = minIndexesBuffer[number];
451 }
452 }
453
454 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
455 if (source[number] < min) {
456 index = number;
457 min = source[number];
458 }
459 }
460 target[0] = (uint32_t)index;
461}
462
463#endif /*LV_HAVE_SSE4_1*/
464
465#ifdef LV_HAVE_SSE
466#include <xmmintrin.h>
467
468static inline void
469volk_32f_index_min_32u_u_sse(uint32_t* target, const float* source, uint32_t num_points)
470{
471 const uint32_t quarterPoints = num_points / 4;
472
473 float* inputPtr = (float*)source;
474
475 __m128 indexIncrementValues = _mm_set1_ps(4);
476 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
477
478 float min = source[0];
479 float index = 0;
480 __m128 minValues = _mm_set1_ps(min);
481 __m128 minValuesIndex = _mm_setzero_ps();
482 __m128 compareResults;
483 __m128 currentValues;
484
485 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
486 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
487
488 for (uint32_t number = 0; number < quarterPoints; number++) {
489 currentValues = _mm_loadu_ps(inputPtr);
490 inputPtr += 4;
491 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
492 compareResults = _mm_cmplt_ps(currentValues, minValues);
493 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
494 _mm_andnot_ps(compareResults, minValuesIndex));
495 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
496 _mm_andnot_ps(compareResults, minValues));
497 }
498
499 // Calculate the smallest value from the remaining 4 points
500 _mm_store_ps(minValuesBuffer, minValues);
501 _mm_store_ps(minIndexesBuffer, minValuesIndex);
502
503 for (uint32_t number = 0; number < 4; number++) {
504 if (minValuesBuffer[number] < min) {
505 index = minIndexesBuffer[number];
506 min = minValuesBuffer[number];
507 } else if (minValuesBuffer[number] == min) {
508 if (index > minIndexesBuffer[number])
509 index = minIndexesBuffer[number];
510 }
511 }
512
513 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
514 if (source[number] < min) {
515 index = number;
516 min = source[number];
517 }
518 }
519 target[0] = (uint32_t)index;
520}
521
522#endif /*LV_HAVE_SSE*/
523
524#endif /*INCLUDED_volk_32f_index_min_32u_u_H*/
static void volk_32f_index_min_32u_neon(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:261
static void volk_32f_index_min_32u_a_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:139
static void volk_32f_index_min_32u_u_sse(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:469
static void volk_32f_index_min_32u_generic(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:321
static void volk_32f_index_min_32u_a_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:203
static void volk_32f_index_min_32u_u_avx(uint32_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_32u.h:353
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25