Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_index_min_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of VOLK
6 *
7 * VOLK is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * VOLK is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
71#ifndef INCLUDED_volk_32f_index_min_16u_a_H
72#define INCLUDED_volk_32f_index_min_16u_a_H
73
74#include <inttypes.h>
75#include <limits.h>
76#include <stdio.h>
77#include <volk/volk_common.h>
78
79#ifdef LV_HAVE_AVX
80#include <immintrin.h>
81
82static inline void
83volk_32f_index_min_16u_a_avx(uint16_t* target, const float* source, uint32_t num_points)
84{
85 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
86 const uint32_t eighthPoints = num_points / 8;
87
88 float* inputPtr = (float*)source;
89
90 __m256 indexIncrementValues = _mm256_set1_ps(8);
91 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
92
93 float min = source[0];
94 float index = 0;
95 __m256 minValues = _mm256_set1_ps(min);
96 __m256 minValuesIndex = _mm256_setzero_ps();
97 __m256 compareResults;
98 __m256 currentValues;
99
100 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
101 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
102
103 for (uint32_t number = 0; number < eighthPoints; number++) {
104
105 currentValues = _mm256_load_ps(inputPtr);
106 inputPtr += 8;
107 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
108
109 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
110
111 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
112 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
113 }
114
115 // Calculate the smallest value from the remaining 4 points
116 _mm256_store_ps(minValuesBuffer, minValues);
117 _mm256_store_ps(minIndexesBuffer, minValuesIndex);
118
119 for (uint32_t number = 0; number < 8; number++) {
120 if (minValuesBuffer[number] < min) {
121 index = minIndexesBuffer[number];
122 min = minValuesBuffer[number];
123 } else if (minValuesBuffer[number] == min) {
124 if (index > minIndexesBuffer[number])
125 index = minIndexesBuffer[number];
126 }
127 }
128
129 for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
130 if (source[number] < min) {
131 index = number;
132 min = source[number];
133 }
134 }
135 target[0] = (uint16_t)index;
136}
137
138#endif /*LV_HAVE_AVX*/
139
140#ifdef LV_HAVE_SSE4_1
141#include <smmintrin.h>
142
143static inline void volk_32f_index_min_16u_a_sse4_1(uint16_t* target,
144 const float* source,
145 uint32_t num_points)
146{
147 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
148 const uint32_t quarterPoints = num_points / 4;
149
150 float* inputPtr = (float*)source;
151
152 __m128 indexIncrementValues = _mm_set1_ps(4);
153 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
154
155 float min = source[0];
156 float index = 0;
157 __m128 minValues = _mm_set1_ps(min);
158 __m128 minValuesIndex = _mm_setzero_ps();
159 __m128 compareResults;
160 __m128 currentValues;
161
162 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
163 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
164
165 for (uint32_t number = 0; number < quarterPoints; number++) {
166
167 currentValues = _mm_load_ps(inputPtr);
168 inputPtr += 4;
169 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
170
171 compareResults = _mm_cmplt_ps(currentValues, minValues);
172
173 minValuesIndex = _mm_blendv_ps(minValuesIndex, currentIndexes, compareResults);
174 minValues = _mm_blendv_ps(minValues, currentValues, compareResults);
175 }
176
177 // Calculate the smallest value from the remaining 4 points
178 _mm_store_ps(minValuesBuffer, minValues);
179 _mm_store_ps(minIndexesBuffer, minValuesIndex);
180
181 for (uint32_t number = 0; number < 4; number++) {
182 if (minValuesBuffer[number] < min) {
183 index = minIndexesBuffer[number];
184 min = minValuesBuffer[number];
185 } else if (minValuesBuffer[number] == min) {
186 if (index > minIndexesBuffer[number])
187 index = minIndexesBuffer[number];
188 }
189 }
190
191 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
192 if (source[number] < min) {
193 index = number;
194 min = source[number];
195 }
196 }
197 target[0] = (uint16_t)index;
198}
199
200#endif /*LV_HAVE_SSE4_1*/
201
202
203#ifdef LV_HAVE_SSE
204
205#include <xmmintrin.h>
206
207static inline void
208volk_32f_index_min_16u_a_sse(uint16_t* target, const float* source, uint32_t num_points)
209{
210 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
211 const uint32_t quarterPoints = num_points / 4;
212
213 float* inputPtr = (float*)source;
214
215 __m128 indexIncrementValues = _mm_set1_ps(4);
216 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
217
218 float min = source[0];
219 float index = 0;
220 __m128 minValues = _mm_set1_ps(min);
221 __m128 minValuesIndex = _mm_setzero_ps();
222 __m128 compareResults;
223 __m128 currentValues;
224
225 __VOLK_ATTR_ALIGNED(16) float minValuesBuffer[4];
226 __VOLK_ATTR_ALIGNED(16) float minIndexesBuffer[4];
227
228 for (uint32_t number = 0; number < quarterPoints; number++) {
229
230 currentValues = _mm_load_ps(inputPtr);
231 inputPtr += 4;
232 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
233
234 compareResults = _mm_cmplt_ps(currentValues, minValues);
235
236 minValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
237 _mm_andnot_ps(compareResults, minValuesIndex));
238 minValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
239 _mm_andnot_ps(compareResults, minValues));
240 }
241
242 // Calculate the smallest value from the remaining 4 points
243 _mm_store_ps(minValuesBuffer, minValues);
244 _mm_store_ps(minIndexesBuffer, minValuesIndex);
245
246 for (uint32_t number = 0; number < 4; number++) {
247 if (minValuesBuffer[number] < min) {
248 index = minIndexesBuffer[number];
249 min = minValuesBuffer[number];
250 } else if (minValuesBuffer[number] == min) {
251 if (index > minIndexesBuffer[number])
252 index = minIndexesBuffer[number];
253 }
254 }
255
256 for (uint32_t number = quarterPoints * 4; number < num_points; number++) {
257 if (source[number] < min) {
258 index = number;
259 min = source[number];
260 }
261 }
262 target[0] = (uint16_t)index;
263}
264
265#endif /*LV_HAVE_SSE*/
266
267
268#ifdef LV_HAVE_GENERIC
269
/*!
 * \brief Writes the index of the smallest value in \p source to \p target
 *        (portable scalar implementation).
 *
 * Only the first USHRT_MAX points are examined so that the result fits in a
 * uint16_t. When several elements share the minimum value, the lowest index is
 * reported. For num_points == 0 the source is not read and index 0 is written.
 *
 * \param target     Receives the index of the minimum in target[0].
 * \param source     Input floats.
 * \param num_points Number of floats available in \p source.
 */
static inline void
volk_32f_index_min_16u_generic(uint16_t* target, const float* source, uint32_t num_points)
{
    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
    if (num_points == 0) {
        /* Nothing to search: do not dereference source. */
        target[0] = 0;
        return;
    }

    float min = source[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        /* Strict less-than keeps the first occurrence of the minimum. */
        if (source[i] < min) {
            /* i < USHRT_MAX after the clamp above, so the cast is lossless. */
            index = (uint16_t)i;
            min = source[i];
        }
    }
    target[0] = index;
}
286
287#endif /*LV_HAVE_GENERIC*/
288
289
290#endif /*INCLUDED_volk_32f_index_min_16u_a_H*/
291
292
293#ifndef INCLUDED_volk_32f_index_min_16u_u_H
294#define INCLUDED_volk_32f_index_min_16u_u_H
295
296#include <inttypes.h>
297#include <limits.h>
298#include <stdio.h>
299#include <volk/volk_common.h>
300
301#ifdef LV_HAVE_AVX
302#include <immintrin.h>
303
304static inline void
305volk_32f_index_min_16u_u_avx(uint16_t* target, const float* source, uint32_t num_points)
306{
307 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
308 const uint32_t eighthPoints = num_points / 8;
309
310 float* inputPtr = (float*)source;
311
312 __m256 indexIncrementValues = _mm256_set1_ps(8);
313 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
314
315 float min = source[0];
316 float index = 0;
317 __m256 minValues = _mm256_set1_ps(min);
318 __m256 minValuesIndex = _mm256_setzero_ps();
319 __m256 compareResults;
320 __m256 currentValues;
321
322 __VOLK_ATTR_ALIGNED(32) float minValuesBuffer[8];
323 __VOLK_ATTR_ALIGNED(32) float minIndexesBuffer[8];
324
325 for (uint32_t number = 0; number < eighthPoints; number++) {
326
327 currentValues = _mm256_loadu_ps(inputPtr);
328 inputPtr += 8;
329 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
330
331 compareResults = _mm256_cmp_ps(currentValues, minValues, _CMP_LT_OS);
332
333 minValuesIndex = _mm256_blendv_ps(minValuesIndex, currentIndexes, compareResults);
334 minValues = _mm256_blendv_ps(minValues, currentValues, compareResults);
335 }
336
337 // Calculate the smallest value from the remaining 4 points
338 _mm256_storeu_ps(minValuesBuffer, minValues);
339 _mm256_storeu_ps(minIndexesBuffer, minValuesIndex);
340
341 for (uint32_t number = 0; number < 8; number++) {
342 if (minValuesBuffer[number] < min) {
343 index = minIndexesBuffer[number];
344 min = minValuesBuffer[number];
345 } else if (minValuesBuffer[number] == min) {
346 if (index > minIndexesBuffer[number])
347 index = minIndexesBuffer[number];
348 }
349 }
350
351 for (uint32_t number = eighthPoints * 8; number < num_points; number++) {
352 if (source[number] < min) {
353 index = number;
354 min = source[number];
355 }
356 }
357 target[0] = (uint16_t)index;
358}
359
360#endif /*LV_HAVE_AVX*/
361
362#endif /*INCLUDED_volk_32f_index_min_16u_u_H*/
static void volk_32f_index_min_16u_a_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:83
static void volk_32f_index_min_16u_generic(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:271
static void volk_32f_index_min_16u_a_sse(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:208
static void volk_32f_index_min_16u_u_avx(uint16_t *target, const float *source, uint32_t num_points)
Definition: volk_32f_index_min_16u.h:305
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25