Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_index_max_16u.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
71#ifndef INCLUDED_volk_32f_index_max_16u_a_H
72#define INCLUDED_volk_32f_index_max_16u_a_H
73
74#include <inttypes.h>
75#include <limits.h>
76#include <stdio.h>
77#include <volk/volk_common.h>
78
79#ifdef LV_HAVE_AVX
80#include <immintrin.h>
81
82static inline void
83volk_32f_index_max_16u_a_avx(uint16_t* target, const float* src0, uint32_t num_points)
84{
85 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
86
87 uint32_t number = 0;
88 const uint32_t eighthPoints = num_points / 8;
89
90 float* inputPtr = (float*)src0;
91
92 __m256 indexIncrementValues = _mm256_set1_ps(8);
93 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
94
95 float max = src0[0];
96 float index = 0;
97 __m256 maxValues = _mm256_set1_ps(max);
98 __m256 maxValuesIndex = _mm256_setzero_ps();
99 __m256 compareResults;
100 __m256 currentValues;
101
102 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
103 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
104
105 for (; number < eighthPoints; number++) {
106
107 currentValues = _mm256_load_ps(inputPtr);
108 inputPtr += 8;
109 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
110
111 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
112
113 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
114 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
115 }
116
117 // Calculate the largest value from the remaining 4 points
118 _mm256_store_ps(maxValuesBuffer, maxValues);
119 _mm256_store_ps(maxIndexesBuffer, maxValuesIndex);
120
121 for (number = 0; number < 8; number++) {
122 if (maxValuesBuffer[number] > max) {
123 index = maxIndexesBuffer[number];
124 max = maxValuesBuffer[number];
125 } else if (maxValuesBuffer[number] == max) {
126 if (index > maxIndexesBuffer[number])
127 index = maxIndexesBuffer[number];
128 }
129 }
130
131 number = eighthPoints * 8;
132 for (; number < num_points; number++) {
133 if (src0[number] > max) {
134 index = number;
135 max = src0[number];
136 }
137 }
138 target[0] = (uint16_t)index;
139}
140
141#endif /*LV_HAVE_AVX*/
142
143#ifdef LV_HAVE_SSE4_1
144#include <smmintrin.h>
145
146static inline void
147volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0, uint32_t num_points)
148{
149 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
150
151 uint32_t number = 0;
152 const uint32_t quarterPoints = num_points / 4;
153
154 float* inputPtr = (float*)src0;
155
156 __m128 indexIncrementValues = _mm_set1_ps(4);
157 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
158
159 float max = src0[0];
160 float index = 0;
161 __m128 maxValues = _mm_set1_ps(max);
162 __m128 maxValuesIndex = _mm_setzero_ps();
163 __m128 compareResults;
164 __m128 currentValues;
165
166 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
167 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
168
169 for (; number < quarterPoints; number++) {
170
171 currentValues = _mm_load_ps(inputPtr);
172 inputPtr += 4;
173 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
174
175 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
176
177 maxValuesIndex = _mm_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
178 maxValues = _mm_blendv_ps(maxValues, currentValues, compareResults);
179 }
180
181 // Calculate the largest value from the remaining 4 points
182 _mm_store_ps(maxValuesBuffer, maxValues);
183 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
184
185 for (number = 0; number < 4; number++) {
186 if (maxValuesBuffer[number] > max) {
187 index = maxIndexesBuffer[number];
188 max = maxValuesBuffer[number];
189 } else if (maxValuesBuffer[number] == max) {
190 if (index > maxIndexesBuffer[number])
191 index = maxIndexesBuffer[number];
192 }
193 }
194
195 number = quarterPoints * 4;
196 for (; number < num_points; number++) {
197 if (src0[number] > max) {
198 index = number;
199 max = src0[number];
200 }
201 }
202 target[0] = (uint16_t)index;
203}
204
205#endif /*LV_HAVE_SSE4_1*/
206
207
208#ifdef LV_HAVE_SSE
209
210#include <xmmintrin.h>
211
212static inline void
213volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0, uint32_t num_points)
214{
215 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
216
217 uint32_t number = 0;
218 const uint32_t quarterPoints = num_points / 4;
219
220 float* inputPtr = (float*)src0;
221
222 __m128 indexIncrementValues = _mm_set1_ps(4);
223 __m128 currentIndexes = _mm_set_ps(-1, -2, -3, -4);
224
225 float max = src0[0];
226 float index = 0;
227 __m128 maxValues = _mm_set1_ps(max);
228 __m128 maxValuesIndex = _mm_setzero_ps();
229 __m128 compareResults;
230 __m128 currentValues;
231
232 __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
233 __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
234
235 for (; number < quarterPoints; number++) {
236
237 currentValues = _mm_load_ps(inputPtr);
238 inputPtr += 4;
239 currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
240
241 compareResults = _mm_cmpgt_ps(currentValues, maxValues);
242
243 maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, currentIndexes),
244 _mm_andnot_ps(compareResults, maxValuesIndex));
245 maxValues = _mm_or_ps(_mm_and_ps(compareResults, currentValues),
246 _mm_andnot_ps(compareResults, maxValues));
247 }
248
249 // Calculate the largest value from the remaining 4 points
250 _mm_store_ps(maxValuesBuffer, maxValues);
251 _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
252
253 for (number = 0; number < 4; number++) {
254 if (maxValuesBuffer[number] > max) {
255 index = maxIndexesBuffer[number];
256 max = maxValuesBuffer[number];
257 } else if (maxValuesBuffer[number] == max) {
258 if (index > maxIndexesBuffer[number])
259 index = maxIndexesBuffer[number];
260 }
261 }
262
263 number = quarterPoints * 4;
264 for (; number < num_points; number++) {
265 if (src0[number] > max) {
266 index = number;
267 max = src0[number];
268 }
269 }
270 target[0] = (uint16_t)index;
271}
272
273#endif /*LV_HAVE_SSE*/
274
275
276#ifdef LV_HAVE_GENERIC
277
/*
 * Finds the index of the largest value in src0 (portable scalar version).
 *
 * target     - receives the index of the maximum in target[0]
 * src0       - input vector of floats
 * num_points - number of floats in src0; only the first USHRT_MAX points
 *              are examined because the result is a 16-bit index
 *
 * When several points share the maximum value, the lowest index wins.
 * When num_points is 0, target[0] is set to 0 and src0 is not read.
 */
static inline void
volk_32f_index_max_16u_generic(uint16_t* target, const float* src0, uint32_t num_points)
{
    /* Empty input: report index 0 without dereferencing src0. */
    if (num_points == 0) {
        target[0] = 0;
        return;
    }

    num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;

    float max = src0[0];
    uint16_t index = 0;

    for (uint32_t i = 1; i < num_points; ++i) {
        /* Strictly greater, so the first occurrence of the maximum is kept. */
        if (src0[i] > max) {
            index = (uint16_t)i; /* i < USHRT_MAX after the clamp above */
            max = src0[i];
        }
    }
    target[0] = index;
}
296
297#endif /*LV_HAVE_GENERIC*/
298
299
300#endif /*INCLUDED_volk_32f_index_max_16u_a_H*/
301
302
303#ifndef INCLUDED_volk_32f_index_max_16u_u_H
304#define INCLUDED_volk_32f_index_max_16u_u_H
305
306#include <inttypes.h>
307#include <limits.h>
308#include <stdio.h>
309#include <volk/volk_common.h>
310
311#ifdef LV_HAVE_AVX
312#include <immintrin.h>
313
314static inline void
315volk_32f_index_max_16u_u_avx(uint16_t* target, const float* src0, uint32_t num_points)
316{
317 num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
318
319 uint32_t number = 0;
320 const uint32_t eighthPoints = num_points / 8;
321
322 float* inputPtr = (float*)src0;
323
324 __m256 indexIncrementValues = _mm256_set1_ps(8);
325 __m256 currentIndexes = _mm256_set_ps(-1, -2, -3, -4, -5, -6, -7, -8);
326
327 float max = src0[0];
328 float index = 0;
329 __m256 maxValues = _mm256_set1_ps(max);
330 __m256 maxValuesIndex = _mm256_setzero_ps();
331 __m256 compareResults;
332 __m256 currentValues;
333
334 __VOLK_ATTR_ALIGNED(32) float maxValuesBuffer[8];
335 __VOLK_ATTR_ALIGNED(32) float maxIndexesBuffer[8];
336
337 for (; number < eighthPoints; number++) {
338
339 currentValues = _mm256_loadu_ps(inputPtr);
340 inputPtr += 8;
341 currentIndexes = _mm256_add_ps(currentIndexes, indexIncrementValues);
342
343 compareResults = _mm256_cmp_ps(currentValues, maxValues, _CMP_GT_OS);
344
345 maxValuesIndex = _mm256_blendv_ps(maxValuesIndex, currentIndexes, compareResults);
346 maxValues = _mm256_blendv_ps(maxValues, currentValues, compareResults);
347 }
348
349 // Calculate the largest value from the remaining 4 points
350 _mm256_storeu_ps(maxValuesBuffer, maxValues);
351 _mm256_storeu_ps(maxIndexesBuffer, maxValuesIndex);
352
353 for (number = 0; number < 8; number++) {
354 if (maxValuesBuffer[number] > max) {
355 index = maxIndexesBuffer[number];
356 max = maxValuesBuffer[number];
357 } else if (maxValuesBuffer[number] == max) {
358 if (index > maxIndexesBuffer[number])
359 index = maxIndexesBuffer[number];
360 }
361 }
362
363 number = eighthPoints * 8;
364 for (; number < num_points; number++) {
365 if (src0[number] > max) {
366 index = number;
367 max = src0[number];
368 }
369 }
370 target[0] = (uint16_t)index;
371}
372
373#endif /*LV_HAVE_AVX*/
374
375#endif /*INCLUDED_volk_32f_index_max_16u_u_H*/
static void volk_32f_index_max_16u_u_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:315
static void volk_32f_index_max_16u_a_avx(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:83
static void volk_32f_index_max_16u_generic(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:279
static void volk_32f_index_max_16u_a_sse(uint16_t *target, const float *src0, uint32_t num_points)
Definition: volk_32f_index_max_16u.h:213
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
for i
Definition: volk_config_fixed.tmpl.h:25