Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_s32f_convert_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
68#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
69#define INCLUDED_volk_32f_s32f_convert_16i_u_H
70
71#include <inttypes.h>
72#include <limits.h>
73#include <stdio.h>
74
75#ifdef LV_HAVE_AVX2
76#include <immintrin.h>
77
78static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
79 const float* inputVector,
80 const float scalar,
81 unsigned int num_points)
82{
83 unsigned int number = 0;
84
85 const unsigned int sixteenthPoints = num_points / 16;
86
87 const float* inputVectorPtr = (const float*)inputVector;
88 int16_t* outputVectorPtr = outputVector;
89
90 float min_val = SHRT_MIN;
91 float max_val = SHRT_MAX;
92 float r;
93
94 __m256 vScalar = _mm256_set1_ps(scalar);
95 __m256 inputVal1, inputVal2;
96 __m256i intInputVal1, intInputVal2;
97 __m256 ret1, ret2;
98 __m256 vmin_val = _mm256_set1_ps(min_val);
99 __m256 vmax_val = _mm256_set1_ps(max_val);
100
101 for (; number < sixteenthPoints; number++) {
102 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
103 inputVectorPtr += 8;
104 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
105 inputVectorPtr += 8;
106
107 // Scale and clip
108 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
109 vmin_val);
110 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
111 vmin_val);
112
113 intInputVal1 = _mm256_cvtps_epi32(ret1);
114 intInputVal2 = _mm256_cvtps_epi32(ret2);
115
116 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
117 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
118
119 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
120 outputVectorPtr += 16;
121 }
122
123 number = sixteenthPoints * 16;
124 for (; number < num_points; number++) {
125 r = inputVector[number] * scalar;
126 if (r > max_val)
127 r = max_val;
128 else if (r < min_val)
129 r = min_val;
130 outputVector[number] = (int16_t)rintf(r);
131 }
132}
133#endif /* LV_HAVE_AVX2 */
134
135
136#ifdef LV_HAVE_AVX
137#include <immintrin.h>
138
139static inline void volk_32f_s32f_convert_16i_u_avx(int16_t* outputVector,
140 const float* inputVector,
141 const float scalar,
142 unsigned int num_points)
143{
144 unsigned int number = 0;
145
146 const unsigned int eighthPoints = num_points / 8;
147
148 const float* inputVectorPtr = (const float*)inputVector;
149 int16_t* outputVectorPtr = outputVector;
150
151 float min_val = SHRT_MIN;
152 float max_val = SHRT_MAX;
153 float r;
154
155 __m256 vScalar = _mm256_set1_ps(scalar);
156 __m256 inputVal, ret;
157 __m256i intInputVal;
158 __m128i intInputVal1, intInputVal2;
159 __m256 vmin_val = _mm256_set1_ps(min_val);
160 __m256 vmax_val = _mm256_set1_ps(max_val);
161
162 for (; number < eighthPoints; number++) {
163 inputVal = _mm256_loadu_ps(inputVectorPtr);
164 inputVectorPtr += 8;
165
166 // Scale and clip
167 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
168 vmin_val);
169
170 intInputVal = _mm256_cvtps_epi32(ret);
171
172 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
173 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
174
175 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
176
177 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
178 outputVectorPtr += 8;
179 }
180
181 number = eighthPoints * 8;
182 for (; number < num_points; number++) {
183 r = inputVector[number] * scalar;
184 if (r > max_val)
185 r = max_val;
186 else if (r < min_val)
187 r = min_val;
188 outputVector[number] = (int16_t)rintf(r);
189 }
190}
191#endif /* LV_HAVE_AVX */
192
193
194#ifdef LV_HAVE_SSE2
195#include <emmintrin.h>
196
197static inline void volk_32f_s32f_convert_16i_u_sse2(int16_t* outputVector,
198 const float* inputVector,
199 const float scalar,
200 unsigned int num_points)
201{
202 unsigned int number = 0;
203
204 const unsigned int eighthPoints = num_points / 8;
205
206 const float* inputVectorPtr = (const float*)inputVector;
207 int16_t* outputVectorPtr = outputVector;
208
209 float min_val = SHRT_MIN;
210 float max_val = SHRT_MAX;
211 float r;
212
213 __m128 vScalar = _mm_set_ps1(scalar);
214 __m128 inputVal1, inputVal2;
215 __m128i intInputVal1, intInputVal2;
216 __m128 ret1, ret2;
217 __m128 vmin_val = _mm_set_ps1(min_val);
218 __m128 vmax_val = _mm_set_ps1(max_val);
219
220 for (; number < eighthPoints; number++) {
221 inputVal1 = _mm_loadu_ps(inputVectorPtr);
222 inputVectorPtr += 4;
223 inputVal2 = _mm_loadu_ps(inputVectorPtr);
224 inputVectorPtr += 4;
225
226 // Scale and clip
227 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
228 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
229
230 intInputVal1 = _mm_cvtps_epi32(ret1);
231 intInputVal2 = _mm_cvtps_epi32(ret2);
232
233 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
234
235 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
236 outputVectorPtr += 8;
237 }
238
239 number = eighthPoints * 8;
240 for (; number < num_points; number++) {
241 r = inputVector[number] * scalar;
242 if (r > max_val)
243 r = max_val;
244 else if (r < min_val)
245 r = min_val;
246 outputVector[number] = (int16_t)rintf(r);
247 }
248}
249#endif /* LV_HAVE_SSE2 */
250
251
252#ifdef LV_HAVE_SSE
253#include <xmmintrin.h>
254
255static inline void volk_32f_s32f_convert_16i_u_sse(int16_t* outputVector,
256 const float* inputVector,
257 const float scalar,
258 unsigned int num_points)
259{
260 unsigned int number = 0;
261
262 const unsigned int quarterPoints = num_points / 4;
263
264 const float* inputVectorPtr = (const float*)inputVector;
265 int16_t* outputVectorPtr = outputVector;
266
267 float min_val = SHRT_MIN;
268 float max_val = SHRT_MAX;
269 float r;
270
271 __m128 vScalar = _mm_set_ps1(scalar);
272 __m128 ret;
273 __m128 vmin_val = _mm_set_ps1(min_val);
274 __m128 vmax_val = _mm_set_ps1(max_val);
275
276 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
277
278 for (; number < quarterPoints; number++) {
279 ret = _mm_loadu_ps(inputVectorPtr);
280 inputVectorPtr += 4;
281
282 // Scale and clip
283 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
284
285 _mm_store_ps(outputFloatBuffer, ret);
286 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
287 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
288 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
289 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
290 }
291
292 number = quarterPoints * 4;
293 for (; number < num_points; number++) {
294 r = inputVector[number] * scalar;
295 if (r > max_val)
296 r = max_val;
297 else if (r < min_val)
298 r = min_val;
299 outputVector[number] = (int16_t)rintf(r);
300 }
301}
302#endif /* LV_HAVE_SSE */
303
304
305#ifdef LV_HAVE_GENERIC
306
307static inline void volk_32f_s32f_convert_16i_generic(int16_t* outputVector,
308 const float* inputVector,
309 const float scalar,
310 unsigned int num_points)
311{
312 int16_t* outputVectorPtr = outputVector;
313 const float* inputVectorPtr = inputVector;
314 unsigned int number = 0;
315 float min_val = SHRT_MIN;
316 float max_val = SHRT_MAX;
317 float r;
318
319 for (number = 0; number < num_points; number++) {
320 r = *inputVectorPtr++ * scalar;
321 if (r > max_val)
322 r = max_val;
323 else if (r < min_val)
324 r = min_val;
325 *outputVectorPtr++ = (int16_t)rintf(r);
326 }
327}
328#endif /* LV_HAVE_GENERIC */
329
330
331#endif /* INCLUDED_volk_32f_s32f_convert_16i_u_H */
332#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
333#define INCLUDED_volk_32f_s32f_convert_16i_a_H
334
335#include <inttypes.h>
336#include <math.h>
337#include <stdio.h>
338#include <volk/volk_common.h>
339
340#ifdef LV_HAVE_AVX2
341#include <immintrin.h>
342
343static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
344 const float* inputVector,
345 const float scalar,
346 unsigned int num_points)
347{
348 unsigned int number = 0;
349
350 const unsigned int sixteenthPoints = num_points / 16;
351
352 const float* inputVectorPtr = (const float*)inputVector;
353 int16_t* outputVectorPtr = outputVector;
354
355 float min_val = SHRT_MIN;
356 float max_val = SHRT_MAX;
357 float r;
358
359 __m256 vScalar = _mm256_set1_ps(scalar);
360 __m256 inputVal1, inputVal2;
361 __m256i intInputVal1, intInputVal2;
362 __m256 ret1, ret2;
363 __m256 vmin_val = _mm256_set1_ps(min_val);
364 __m256 vmax_val = _mm256_set1_ps(max_val);
365
366 for (; number < sixteenthPoints; number++) {
367 inputVal1 = _mm256_load_ps(inputVectorPtr);
368 inputVectorPtr += 8;
369 inputVal2 = _mm256_load_ps(inputVectorPtr);
370 inputVectorPtr += 8;
371
372 // Scale and clip
373 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
374 vmin_val);
375 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
376 vmin_val);
377
378 intInputVal1 = _mm256_cvtps_epi32(ret1);
379 intInputVal2 = _mm256_cvtps_epi32(ret2);
380
381 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
382 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
383
384 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
385 outputVectorPtr += 16;
386 }
387
388 number = sixteenthPoints * 16;
389 for (; number < num_points; number++) {
390 r = inputVector[number] * scalar;
391 if (r > max_val)
392 r = max_val;
393 else if (r < min_val)
394 r = min_val;
395 outputVector[number] = (int16_t)rintf(r);
396 }
397}
398#endif /* LV_HAVE_AVX2 */
399
400
401#ifdef LV_HAVE_AVX
402#include <immintrin.h>
403
404static inline void volk_32f_s32f_convert_16i_a_avx(int16_t* outputVector,
405 const float* inputVector,
406 const float scalar,
407 unsigned int num_points)
408{
409 unsigned int number = 0;
410
411 const unsigned int eighthPoints = num_points / 8;
412
413 const float* inputVectorPtr = (const float*)inputVector;
414 int16_t* outputVectorPtr = outputVector;
415
416 float min_val = SHRT_MIN;
417 float max_val = SHRT_MAX;
418 float r;
419
420 __m256 vScalar = _mm256_set1_ps(scalar);
421 __m256 inputVal, ret;
422 __m256i intInputVal;
423 __m128i intInputVal1, intInputVal2;
424 __m256 vmin_val = _mm256_set1_ps(min_val);
425 __m256 vmax_val = _mm256_set1_ps(max_val);
426
427 for (; number < eighthPoints; number++) {
428 inputVal = _mm256_load_ps(inputVectorPtr);
429 inputVectorPtr += 8;
430
431 // Scale and clip
432 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
433 vmin_val);
434
435 intInputVal = _mm256_cvtps_epi32(ret);
436
437 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
438 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
439
440 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
441
442 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
443 outputVectorPtr += 8;
444 }
445
446 number = eighthPoints * 8;
447 for (; number < num_points; number++) {
448 r = inputVector[number] * scalar;
449 if (r > max_val)
450 r = max_val;
451 else if (r < min_val)
452 r = min_val;
453 outputVector[number] = (int16_t)rintf(r);
454 }
455}
456#endif /* LV_HAVE_AVX */
457
458#ifdef LV_HAVE_SSE2
459#include <emmintrin.h>
460
461static inline void volk_32f_s32f_convert_16i_a_sse2(int16_t* outputVector,
462 const float* inputVector,
463 const float scalar,
464 unsigned int num_points)
465{
466 unsigned int number = 0;
467
468 const unsigned int eighthPoints = num_points / 8;
469
470 const float* inputVectorPtr = (const float*)inputVector;
471 int16_t* outputVectorPtr = outputVector;
472
473 float min_val = SHRT_MIN;
474 float max_val = SHRT_MAX;
475 float r;
476
477 __m128 vScalar = _mm_set_ps1(scalar);
478 __m128 inputVal1, inputVal2;
479 __m128i intInputVal1, intInputVal2;
480 __m128 ret1, ret2;
481 __m128 vmin_val = _mm_set_ps1(min_val);
482 __m128 vmax_val = _mm_set_ps1(max_val);
483
484 for (; number < eighthPoints; number++) {
485 inputVal1 = _mm_load_ps(inputVectorPtr);
486 inputVectorPtr += 4;
487 inputVal2 = _mm_load_ps(inputVectorPtr);
488 inputVectorPtr += 4;
489
490 // Scale and clip
491 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
492 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
493
494 intInputVal1 = _mm_cvtps_epi32(ret1);
495 intInputVal2 = _mm_cvtps_epi32(ret2);
496
497 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
498
499 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
500 outputVectorPtr += 8;
501 }
502
503 number = eighthPoints * 8;
504 for (; number < num_points; number++) {
505 r = inputVector[number] * scalar;
506 if (r > max_val)
507 r = max_val;
508 else if (r < min_val)
509 r = min_val;
510 outputVector[number] = (int16_t)rintf(r);
511 }
512}
513#endif /* LV_HAVE_SSE2 */
514
515
516#ifdef LV_HAVE_SSE
517#include <xmmintrin.h>
518
519static inline void volk_32f_s32f_convert_16i_a_sse(int16_t* outputVector,
520 const float* inputVector,
521 const float scalar,
522 unsigned int num_points)
523{
524 unsigned int number = 0;
525
526 const unsigned int quarterPoints = num_points / 4;
527
528 const float* inputVectorPtr = (const float*)inputVector;
529 int16_t* outputVectorPtr = outputVector;
530
531 float min_val = SHRT_MIN;
532 float max_val = SHRT_MAX;
533 float r;
534
535 __m128 vScalar = _mm_set_ps1(scalar);
536 __m128 ret;
537 __m128 vmin_val = _mm_set_ps1(min_val);
538 __m128 vmax_val = _mm_set_ps1(max_val);
539
540 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
541
542 for (; number < quarterPoints; number++) {
543 ret = _mm_load_ps(inputVectorPtr);
544 inputVectorPtr += 4;
545
546 // Scale and clip
547 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
548
549 _mm_store_ps(outputFloatBuffer, ret);
550 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
551 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
552 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
553 *outputVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
554 }
555
556 number = quarterPoints * 4;
557 for (; number < num_points; number++) {
558 r = inputVector[number] * scalar;
559 if (r > max_val)
560 r = max_val;
561 else if (r < min_val)
562 r = min_val;
563 outputVector[number] = (int16_t)rintf(r);
564 }
565}
566#endif /* LV_HAVE_SSE */
567
568
569#ifdef LV_HAVE_GENERIC
570
571static inline void volk_32f_s32f_convert_16i_a_generic(int16_t* outputVector,
572 const float* inputVector,
573 const float scalar,
574 unsigned int num_points)
575{
576 int16_t* outputVectorPtr = outputVector;
577 const float* inputVectorPtr = inputVector;
578 unsigned int number = 0;
579 float min_val = SHRT_MIN;
580 float max_val = SHRT_MAX;
581 float r;
582
583 for (number = 0; number < num_points; number++) {
584 r = *inputVectorPtr++ * scalar;
585 if (r < min_val)
586 r = min_val;
587 else if (r > max_val)
588 r = max_val;
589 *outputVectorPtr++ = (int16_t)rintf(r);
590 }
591}
592#endif /* LV_HAVE_GENERIC */
593
594#endif /* INCLUDED_volk_32f_s32f_convert_16i_a_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:461
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:255
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:404
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:197
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:571
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:139
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:307
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:519
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56