68#ifndef INCLUDED_volk_32f_s32f_convert_16i_u_H
69#define INCLUDED_volk_32f_s32f_convert_16i_u_H
/*
 * Unaligned AVX2 kernel: multiplies each float of inputVector by `scalar`,
 * limits the product against SHRT_MAX / SHRT_MIN, converts to integer and
 * writes int16_t results to outputVector, 16 samples per vector iteration.
 *
 * NOTE(review): this text is a line-numbered listing with gaps in the
 * embedded numbering (e.g. 79 -> 81, 102 -> 104, 125 -> 128). The `scalar`
 * parameter, the braces, the declarations of ret1/ret2/r and several
 * statements are not visible here - confirm against the complete file.
 */
78static inline void volk_32f_s32f_convert_16i_u_avx2(int16_t* outputVector,
79 const float* inputVector,
81 unsigned int num_points)
83 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
85 const unsigned int sixteenthPoints = num_points / 16;
87 const float* inputVectorPtr = (
const float*)inputVector;
88 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
90 float min_val = SHRT_MIN;
91 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
94 __m256 vScalar = _mm256_set1_ps(scalar);
95 __m256 inputVal1, inputVal2;
96 __m256i intInputVal1, intInputVal2;
98 __m256 vmin_val = _mm256_set1_ps(min_val);
99 __m256 vmax_val = _mm256_set1_ps(max_val);
101 for (; number < sixteenthPoints; number++) {
/*
 * Two unaligned 8-float loads.
 * NOTE(review): no pointer advance is visible between or after the loads
 * (listing lines 103 and 105-107 are missing) - presumably inputVectorPtr is
 * advanced by 8 after each load; confirm.
 */
102 inputVal1 = _mm256_loadu_ps(inputVectorPtr);
104 inputVal2 = _mm256_loadu_ps(inputVectorPtr);
/*
 * Scale, cap at vmax_val, then take a lane-wise maximum. The second operand
 * of _mm256_max_ps is on a continuation line that is not shown - presumably
 * vmin_val; confirm.
 */
108 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
110 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
/* Convert the limited floats to packed 32-bit integers. */
113 intInputVal1 = _mm256_cvtps_epi32(ret1);
114 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * _mm256_packs_epi32 narrows to 16 bits with signed saturation, but works
 * per 128-bit lane, so the 64-bit quarters come out as a0,b0,a1,b1. The
 * permute selector 0b11011000 picks quarters 0,2,1,3 to restore input order.
 */
116 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
117 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Unaligned 256-bit store of 16 int16_t results. */
119 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
120 outputVectorPtr += 16;
/* Scalar tail: handle the remaining num_points % 16 samples one at a time. */
123 number = sixteenthPoints * 16;
124 for (; number < num_points; number++) {
125 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing from the listing - presumably r is
 * clamped to [min_val, max_val]; confirm.
 */
128 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
130 outputVector[number] = (int16_t)
rintf(r);
137#include <immintrin.h>
/*
 * Unaligned AVX kernel fragment: scales floats by `scalar`, limits them
 * against SHRT_MAX / SHRT_MIN, converts to integer and stores int16_t
 * results, 8 samples per vector iteration.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_u_avx at line 139, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * intInputVal and r are also missing from the listing.
 */
140 const float* inputVector,
142 unsigned int num_points)
144 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
146 const unsigned int eighthPoints = num_points / 8;
148 const float* inputVectorPtr = (
const float*)inputVector;
149 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
151 float min_val = SHRT_MIN;
152 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
155 __m256 vScalar = _mm256_set1_ps(scalar);
156 __m256 inputVal, ret;
158 __m128i intInputVal1, intInputVal2;
159 __m256 vmin_val = _mm256_set1_ps(min_val);
160 __m256 vmax_val = _mm256_set1_ps(max_val);
162 for (; number < eighthPoints; number++) {
/*
 * Unaligned 8-float load.
 * NOTE(review): the input pointer advance is not visible (listing lines
 * 164-166 are missing) - presumably inputVectorPtr += 8; confirm.
 */
163 inputVal = _mm256_loadu_ps(inputVectorPtr);
/*
 * Scale, cap at vmax_val, then take a lane-wise maximum; the second operand
 * of _mm256_max_ps is on a hidden continuation line - presumably vmin_val.
 */
167 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
/* Convert the limited floats to packed 32-bit integers. */
170 intInputVal = _mm256_cvtps_epi32(ret);
/* Split into low and high 128-bit halves so they can be packed with SSE. */
172 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
173 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
/* Narrow 8 x int32 to 8 x int16 with signed saturation (low half first). */
175 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Unaligned 128-bit store of 8 int16_t results. */
177 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
178 outputVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 samples one at a time. */
181 number = eighthPoints * 8;
182 for (; number < num_points; number++) {
183 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
186 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
188 outputVector[number] = (int16_t)
rintf(r);
195#include <emmintrin.h>
/*
 * Unaligned SSE2 kernel fragment: scales floats by `scalar`, clamps them to
 * [SHRT_MIN, SHRT_MAX], converts to integer and stores int16_t results,
 * 8 samples per vector iteration (two 4-float loads).
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_u_sse2 at line 197, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * ret1/ret2/r are also missing from the listing.
 */
198 const float* inputVector,
200 unsigned int num_points)
202 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
204 const unsigned int eighthPoints = num_points / 8;
206 const float* inputVectorPtr = (
const float*)inputVector;
207 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
209 float min_val = SHRT_MIN;
210 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all four float lanes. */
213 __m128 vScalar = _mm_set_ps1(scalar);
214 __m128 inputVal1, inputVal2;
215 __m128i intInputVal1, intInputVal2;
217 __m128 vmin_val = _mm_set_ps1(min_val);
218 __m128 vmax_val = _mm_set_ps1(max_val);
220 for (; number < eighthPoints; number++) {
/*
 * Two unaligned 4-float loads.
 * NOTE(review): no pointer advance is visible between or after the loads
 * (listing lines 222 and 224-226 are missing) - presumably inputVectorPtr is
 * advanced by 4 after each load; confirm.
 */
221 inputVal1 = _mm_loadu_ps(inputVectorPtr);
223 inputVal2 = _mm_loadu_ps(inputVectorPtr);
/* Scale, cap at vmax_val, then raise to at least vmin_val. */
227 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
228 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
/* Convert the clamped floats to packed 32-bit integers. */
230 intInputVal1 = _mm_cvtps_epi32(ret1);
231 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow 2 x 4 int32 to 8 x int16 with signed saturation. */
233 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Unaligned 128-bit store of 8 int16_t results. */
235 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
236 outputVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 samples one at a time. */
239 number = eighthPoints * 8;
240 for (; number < num_points; number++) {
241 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
244 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
246 outputVector[number] = (int16_t)
rintf(r);
253#include <xmmintrin.h>
/*
 * Unaligned SSE kernel fragment: scales and clamps 4 floats at a time with
 * SSE, then converts each lane to int16_t in scalar code via rintf.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_u_sse at line 255, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * ret, r and outputFloatBuffer are also missing from the listing.
 */
256 const float* inputVector,
258 unsigned int num_points)
260 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
262 const unsigned int quarterPoints = num_points / 4;
264 const float* inputVectorPtr = (
const float*)inputVector;
265 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
267 float min_val = SHRT_MIN;
268 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all four float lanes. */
271 __m128 vScalar = _mm_set_ps1(scalar);
273 __m128 vmin_val = _mm_set_ps1(min_val);
274 __m128 vmax_val = _mm_set_ps1(max_val);
278 for (; number < quarterPoints; number++) {
/*
 * Unaligned 4-float load.
 * NOTE(review): the input pointer advance is not visible (listing lines
 * 280-282 are missing) - presumably inputVectorPtr += 4; confirm.
 */
279 ret = _mm_loadu_ps(inputVectorPtr);
/* Scale, cap at vmax_val, then raise to at least vmin_val. */
283 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/*
 * _mm_store_ps is an aligned store, so outputFloatBuffer must be 16-byte
 * aligned; its declaration is not visible here - confirm it carries an
 * alignment attribute.
 */
285 _mm_store_ps(outputFloatBuffer, ret);
/* Convert the four clamped lanes one by one and append them to the output. */
286 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
287 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
288 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
289 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: handle the remaining num_points % 4 samples one at a time. */
292 number = quarterPoints * 4;
293 for (; number < num_points; number++) {
294 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
297 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
299 outputVector[number] = (int16_t)
rintf(r);
305#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel fragment: for each of num_points samples, multiplies
 * the input float by `scalar`, applies a limit test, and stores
 * (int16_t)rintf(r) to the output.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_generic at line 307, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declaration of
 * r are also missing from the listing.
 */
308 const float* inputVector,
310 unsigned int num_points)
/* Walking pointers over the output and input arrays. */
312 int16_t* outputVectorPtr = outputVector;
313 const float* inputVectorPtr = inputVector;
314 unsigned int number = 0;
/* int16_t limits held as floats so they can be compared with the products. */
315 float min_val = SHRT_MIN;
316 float max_val = SHRT_MAX;
319 for (number = 0; number < num_points; number++) {
/* Read the next input sample, advance the input pointer, and scale. */
320 r = *inputVectorPtr++ * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
323 else if (r < min_val)
/* Convert to an integral value, narrow to int16_t, advance the output. */
325 *outputVectorPtr++ = (int16_t)
rintf(r);
332#ifndef INCLUDED_volk_32f_s32f_convert_16i_a_H
333#define INCLUDED_volk_32f_s32f_convert_16i_a_H
341#include <immintrin.h>
/*
 * Aligned AVX2 kernel: multiplies each float of inputVector by `scalar`,
 * limits the product against SHRT_MAX / SHRT_MIN, converts to integer and
 * writes int16_t results to outputVector, 16 samples per vector iteration.
 * Uses aligned load/store intrinsics, so both buffers must be 32-byte
 * aligned.
 *
 * NOTE(review): this text is a line-numbered listing with gaps in the
 * embedded numbering (e.g. 344 -> 346, 367 -> 369, 390 -> 393). The `scalar`
 * parameter, the braces, the declarations of ret1/ret2/r and several
 * statements are not visible here - confirm against the complete file.
 */
343static inline void volk_32f_s32f_convert_16i_a_avx2(int16_t* outputVector,
344 const float* inputVector,
346 unsigned int num_points)
348 unsigned int number = 0;
/* Number of full 16-sample groups handled by the vector loop. */
350 const unsigned int sixteenthPoints = num_points / 16;
352 const float* inputVectorPtr = (
const float*)inputVector;
353 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
355 float min_val = SHRT_MIN;
356 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
359 __m256 vScalar = _mm256_set1_ps(scalar);
360 __m256 inputVal1, inputVal2;
361 __m256i intInputVal1, intInputVal2;
363 __m256 vmin_val = _mm256_set1_ps(min_val);
364 __m256 vmax_val = _mm256_set1_ps(max_val);
366 for (; number < sixteenthPoints; number++) {
/*
 * Two aligned 8-float loads (address must be 32-byte aligned).
 * NOTE(review): no pointer advance is visible between or after the loads
 * (listing lines 368 and 370-372 are missing) - presumably inputVectorPtr is
 * advanced by 8 after each load; confirm.
 */
367 inputVal1 = _mm256_load_ps(inputVectorPtr);
369 inputVal2 = _mm256_load_ps(inputVectorPtr);
/*
 * Scale, cap at vmax_val, then take a lane-wise maximum. The second operand
 * of _mm256_max_ps is on a continuation line that is not shown - presumably
 * vmin_val; confirm.
 */
373 ret1 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal1, vScalar), vmax_val),
375 ret2 = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal2, vScalar), vmax_val),
/* Convert the limited floats to packed 32-bit integers. */
378 intInputVal1 = _mm256_cvtps_epi32(ret1);
379 intInputVal2 = _mm256_cvtps_epi32(ret2);
/*
 * _mm256_packs_epi32 narrows to 16 bits with signed saturation, but works
 * per 128-bit lane, so the 64-bit quarters come out as a0,b0,a1,b1. The
 * permute selector 0b11011000 picks quarters 0,2,1,3 to restore input order.
 */
381 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
382 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0b11011000);
/* Aligned 256-bit store of 16 int16_t results (32-byte aligned address). */
384 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
385 outputVectorPtr += 16;
/* Scalar tail: handle the remaining num_points % 16 samples one at a time. */
388 number = sixteenthPoints * 16;
389 for (; number < num_points; number++) {
390 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing from the listing - presumably r is
 * clamped to [min_val, max_val]; confirm.
 */
393 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
395 outputVector[number] = (int16_t)
rintf(r);
402#include <immintrin.h>
/*
 * Aligned AVX kernel fragment: scales floats by `scalar`, limits them
 * against SHRT_MAX / SHRT_MIN, converts to integer and stores int16_t
 * results, 8 samples per vector iteration. The load needs a 32-byte aligned
 * input address and the store a 16-byte aligned output address.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_a_avx at line 404, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * intInputVal and r are also missing from the listing.
 */
405 const float* inputVector,
407 unsigned int num_points)
409 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
411 const unsigned int eighthPoints = num_points / 8;
413 const float* inputVectorPtr = (
const float*)inputVector;
414 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
416 float min_val = SHRT_MIN;
417 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all eight float lanes. */
420 __m256 vScalar = _mm256_set1_ps(scalar);
421 __m256 inputVal, ret;
423 __m128i intInputVal1, intInputVal2;
424 __m256 vmin_val = _mm256_set1_ps(min_val);
425 __m256 vmax_val = _mm256_set1_ps(max_val);
427 for (; number < eighthPoints; number++) {
/*
 * Aligned 8-float load (address must be 32-byte aligned).
 * NOTE(review): the input pointer advance is not visible (listing lines
 * 429-431 are missing) - presumably inputVectorPtr += 8; confirm.
 */
428 inputVal = _mm256_load_ps(inputVectorPtr);
/*
 * Scale, cap at vmax_val, then take a lane-wise maximum; the second operand
 * of _mm256_max_ps is on a hidden continuation line - presumably vmin_val.
 */
432 ret = _mm256_max_ps(_mm256_min_ps(_mm256_mul_ps(inputVal, vScalar), vmax_val),
/* Convert the limited floats to packed 32-bit integers. */
435 intInputVal = _mm256_cvtps_epi32(ret);
/* Split into low and high 128-bit halves so they can be packed with SSE. */
437 intInputVal1 = _mm256_extractf128_si256(intInputVal, 0);
438 intInputVal2 = _mm256_extractf128_si256(intInputVal, 1);
/* Narrow 8 x int32 to 8 x int16 with signed saturation (low half first). */
440 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Aligned 128-bit store of 8 int16_t results (16-byte aligned address). */
442 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
443 outputVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 samples one at a time. */
446 number = eighthPoints * 8;
447 for (; number < num_points; number++) {
448 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
451 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
453 outputVector[number] = (int16_t)
rintf(r);
459#include <emmintrin.h>
/*
 * Aligned SSE2 kernel fragment: scales floats by `scalar`, clamps them to
 * [SHRT_MIN, SHRT_MAX], converts to integer and stores int16_t results,
 * 8 samples per vector iteration (two 4-float loads). Loads and the store
 * use aligned intrinsics, so both buffers must be 16-byte aligned.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_a_sse2 at line 461, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * ret1/ret2/r are also missing from the listing.
 */
462 const float* inputVector,
464 unsigned int num_points)
466 unsigned int number = 0;
/* Number of full 8-sample groups handled by the vector loop. */
468 const unsigned int eighthPoints = num_points / 8;
470 const float* inputVectorPtr = (
const float*)inputVector;
471 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
473 float min_val = SHRT_MIN;
474 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all four float lanes. */
477 __m128 vScalar = _mm_set_ps1(scalar);
478 __m128 inputVal1, inputVal2;
479 __m128i intInputVal1, intInputVal2;
481 __m128 vmin_val = _mm_set_ps1(min_val);
482 __m128 vmax_val = _mm_set_ps1(max_val);
484 for (; number < eighthPoints; number++) {
/*
 * Two aligned 4-float loads (address must be 16-byte aligned).
 * NOTE(review): no pointer advance is visible between or after the loads
 * (listing lines 486 and 488-490 are missing) - presumably inputVectorPtr is
 * advanced by 4 after each load; confirm.
 */
485 inputVal1 = _mm_load_ps(inputVectorPtr);
487 inputVal2 = _mm_load_ps(inputVectorPtr);
/* Scale, cap at vmax_val, then raise to at least vmin_val. */
491 ret1 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal1, vScalar), vmax_val), vmin_val);
492 ret2 = _mm_max_ps(_mm_min_ps(_mm_mul_ps(inputVal2, vScalar), vmax_val), vmin_val);
/* Convert the clamped floats to packed 32-bit integers. */
494 intInputVal1 = _mm_cvtps_epi32(ret1);
495 intInputVal2 = _mm_cvtps_epi32(ret2);
/* Narrow 2 x 4 int32 to 8 x int16 with signed saturation. */
497 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
/* Aligned 128-bit store of 8 int16_t results (16-byte aligned address). */
499 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
500 outputVectorPtr += 8;
/* Scalar tail: handle the remaining num_points % 8 samples one at a time. */
503 number = eighthPoints * 8;
504 for (; number < num_points; number++) {
505 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
508 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
510 outputVector[number] = (int16_t)
rintf(r);
517#include <xmmintrin.h>
/*
 * Aligned SSE kernel fragment: scales and clamps 4 floats at a time with
 * SSE, then converts each lane to int16_t in scalar code via rintf. The
 * vector load is aligned, so the input address must be 16-byte aligned.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_a_sse at line 519, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declarations of
 * ret, r and outputFloatBuffer are also missing from the listing.
 */
520 const float* inputVector,
522 unsigned int num_points)
524 unsigned int number = 0;
/* Number of full 4-sample groups handled by the vector loop. */
526 const unsigned int quarterPoints = num_points / 4;
528 const float* inputVectorPtr = (
const float*)inputVector;
529 int16_t* outputVectorPtr = outputVector;
/* int16_t limits held as floats so they can be compared with the products. */
531 float min_val = SHRT_MIN;
532 float max_val = SHRT_MAX;
/* Broadcast the scale factor and both limits to all four float lanes. */
535 __m128 vScalar = _mm_set_ps1(scalar);
537 __m128 vmin_val = _mm_set_ps1(min_val);
538 __m128 vmax_val = _mm_set_ps1(max_val);
542 for (; number < quarterPoints; number++) {
/*
 * Aligned 4-float load (address must be 16-byte aligned).
 * NOTE(review): the input pointer advance is not visible (listing lines
 * 544-546 are missing) - presumably inputVectorPtr += 4; confirm.
 */
543 ret = _mm_load_ps(inputVectorPtr);
/* Scale, cap at vmax_val, then raise to at least vmin_val. */
547 ret = _mm_max_ps(_mm_min_ps(_mm_mul_ps(ret, vScalar), vmax_val), vmin_val);
/*
 * _mm_store_ps is an aligned store, so outputFloatBuffer must be 16-byte
 * aligned; its declaration is not visible here - confirm it carries an
 * alignment attribute.
 */
549 _mm_store_ps(outputFloatBuffer, ret);
/* Convert the four clamped lanes one by one and append them to the output. */
550 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[0]);
551 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[1]);
552 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[2]);
553 *outputVectorPtr++ = (int16_t)
rintf(outputFloatBuffer[3]);
/* Scalar tail: handle the remaining num_points % 4 samples one at a time. */
556 number = quarterPoints * 4;
557 for (; number < num_points; number++) {
558 r = inputVector[number] * scalar;
/*
 * NOTE(review): only the second limit test is visible; the preceding `if`
 * and both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
561 else if (r < min_val)
/* rintf converts r to an integral value, which is then narrowed to int16_t. */
563 outputVector[number] = (int16_t)
rintf(r);
569#ifdef LV_HAVE_GENERIC
/*
 * Portable scalar kernel fragment: for each of num_points samples, multiplies
 * the input float by `scalar`, applies a limit test, and stores
 * (int16_t)rintf(r) to the output.
 *
 * NOTE(review): the line carrying the function name is not visible. The
 * cross-reference index at the end of this listing places
 * volk_32f_s32f_convert_16i_a_generic at line 571, i.e. directly before this
 * fragment - confirm. The `scalar` parameter, braces and the declaration of
 * r are also missing from the listing.
 */
572 const float* inputVector,
574 unsigned int num_points)
/* Walking pointers over the output and input arrays. */
576 int16_t* outputVectorPtr = outputVector;
577 const float* inputVectorPtr = inputVector;
578 unsigned int number = 0;
/* int16_t limits held as floats so they can be compared with the products. */
579 float min_val = SHRT_MIN;
580 float max_val = SHRT_MAX;
583 for (number = 0; number < num_points; number++) {
/* Read the next input sample, advance the input pointer, and scale. */
584 r = *inputVectorPtr++ * scalar;
/*
 * NOTE(review): only the second limit test is visible, and here it checks
 * the upper bound; the preceding `if` (presumably the lower-bound test) and
 * both assignments are missing - presumably r is clamped to
 * [min_val, max_val]; confirm.
 */
587 else if (r > max_val)
/* Convert to an integral value, narrow to int16_t, advance the output. */
589 *outputVectorPtr++ = (int16_t)
rintf(r);
static float rintf(float x)
Definition: config.h:37
static void volk_32f_s32f_convert_16i_a_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:461
static void volk_32f_s32f_convert_16i_u_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:255
static void volk_32f_s32f_convert_16i_a_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:404
static void volk_32f_s32f_convert_16i_u_sse2(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:197
static void volk_32f_s32f_convert_16i_a_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:571
static void volk_32f_s32f_convert_16i_u_avx(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:139
static void volk_32f_s32f_convert_16i_generic(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:307
static void volk_32f_s32f_convert_16i_a_sse(int16_t *outputVector, const float *inputVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_convert_16i.h:519
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56