Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16i_s32f_convert_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_16i_s32f_convert_32f_u_H
55#define INCLUDED_volk_16i_s32f_convert_32f_u_H
56
57#include <inttypes.h>
58#include <stdio.h>
59
60#ifdef LV_HAVE_AVX2
61#include <immintrin.h>
62
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, using AVX2 with unaligned loads and stores.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m256 invScalar = _mm256_set1_ps(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load 8 int16 samples and sign-extend them to 8 int32 lanes
        const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);
        const __m256i widened = _mm256_cvtepi16_epi32(inputVal);

        // Convert to float and scale
        const __m256 ret = _mm256_mul_ps(_mm256_cvtepi32_ps(widened), invScalar);
        _mm256_storeu_ps(outputVectorPtr, ret);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
101#endif /* LV_HAVE_AVX2 */
102
103#ifdef LV_HAVE_AVX
104#include <immintrin.h>
105
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, using 128-bit integer widening and a 256-bit
 *        unaligned store.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);
    const __m256 dummy = _mm256_setzero_ps();

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load the 8 values
        const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        // Shift right by 64 bits (8 bytes) so the upper 4 samples sit in
        // the low half, then sign-extend each half to 32-bit words
        const __m128i lowWords = _mm_cvtepi16_epi32(inputVal);
        const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

        // Convert and scale each half, then pack both into one 256-bit value
        __m128 ret = _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar);
        __m256 output = _mm256_insertf128_ps(dummy, ret, 0);

        ret = _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_storeu_ps(outputVectorPtr, output);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
155#endif /* LV_HAVE_AVX */
156
157#ifdef LV_HAVE_SSE4_1
158#include <smmintrin.h>
159
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, using SSE4.1 with unaligned loads and stores.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load the 8 values
        const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        // Shift right by 64 bits (8 bytes) so the upper 4 samples sit in
        // the low half, then sign-extend each half to 32-bit words
        const __m128i lowWords = _mm_cvtepi16_epi32(inputVal);
        const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

        // First four outputs
        __m128 ret = _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        // Last four outputs
        ret = _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
206#endif /* LV_HAVE_SSE4_1 */
207
208#ifdef LV_HAVE_SSE
209#include <xmmintrin.h>
210
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, four samples at a time, using SSE.
 *
 * Samples are converted to float individually and gathered with
 * _mm_set_ps; only the multiply and the (unaligned) store are vectorised.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_u_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < quarterPoints; number++) {
        // _mm_set_ps takes its arguments from the highest lane down,
        // so inputPtr[0] ends up in lane 0.
        const __m128 samples = _mm_set_ps((float)(inputPtr[3]),
                                          (float)(inputPtr[2]),
                                          (float)(inputPtr[1]),
                                          (float)(inputPtr[0]));

        _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    // Remaining (num_points % 4) samples are handled one at a time
    for (number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
242#endif /* LV_HAVE_SSE */
243
244#ifdef LV_HAVE_GENERIC
245
/*!
 * \brief Portable reference implementation: converts each 16-bit signed
 *        integer to float and divides it by \p scalar.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_generic(float* outputVector,
                                                     const int16_t* inputVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
259#endif /* LV_HAVE_GENERIC */
260
261#ifdef LV_HAVE_NEON
262#include <arm_neon.h>
263
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, eight samples at a time, using NEON.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_neon(float* outputVector,
                                                  const int16_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float* outputPtr = outputVector;
    const int16_t* inputPtr = inputVector;
    unsigned int number = 0;
    unsigned int eighth_points = num_points / 8;

    int16x4x2_t input16;
    int32x4_t input32_0, input32_1;
    float32x4_t input_float_0, input_float_1;
    float32x4x2_t output_float;
    float32x4_t inv_scale;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    inv_scale = vdupq_n_f32(1.0 / scalar);

    // the generic disassembles to a 128-bit load
    // and duplicates every instruction to operate on 64-bits
    // at a time. This is only possible with lanes, which is faster
    // than just doing a vld1_s16, but still slower.
    for (number = 0; number < eighth_points; number++) {
        // vld2 de-interleaves the 8 samples into two 4-lane vectors;
        // the vst2q below interleaves them again, so the output keeps
        // the input ordering.
        input16 = vld2_s16(inputPtr);
        // widen 16-bit int to 32-bit int
        input32_0 = vmovl_s16(input16.val[0]);
        input32_1 = vmovl_s16(input16.val[1]);
        // convert 32-bit int to float with scale
        input_float_0 = vcvtq_f32_s32(input32_0);
        input_float_1 = vcvtq_f32_s32(input32_1);
        output_float.val[0] = vmulq_f32(input_float_0, inv_scale);
        output_float.val[1] = vmulq_f32(input_float_1, inv_scale);
        vst2q_f32(outputPtr, output_float);
        inputPtr += 8;
        outputPtr += 8;
    }

    // Remaining (num_points % 8) samples, continuing from where the
    // vector loop left the pointers
    for (number = eighth_points * 8; number < num_points; number++) {
        *outputPtr++ = ((float)(*inputPtr++)) / scalar;
    }
}
305#endif /* LV_HAVE_NEON */
306
307
308#endif /* INCLUDED_volk_16i_s32f_convert_32f_u_H */
309#ifndef INCLUDED_volk_16i_s32f_convert_32f_a_H
310#define INCLUDED_volk_16i_s32f_convert_32f_a_H
311
312#include <inttypes.h>
313#include <stdio.h>
314
315#ifdef LV_HAVE_AVX2
316#include <immintrin.h>
317
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, using AVX2 with aligned loads and stores.
 *
 * The vector loop uses _mm_load_si128 and _mm256_store_ps, so
 * \p inputVector must be 16-byte aligned and \p outputVector 32-byte
 * aligned whenever num_points >= 8.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx2(float* outputVector,
                                                    const int16_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m256 invScalar = _mm256_set1_ps(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load 8 int16 samples and sign-extend them to 8 int32 lanes
        const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);
        const __m256i widened = _mm256_cvtepi16_epi32(inputVal);

        // Convert to float and scale
        const __m256 ret = _mm256_mul_ps(_mm256_cvtepi32_ps(widened), invScalar);
        _mm256_store_ps(outputVectorPtr, ret);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
356#endif /* LV_HAVE_AVX2 */
357
358#ifdef LV_HAVE_AVX
359#include <immintrin.h>
360
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, using 128-bit integer widening and a 256-bit
 *        aligned store.
 *
 * The vector loop uses _mm_load_si128 and _mm256_store_ps, so
 * \p inputVector must be 16-byte aligned and \p outputVector 32-byte
 * aligned whenever num_points >= 8.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_avx(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);
    const __m256 dummy = _mm256_setzero_ps();

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load the 8 values
        const __m128i inputVal = _mm_load_si128((const __m128i*)inputPtr);

        // Shift right by 64 bits (8 bytes) so the upper 4 samples sit in
        // the low half, then sign-extend each half to 32-bit words
        const __m128i lowWords = _mm_cvtepi16_epi32(inputVal);
        const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

        // Convert and scale each half, then pack both into one 256-bit value
        __m128 ret = _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar);
        __m256 output = _mm256_insertf128_ps(dummy, ret, 0);

        ret = _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar);
        output = _mm256_insertf128_ps(output, ret, 1);

        _mm256_store_ps(outputVectorPtr, output);

        inputPtr += 8;
        outputVectorPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
410#endif /* LV_HAVE_AVX */
411
412#ifdef LV_HAVE_SSE4_1
413#include <smmintrin.h>
414
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, eight samples at a time, using SSE4.1.
 *
 * NOTE(review): despite the "_a_" name this kernel uses the unaligned
 * _mm_loadu_si128 / _mm_storeu_ps intrinsics. They are kept as-is so
 * that no currently accepted input starts to fault; switch to
 * _mm_load_si128 / _mm_store_ps only once every caller is confirmed to
 * pass 16-byte aligned buffers.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse4_1(float* outputVector,
                                                      const int16_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < eighthPoints; number++) {
        // Load the 8 values
        const __m128i inputVal = _mm_loadu_si128((const __m128i*)inputPtr);

        // Shift right by 64 bits (8 bytes) so the upper 4 samples sit in
        // the low half, then sign-extend each half to 32-bit words
        const __m128i lowWords = _mm_cvtepi16_epi32(inputVal);
        const __m128i highWords = _mm_cvtepi16_epi32(_mm_srli_si128(inputVal, 8));

        // First four outputs
        __m128 ret = _mm_mul_ps(_mm_cvtepi32_ps(lowWords), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        // Last four outputs
        ret = _mm_mul_ps(_mm_cvtepi32_ps(highWords), invScalar);
        _mm_storeu_ps(outputVectorPtr, ret);
        outputVectorPtr += 4;

        inputPtr += 8;
    }

    // Remaining (num_points % 8) samples are handled one at a time
    for (number = eighthPoints * 8; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
461#endif /* LV_HAVE_SSE4_1 */
462
463#ifdef LV_HAVE_SSE
464#include <xmmintrin.h>
465
/*!
 * \brief Converts 16-bit signed integers to floats and scales each by
 *        1/\p scalar, four samples at a time, using SSE.
 *
 * Samples are converted to float individually and gathered with
 * _mm_set_ps; only the multiply and the store are vectorised.
 *
 * NOTE(review): despite the "_a_" name the store is the unaligned
 * _mm_storeu_ps. It is kept as-is so that no currently accepted input
 * starts to fault; switch to _mm_store_ps only once every caller is
 * confirmed to pass a 16-byte aligned output buffer.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_sse(float* outputVector,
                                                   const int16_t* inputVector,
                                                   const float scalar,
                                                   unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    // The vector body multiplies by the reciprocal; the scalar tail divides.
    const __m128 invScalar = _mm_set_ps1(1.0 / scalar);

    const int16_t* inputPtr = inputVector;
    float* outputVectorPtr = outputVector;
    unsigned int number;

    for (number = 0; number < quarterPoints; number++) {
        // _mm_set_ps takes its arguments from the highest lane down,
        // so inputPtr[0] ends up in lane 0.
        const __m128 samples = _mm_set_ps((float)(inputPtr[3]),
                                          (float)(inputPtr[2]),
                                          (float)(inputPtr[1]),
                                          (float)(inputPtr[0]));

        _mm_storeu_ps(outputVectorPtr, _mm_mul_ps(samples, invScalar));

        inputPtr += 4;
        outputVectorPtr += 4;
    }

    // Remaining (num_points % 4) samples are handled one at a time
    for (number = quarterPoints * 4; number < num_points; number++) {
        outputVector[number] = (float)(inputVector[number]) / scalar;
    }
}
497#endif /* LV_HAVE_SSE */
498
499#ifdef LV_HAVE_GENERIC
500
/*!
 * \brief Portable reference implementation (aligned-name variant):
 *        converts each 16-bit signed integer to float and divides it by
 *        \p scalar. The code itself places no alignment requirement on
 *        the buffers.
 *
 * \param outputVector destination; num_points floats are written
 * \param inputVector  source; num_points int16_t values are read
 * \param scalar       divisor applied to every converted sample
 * \param num_points   number of samples to convert
 */
static inline void volk_16i_s32f_convert_32f_a_generic(float* outputVector,
                                                       const int16_t* inputVector,
                                                       const float scalar,
                                                       unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = ((float)(inputVector[number])) / scalar;
    }
}
514#endif /* LV_HAVE_GENERIC */
515
516#endif /* INCLUDED_volk_16i_s32f_convert_32f_a_H */
static void volk_16i_s32f_convert_32f_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:246
static void volk_16i_s32f_convert_32f_a_generic(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:501
static void volk_16i_s32f_convert_32f_u_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:211
static void volk_16i_s32f_convert_32f_a_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:361
static void volk_16i_s32f_convert_32f_u_avx(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:106
static void volk_16i_s32f_convert_32f_neon(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:264
static void volk_16i_s32f_convert_32f_a_sse(float *outputVector, const int16_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_16i_s32f_convert_32f.h:466