Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_convert_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
46#ifndef INCLUDED_volk_32fc_convert_16ic_a_H
47#define INCLUDED_volk_32fc_convert_16ic_a_H
48
49#include "volk/volk_complex.h"
50#include <limits.h>
51#include <math.h>
52
53#ifdef LV_HAVE_AVX2
54#include <immintrin.h>
55
56static inline void volk_32fc_convert_16ic_a_avx2(lv_16sc_t* outputVector,
57 const lv_32fc_t* inputVector,
58 unsigned int num_points)
59{
60 const unsigned int avx_iters = num_points / 8;
61
62 float* inputVectorPtr = (float*)inputVector;
63 int16_t* outputVectorPtr = (int16_t*)outputVector;
64 float aux;
65
66 const float min_val = (float)SHRT_MIN;
67 const float max_val = (float)SHRT_MAX;
68
69 __m256 inputVal1, inputVal2;
70 __m256i intInputVal1, intInputVal2;
71 __m256 ret1, ret2;
72 const __m256 vmin_val = _mm256_set1_ps(min_val);
73 const __m256 vmax_val = _mm256_set1_ps(max_val);
74 unsigned int i;
75
76 for (i = 0; i < avx_iters; i++) {
77 inputVal1 = _mm256_load_ps((float*)inputVectorPtr);
78 inputVectorPtr += 8;
79 inputVal2 = _mm256_load_ps((float*)inputVectorPtr);
80 inputVectorPtr += 8;
81 __VOLK_PREFETCH(inputVectorPtr + 16);
82
83 // Clip
84 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
85 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
86
87 intInputVal1 = _mm256_cvtps_epi32(ret1);
88 intInputVal2 = _mm256_cvtps_epi32(ret2);
89
90 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
91 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
92
93 _mm256_store_si256((__m256i*)outputVectorPtr, intInputVal1);
94 outputVectorPtr += 16;
95 }
96
97 for (i = avx_iters * 16; i < num_points * 2; i++) {
98 aux = *inputVectorPtr++;
99 if (aux > max_val)
100 aux = max_val;
101 else if (aux < min_val)
102 aux = min_val;
103 *outputVectorPtr++ = (int16_t)rintf(aux);
104 }
105}
106#endif /* LV_HAVE_AVX2 */
107
108#ifdef LV_HAVE_SSE2
109#include <emmintrin.h>
110
111static inline void volk_32fc_convert_16ic_a_sse2(lv_16sc_t* outputVector,
112 const lv_32fc_t* inputVector,
113 unsigned int num_points)
114{
115 const unsigned int sse_iters = num_points / 4;
116
117 float* inputVectorPtr = (float*)inputVector;
118 int16_t* outputVectorPtr = (int16_t*)outputVector;
119 float aux;
120
121 const float min_val = (float)SHRT_MIN;
122 const float max_val = (float)SHRT_MAX;
123
124 __m128 inputVal1, inputVal2;
125 __m128i intInputVal1, intInputVal2;
126 __m128 ret1, ret2;
127 const __m128 vmin_val = _mm_set_ps1(min_val);
128 const __m128 vmax_val = _mm_set_ps1(max_val);
129 unsigned int i;
130
131 for (i = 0; i < sse_iters; i++) {
132 inputVal1 = _mm_load_ps((float*)inputVectorPtr);
133 inputVectorPtr += 4;
134 inputVal2 = _mm_load_ps((float*)inputVectorPtr);
135 inputVectorPtr += 4;
136 __VOLK_PREFETCH(inputVectorPtr + 8);
137
138 // Clip
139 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
140 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
141
142 intInputVal1 = _mm_cvtps_epi32(ret1);
143 intInputVal2 = _mm_cvtps_epi32(ret2);
144
145 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
146
147 _mm_store_si128((__m128i*)outputVectorPtr, intInputVal1);
148 outputVectorPtr += 8;
149 }
150
151 for (i = sse_iters * 8; i < num_points * 2; i++) {
152 aux = *inputVectorPtr++;
153 if (aux > max_val)
154 aux = max_val;
155 else if (aux < min_val)
156 aux = min_val;
157 *outputVectorPtr++ = (int16_t)rintf(aux);
158 }
159}
160#endif /* LV_HAVE_SSE2 */
161
162
163#if LV_HAVE_NEONV7
164#include <arm_neon.h>
165
/*
 * Convert the four float lanes of `val` into the four 32-bit integer lanes of
 * `res`, issuing one scalar VCVTR.S32.F32 instruction per lane. The lanes are
 * reached by subscripting the vector operands.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and stays correct inside an unbraced if/else; the arguments are
 * parenthesized so that arbitrary expressions can be passed. Invoke it with a
 * trailing semicolon: VCVTRQ_S32_F32(dst, src);
 */
#define VCVTRQ_S32_F32(res, val)                    \
    do {                                            \
        __VOLK_ASM("VCVTR.S32.F32 %[r0], %[v0]\n\t" \
                   : [r0] "=w"((res)[0])            \
                   : [v0] "w"((val)[0])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r1], %[v1]\n\t" \
                   : [r1] "=w"((res)[1])            \
                   : [v1] "w"((val)[1])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r2], %[v2]\n\t" \
                   : [r2] "=w"((res)[2])            \
                   : [v2] "w"((val)[2])             \
                   :);                              \
        __VOLK_ASM("VCVTR.S32.F32 %[r3], %[v3]\n\t" \
                   : [r3] "=w"((res)[3])            \
                   : [v3] "w"((val)[3])             \
                   :);                              \
    } while (0)
180
181static inline void volk_32fc_convert_16ic_neon(lv_16sc_t* outputVector,
182 const lv_32fc_t* inputVector,
183 unsigned int num_points)
184{
185
186 const unsigned int neon_iters = num_points / 4;
187
188 float32_t* inputVectorPtr = (float32_t*)inputVector;
189 int16_t* outputVectorPtr = (int16_t*)outputVector;
190
191 const float min_val_f = (float)SHRT_MIN;
192 const float max_val_f = (float)SHRT_MAX;
193 float32_t aux;
194 unsigned int i;
195
196 const float32x4_t min_val = vmovq_n_f32(min_val_f);
197 const float32x4_t max_val = vmovq_n_f32(max_val_f);
198 float32x4_t ret1, ret2, a, b;
199
200 int32x4_t toint_a = { 0, 0, 0, 0 };
201 int32x4_t toint_b = { 0, 0, 0, 0 };
202 int16x4_t intInputVal1, intInputVal2;
203 int16x8_t res;
204
205 for (i = 0; i < neon_iters; i++) {
206 a = vld1q_f32((const float32_t*)(inputVectorPtr));
207 inputVectorPtr += 4;
208 b = vld1q_f32((const float32_t*)(inputVectorPtr));
209 inputVectorPtr += 4;
210 __VOLK_PREFETCH(inputVectorPtr + 8);
211
212 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
213 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
214
215 // vcvtr takes into account the current rounding mode (as does rintf)
216 VCVTRQ_S32_F32(toint_a, ret1);
217 VCVTRQ_S32_F32(toint_b, ret2);
218
219 intInputVal1 = vqmovn_s32(toint_a);
220 intInputVal2 = vqmovn_s32(toint_b);
221
222 res = vcombine_s16(intInputVal1, intInputVal2);
223 vst1q_s16((int16_t*)outputVectorPtr, res);
224 outputVectorPtr += 8;
225 }
226
227 for (i = neon_iters * 8; i < num_points * 2; i++) {
228 aux = *inputVectorPtr++;
229 if (aux > max_val_f)
230 aux = max_val_f;
231 else if (aux < min_val_f)
232 aux = min_val_f;
233 *outputVectorPtr++ = (int16_t)rintf(aux);
234 }
235}
236
237#undef VCVTRQ_S32_F32
238#endif /* LV_HAVE_NEONV7 */
239
240#if LV_HAVE_NEONV8
241#include <arm_neon.h>
242
243static inline void volk_32fc_convert_16ic_neonv8(lv_16sc_t* outputVector,
244 const lv_32fc_t* inputVector,
245 unsigned int num_points)
246{
247 const unsigned int neon_iters = num_points / 4;
248
249 float32_t* inputVectorPtr = (float32_t*)inputVector;
250 int16_t* outputVectorPtr = (int16_t*)outputVector;
251
252 const float min_val_f = (float)SHRT_MIN;
253 const float max_val_f = (float)SHRT_MAX;
254 float32_t aux;
255 unsigned int i;
256
257 const float32x4_t min_val = vmovq_n_f32(min_val_f);
258 const float32x4_t max_val = vmovq_n_f32(max_val_f);
259 float32x4_t ret1, ret2, a, b;
260
261 int32x4_t toint_a = { 0, 0, 0, 0 }, toint_b = { 0, 0, 0, 0 };
262 int16x4_t intInputVal1, intInputVal2;
263 int16x8_t res;
264
265 for (i = 0; i < neon_iters; i++) {
266 a = vld1q_f32((const float32_t*)(inputVectorPtr));
267 inputVectorPtr += 4;
268 b = vld1q_f32((const float32_t*)(inputVectorPtr));
269 inputVectorPtr += 4;
270 __VOLK_PREFETCH(inputVectorPtr + 8);
271
272 ret1 = vmaxq_f32(vminq_f32(a, max_val), min_val);
273 ret2 = vmaxq_f32(vminq_f32(b, max_val), min_val);
274
275 // vrndiq takes into account the current rounding mode (as does rintf)
276 toint_a = vcvtq_s32_f32(vrndiq_f32(ret1));
277 toint_b = vcvtq_s32_f32(vrndiq_f32(ret2));
278
279 intInputVal1 = vqmovn_s32(toint_a);
280 intInputVal2 = vqmovn_s32(toint_b);
281
282 res = vcombine_s16(intInputVal1, intInputVal2);
283 vst1q_s16((int16_t*)outputVectorPtr, res);
284 outputVectorPtr += 8;
285 }
286
287 for (i = neon_iters * 8; i < num_points * 2; i++) {
288 aux = *inputVectorPtr++;
289 if (aux > max_val_f)
290 aux = max_val_f;
291 else if (aux < min_val_f)
292 aux = min_val_f;
293 *outputVectorPtr++ = (int16_t)rintf(aux);
294 }
295}
296#endif /* LV_HAVE_NEONV8 */
297
298
299#ifdef LV_HAVE_GENERIC
300
301static inline void volk_32fc_convert_16ic_generic(lv_16sc_t* outputVector,
302 const lv_32fc_t* inputVector,
303 unsigned int num_points)
304{
305 float* inputVectorPtr = (float*)inputVector;
306 int16_t* outputVectorPtr = (int16_t*)outputVector;
307 const float min_val = (float)SHRT_MIN;
308 const float max_val = (float)SHRT_MAX;
309 float aux;
310 unsigned int i;
311 for (i = 0; i < num_points * 2; i++) {
312 aux = *inputVectorPtr++;
313 if (aux > max_val)
314 aux = max_val;
315 else if (aux < min_val)
316 aux = min_val;
317 *outputVectorPtr++ = (int16_t)rintf(aux);
318 }
319}
320#endif /* LV_HAVE_GENERIC */
321
322#endif /* INCLUDED_volk_32fc_convert_16ic_a_H */
323
324#ifndef INCLUDED_volk_32fc_convert_16ic_u_H
325#define INCLUDED_volk_32fc_convert_16ic_u_H
326
327#include "volk/volk_complex.h"
328#include <limits.h>
329#include <math.h>
330
331
332#ifdef LV_HAVE_AVX2
333#include <immintrin.h>
334
335static inline void volk_32fc_convert_16ic_u_avx2(lv_16sc_t* outputVector,
336 const lv_32fc_t* inputVector,
337 unsigned int num_points)
338{
339 const unsigned int avx_iters = num_points / 8;
340
341 float* inputVectorPtr = (float*)inputVector;
342 int16_t* outputVectorPtr = (int16_t*)outputVector;
343 float aux;
344
345 const float min_val = (float)SHRT_MIN;
346 const float max_val = (float)SHRT_MAX;
347
348 __m256 inputVal1, inputVal2;
349 __m256i intInputVal1, intInputVal2;
350 __m256 ret1, ret2;
351 const __m256 vmin_val = _mm256_set1_ps(min_val);
352 const __m256 vmax_val = _mm256_set1_ps(max_val);
353 unsigned int i;
354
355 for (i = 0; i < avx_iters; i++) {
356 inputVal1 = _mm256_loadu_ps((float*)inputVectorPtr);
357 inputVectorPtr += 8;
358 inputVal2 = _mm256_loadu_ps((float*)inputVectorPtr);
359 inputVectorPtr += 8;
360 __VOLK_PREFETCH(inputVectorPtr + 16);
361
362 // Clip
363 ret1 = _mm256_max_ps(_mm256_min_ps(inputVal1, vmax_val), vmin_val);
364 ret2 = _mm256_max_ps(_mm256_min_ps(inputVal2, vmax_val), vmin_val);
365
366 intInputVal1 = _mm256_cvtps_epi32(ret1);
367 intInputVal2 = _mm256_cvtps_epi32(ret2);
368
369 intInputVal1 = _mm256_packs_epi32(intInputVal1, intInputVal2);
370 intInputVal1 = _mm256_permute4x64_epi64(intInputVal1, 0xd8);
371
372 _mm256_storeu_si256((__m256i*)outputVectorPtr, intInputVal1);
373 outputVectorPtr += 16;
374 }
375
376 for (i = avx_iters * 16; i < num_points * 2; i++) {
377 aux = *inputVectorPtr++;
378 if (aux > max_val)
379 aux = max_val;
380 else if (aux < min_val)
381 aux = min_val;
382 *outputVectorPtr++ = (int16_t)rintf(aux);
383 }
384}
385#endif /* LV_HAVE_AVX2 */
386
387
388#ifdef LV_HAVE_SSE2
389#include <emmintrin.h>
390
391static inline void volk_32fc_convert_16ic_u_sse2(lv_16sc_t* outputVector,
392 const lv_32fc_t* inputVector,
393 unsigned int num_points)
394{
395 const unsigned int sse_iters = num_points / 4;
396
397 float* inputVectorPtr = (float*)inputVector;
398 int16_t* outputVectorPtr = (int16_t*)outputVector;
399 float aux;
400
401 const float min_val = (float)SHRT_MIN;
402 const float max_val = (float)SHRT_MAX;
403
404 __m128 inputVal1, inputVal2;
405 __m128i intInputVal1, intInputVal2;
406 __m128 ret1, ret2;
407 const __m128 vmin_val = _mm_set_ps1(min_val);
408 const __m128 vmax_val = _mm_set_ps1(max_val);
409
410 unsigned int i;
411 for (i = 0; i < sse_iters; i++) {
412 inputVal1 = _mm_loadu_ps((float*)inputVectorPtr);
413 inputVectorPtr += 4;
414 inputVal2 = _mm_loadu_ps((float*)inputVectorPtr);
415 inputVectorPtr += 4;
416 __VOLK_PREFETCH(inputVectorPtr + 8);
417
418 // Clip
419 ret1 = _mm_max_ps(_mm_min_ps(inputVal1, vmax_val), vmin_val);
420 ret2 = _mm_max_ps(_mm_min_ps(inputVal2, vmax_val), vmin_val);
421
422 intInputVal1 = _mm_cvtps_epi32(ret1);
423 intInputVal2 = _mm_cvtps_epi32(ret2);
424
425 intInputVal1 = _mm_packs_epi32(intInputVal1, intInputVal2);
426
427 _mm_storeu_si128((__m128i*)outputVectorPtr, intInputVal1);
428 outputVectorPtr += 8;
429 }
430
431 for (i = sse_iters * 8; i < num_points * 2; i++) {
432 aux = *inputVectorPtr++;
433 if (aux > max_val)
434 aux = max_val;
435 else if (aux < min_val)
436 aux = min_val;
437 *outputVectorPtr++ = (int16_t)rintf(aux);
438 }
439}
440#endif /* LV_HAVE_SSE2 */
441#endif /* INCLUDED_volk_32fc_convert_16ic_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32fc_convert_16ic_a_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:111
static void volk_32fc_convert_16ic_u_sse2(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:391
static void volk_32fc_convert_16ic_generic(lv_16sc_t *outputVector, const lv_32fc_t *inputVector, unsigned int num_points)
Definition: volk_32fc_convert_16ic.h:301
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
short complex lv_16sc_t
Definition: volk_complex.h:62
for i
Definition: volk_config_fixed.tmpl.h:25