Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_magnitude_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_16ic_magnitude_16i_a_H
55#define INCLUDED_volk_16ic_magnitude_16i_a_H
56
57#include <inttypes.h>
58#include <limits.h>
59#include <math.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63#ifdef LV_HAVE_AVX2
64#include <immintrin.h>
65
66static inline void volk_16ic_magnitude_16i_a_avx2(int16_t* magnitudeVector,
67 const lv_16sc_t* complexVector,
68 unsigned int num_points)
69{
70 unsigned int number = 0;
71 const unsigned int eighthPoints = num_points / 8;
72
73 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
74 int16_t* magnitudeVectorPtr = magnitudeVector;
75
76 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
77 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
78 __m256i int1, int2;
79 __m128i short1, short2;
80 __m256 cplxValue1, cplxValue2, result;
81 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
82
83 for (; number < eighthPoints; number++) {
84
85 int1 = _mm256_load_si256((__m256i*)complexVectorPtr);
86 complexVectorPtr += 16;
87 short1 = _mm256_extracti128_si256(int1, 0);
88 short2 = _mm256_extracti128_si256(int1, 1);
89
90 int1 = _mm256_cvtepi16_epi32(short1);
91 int2 = _mm256_cvtepi16_epi32(short2);
92 cplxValue1 = _mm256_cvtepi32_ps(int1);
93 cplxValue2 = _mm256_cvtepi32_ps(int2);
94
95 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
96 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
97
98 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
99 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
100
101 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
102
103 result = _mm256_sqrt_ps(result); // Square root the values
104
105 result = _mm256_mul_ps(result, vScalar); // Scale the results
106
107 int1 = _mm256_cvtps_epi32(result);
108 int1 = _mm256_packs_epi32(int1, int1);
109 int1 = _mm256_permutevar8x32_epi32(
110 int1, idx); // permute to compensate for shuffling in hadd and packs
111 short1 = _mm256_extracti128_si256(int1, 0);
112 _mm_store_si128((__m128i*)magnitudeVectorPtr, short1);
113 magnitudeVectorPtr += 8;
114 }
115
116 number = eighthPoints * 8;
117 magnitudeVectorPtr = &magnitudeVector[number];
118 complexVectorPtr = (const int16_t*)&complexVector[number];
119 for (; number < num_points; number++) {
120 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
121 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
122 const float val1Result =
123 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
124 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
125 }
126}
127#endif /* LV_HAVE_AVX2 */
128
129#ifdef LV_HAVE_SSE3
130#include <pmmintrin.h>
131
132static inline void volk_16ic_magnitude_16i_a_sse3(int16_t* magnitudeVector,
133 const lv_16sc_t* complexVector,
134 unsigned int num_points)
135{
136 unsigned int number = 0;
137 const unsigned int quarterPoints = num_points / 4;
138
139 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
140 int16_t* magnitudeVectorPtr = magnitudeVector;
141
142 __m128 vScalar = _mm_set_ps1(SHRT_MAX);
143 __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
144
145 __m128 cplxValue1, cplxValue2, result;
146
147 __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[8];
148 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
149
150 for (; number < quarterPoints; number++) {
151
152 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
153 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
154 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
155 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
156
157 inputFloatBuffer[4] = (float)(complexVectorPtr[4]);
158 inputFloatBuffer[5] = (float)(complexVectorPtr[5]);
159 inputFloatBuffer[6] = (float)(complexVectorPtr[6]);
160 inputFloatBuffer[7] = (float)(complexVectorPtr[7]);
161
162 cplxValue1 = _mm_load_ps(&inputFloatBuffer[0]);
163 cplxValue2 = _mm_load_ps(&inputFloatBuffer[4]);
164
165 complexVectorPtr += 8;
166
167 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
168 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
169
170 cplxValue1 = _mm_mul_ps(cplxValue1, cplxValue1); // Square the values
171 cplxValue2 = _mm_mul_ps(cplxValue2, cplxValue2); // Square the Values
172
173 result = _mm_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
174
175 result = _mm_sqrt_ps(result); // Square root the values
176
177 result = _mm_mul_ps(result, vScalar); // Scale the results
178
179 _mm_store_ps(outputFloatBuffer, result);
180 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
181 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
182 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
183 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
184 }
185
186 number = quarterPoints * 4;
187 magnitudeVectorPtr = &magnitudeVector[number];
188 complexVectorPtr = (const int16_t*)&complexVector[number];
189 for (; number < num_points; number++) {
190 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
191 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
192 const float val1Result =
193 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
194 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
195 }
196}
197#endif /* LV_HAVE_SSE3 */
198
199#ifdef LV_HAVE_SSE
200#include <xmmintrin.h>
201
202static inline void volk_16ic_magnitude_16i_a_sse(int16_t* magnitudeVector,
203 const lv_16sc_t* complexVector,
204 unsigned int num_points)
205{
206 unsigned int number = 0;
207 const unsigned int quarterPoints = num_points / 4;
208
209 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
210 int16_t* magnitudeVectorPtr = magnitudeVector;
211
212 __m128 vScalar = _mm_set_ps1(SHRT_MAX);
213 __m128 invScalar = _mm_set_ps1(1.0f / SHRT_MAX);
214
215 __m128 cplxValue1, cplxValue2, iValue, qValue, result;
216
217 __VOLK_ATTR_ALIGNED(16) float inputFloatBuffer[4];
218 __VOLK_ATTR_ALIGNED(16) float outputFloatBuffer[4];
219
220 for (; number < quarterPoints; number++) {
221
222 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
223 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
224 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
225 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
226
227 cplxValue1 = _mm_load_ps(inputFloatBuffer);
228 complexVectorPtr += 4;
229
230 inputFloatBuffer[0] = (float)(complexVectorPtr[0]);
231 inputFloatBuffer[1] = (float)(complexVectorPtr[1]);
232 inputFloatBuffer[2] = (float)(complexVectorPtr[2]);
233 inputFloatBuffer[3] = (float)(complexVectorPtr[3]);
234
235 cplxValue2 = _mm_load_ps(inputFloatBuffer);
236 complexVectorPtr += 4;
237
238 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
239 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
240
241 // Arrange in i1i2i3i4 format
242 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
243 // Arrange in q1q2q3q4 format
244 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
245
246 iValue = _mm_mul_ps(iValue, iValue); // Square the I values
247 qValue = _mm_mul_ps(qValue, qValue); // Square the Q Values
248
249 result = _mm_add_ps(iValue, qValue); // Add the I2 and Q2 values
250
251 result = _mm_sqrt_ps(result); // Square root the values
252
253 result = _mm_mul_ps(result, vScalar); // Scale the results
254
255 _mm_store_ps(outputFloatBuffer, result);
256 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[0]);
257 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[1]);
258 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[2]);
259 *magnitudeVectorPtr++ = (int16_t)rintf(outputFloatBuffer[3]);
260 }
261
262 number = quarterPoints * 4;
263 magnitudeVectorPtr = &magnitudeVector[number];
264 complexVectorPtr = (const int16_t*)&complexVector[number];
265 for (; number < num_points; number++) {
266 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
267 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
268 const float val1Result =
269 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
270 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
271 }
272}
273#endif /* LV_HAVE_SSE */
274
275#ifdef LV_HAVE_GENERIC
276
277static inline void volk_16ic_magnitude_16i_generic(int16_t* magnitudeVector,
278 const lv_16sc_t* complexVector,
279 unsigned int num_points)
280{
281 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
282 int16_t* magnitudeVectorPtr = magnitudeVector;
283 unsigned int number = 0;
284 const float scalar = SHRT_MAX;
285 for (number = 0; number < num_points; number++) {
286 float real = ((float)(*complexVectorPtr++)) / scalar;
287 float imag = ((float)(*complexVectorPtr++)) / scalar;
288 *magnitudeVectorPtr++ =
289 (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
290 }
291}
292#endif /* LV_HAVE_GENERIC */
293
294#ifdef LV_HAVE_ORC_DISABLED
295extern void volk_16ic_magnitude_16i_a_orc_impl(int16_t* magnitudeVector,
296 const lv_16sc_t* complexVector,
297 float scalar,
298 unsigned int num_points);
299
300static inline void volk_16ic_magnitude_16i_u_orc(int16_t* magnitudeVector,
301 const lv_16sc_t* complexVector,
302 unsigned int num_points)
303{
304 volk_16ic_magnitude_16i_a_orc_impl(
305 magnitudeVector, complexVector, SHRT_MAX, num_points);
306}
307#endif /* LV_HAVE_ORC */
308
309
310#endif /* INCLUDED_volk_16ic_magnitude_16i_a_H */
311
312
313#ifndef INCLUDED_volk_16ic_magnitude_16i_u_H
314#define INCLUDED_volk_16ic_magnitude_16i_u_H
315
316#include <inttypes.h>
317#include <math.h>
318#include <stdio.h>
319#include <volk/volk_common.h>
320
321#ifdef LV_HAVE_AVX2
322#include <immintrin.h>
323
324static inline void volk_16ic_magnitude_16i_u_avx2(int16_t* magnitudeVector,
325 const lv_16sc_t* complexVector,
326 unsigned int num_points)
327{
328 unsigned int number = 0;
329 const unsigned int eighthPoints = num_points / 8;
330
331 const int16_t* complexVectorPtr = (const int16_t*)complexVector;
332 int16_t* magnitudeVectorPtr = magnitudeVector;
333
334 __m256 vScalar = _mm256_set1_ps(SHRT_MAX);
335 __m256 invScalar = _mm256_set1_ps(1.0f / SHRT_MAX);
336 __m256i int1, int2;
337 __m128i short1, short2;
338 __m256 cplxValue1, cplxValue2, result;
339 __m256i idx = _mm256_set_epi32(0, 0, 0, 0, 5, 1, 4, 0);
340
341 for (; number < eighthPoints; number++) {
342
343 int1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
344 complexVectorPtr += 16;
345 short1 = _mm256_extracti128_si256(int1, 0);
346 short2 = _mm256_extracti128_si256(int1, 1);
347
348 int1 = _mm256_cvtepi16_epi32(short1);
349 int2 = _mm256_cvtepi16_epi32(short2);
350 cplxValue1 = _mm256_cvtepi32_ps(int1);
351 cplxValue2 = _mm256_cvtepi32_ps(int2);
352
353 cplxValue1 = _mm256_mul_ps(cplxValue1, invScalar);
354 cplxValue2 = _mm256_mul_ps(cplxValue2, invScalar);
355
356 cplxValue1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
357 cplxValue2 = _mm256_mul_ps(cplxValue2, cplxValue2); // Square the Values
358
359 result = _mm256_hadd_ps(cplxValue1, cplxValue2); // Add the I2 and Q2 values
360
361 result = _mm256_sqrt_ps(result); // Square root the values
362
363 result = _mm256_mul_ps(result, vScalar); // Scale the results
364
365 int1 = _mm256_cvtps_epi32(result);
366 int1 = _mm256_packs_epi32(int1, int1);
367 int1 = _mm256_permutevar8x32_epi32(
368 int1, idx); // permute to compensate for shuffling in hadd and packs
369 short1 = _mm256_extracti128_si256(int1, 0);
370 _mm_storeu_si128((__m128i*)magnitudeVectorPtr, short1);
371 magnitudeVectorPtr += 8;
372 }
373
374 number = eighthPoints * 8;
375 magnitudeVectorPtr = &magnitudeVector[number];
376 complexVectorPtr = (const int16_t*)&complexVector[number];
377 for (; number < num_points; number++) {
378 const float val1Real = (float)(*complexVectorPtr++) / SHRT_MAX;
379 const float val1Imag = (float)(*complexVectorPtr++) / SHRT_MAX;
380 const float val1Result =
381 sqrtf((val1Real * val1Real) + (val1Imag * val1Imag)) * SHRT_MAX;
382 *magnitudeVectorPtr++ = (int16_t)rintf(val1Result);
383 }
384}
385#endif /* LV_HAVE_AVX2 */
386
387#ifdef LV_HAVE_NEONV7
388#include <arm_neon.h>
390
391static inline void volk_16ic_magnitude_16i_neonv7(int16_t* magnitudeVector,
392 const lv_16sc_t* complexVector,
393 unsigned int num_points)
394{
395 unsigned int number = 0;
396 unsigned int quarter_points = num_points / 4;
397
398 const float scalar = SHRT_MAX;
399 const float inv_scalar = 1.0f / scalar;
400
401 int16_t* magnitudeVectorPtr = magnitudeVector;
402 const lv_16sc_t* complexVectorPtr = complexVector;
403
404 float32x4_t mag_vec;
405 float32x4x2_t c_vec;
406
407 for (number = 0; number < quarter_points; number++) {
408 const int16x4x2_t c16_vec = vld2_s16((int16_t*)complexVectorPtr);
409 __VOLK_PREFETCH(complexVectorPtr + 4);
410 c_vec.val[0] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[0]));
411 c_vec.val[1] = vcvtq_f32_s32(vmovl_s16(c16_vec.val[1]));
412 // Scale to close to 0-1
413 c_vec.val[0] = vmulq_n_f32(c_vec.val[0], inv_scalar);
414 c_vec.val[1] = vmulq_n_f32(c_vec.val[1], inv_scalar);
415 // vsqrtq_f32 is armv8
416 const float32x4_t mag_vec_squared = _vmagnitudesquaredq_f32(c_vec);
417 mag_vec = vmulq_f32(mag_vec_squared, _vinvsqrtq_f32(mag_vec_squared));
418 // Reconstruct
419 mag_vec = vmulq_n_f32(mag_vec, scalar);
420 // Add 0.5 for correct rounding because vcvtq_s32_f32 truncates.
421 // This works because the magnitude is always positive.
422 mag_vec = vaddq_f32(mag_vec, vdupq_n_f32(0.5));
423 const int16x4_t mag16_vec = vmovn_s32(vcvtq_s32_f32(mag_vec));
424 vst1_s16(magnitudeVectorPtr, mag16_vec);
425 // Advance pointers
426 magnitudeVectorPtr += 4;
427 complexVectorPtr += 4;
428 }
429
430 // Deal with the rest
431 for (number = quarter_points * 4; number < num_points; number++) {
432 const float real = lv_creal(*complexVectorPtr) * inv_scalar;
433 const float imag = lv_cimag(*complexVectorPtr) * inv_scalar;
434 *magnitudeVectorPtr =
435 (int16_t)rintf(sqrtf((real * real) + (imag * imag)) * scalar);
436 complexVectorPtr++;
437 magnitudeVectorPtr++;
438 }
439}
440#endif /* LV_HAVE_NEONV7 */
441
442#endif /* INCLUDED_volk_16ic_magnitude_16i_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_16ic_magnitude_16i_generic(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:277
static void volk_16ic_magnitude_16i_a_sse(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:202
static void volk_16ic_magnitude_16i_a_sse3(int16_t *magnitudeVector, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_magnitude_16i.h:132
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cimag(x)
Definition: volk_complex.h:89
#define lv_creal(x)
Definition: volk_complex.h:87
short complex lv_16sc_t
Definition: volk_complex.h:62
static float32x4_t _vinvsqrtq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:96
static float32x4_t _vmagnitudesquaredq_f32(float32x4x2_t cmplxValue)
Definition: volk_neon_intrinsics.h:86