Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_8ic_s32f_deinterleave_32f_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
56#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
57#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H
58
59#include <inttypes.h>
60#include <stdio.h>
61#include <volk/volk_common.h>
62
63
64#ifdef LV_HAVE_SSE4_1
65#include <smmintrin.h>
66
67static inline void
68volk_8ic_s32f_deinterleave_32f_x2_a_sse4_1(float* iBuffer,
69 float* qBuffer,
70 const lv_8sc_t* complexVector,
71 const float scalar,
72 unsigned int num_points)
73{
74 float* iBufferPtr = iBuffer;
75 float* qBufferPtr = qBuffer;
76
77 unsigned int number = 0;
78 const unsigned int eighthPoints = num_points / 8;
79 __m128 iFloatValue, qFloatValue;
80
81 const float iScalar = 1.0 / scalar;
82 __m128 invScalar = _mm_set_ps1(iScalar);
83 __m128i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
84 int8_t* complexVectorPtr = (int8_t*)complexVector;
85
86 __m128i iMoveMask = _mm_set_epi8(
87 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
88 __m128i qMoveMask = _mm_set_epi8(
89 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
90
91 for (; number < eighthPoints; number++) {
92 complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
93 complexVectorPtr += 16;
94 iComplexVal = _mm_shuffle_epi8(complexVal, iMoveMask);
95 qComplexVal = _mm_shuffle_epi8(complexVal, qMoveMask);
96
97 iIntVal = _mm_cvtepi8_epi32(iComplexVal);
98 iFloatValue = _mm_cvtepi32_ps(iIntVal);
99 iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
100 _mm_store_ps(iBufferPtr, iFloatValue);
101 iBufferPtr += 4;
102
103 iComplexVal = _mm_srli_si128(iComplexVal, 4);
104
105 iIntVal = _mm_cvtepi8_epi32(iComplexVal);
106 iFloatValue = _mm_cvtepi32_ps(iIntVal);
107 iFloatValue = _mm_mul_ps(iFloatValue, invScalar);
108 _mm_store_ps(iBufferPtr, iFloatValue);
109 iBufferPtr += 4;
110
111 qIntVal = _mm_cvtepi8_epi32(qComplexVal);
112 qFloatValue = _mm_cvtepi32_ps(qIntVal);
113 qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
114 _mm_store_ps(qBufferPtr, qFloatValue);
115 qBufferPtr += 4;
116
117 qComplexVal = _mm_srli_si128(qComplexVal, 4);
118
119 qIntVal = _mm_cvtepi8_epi32(qComplexVal);
120 qFloatValue = _mm_cvtepi32_ps(qIntVal);
121 qFloatValue = _mm_mul_ps(qFloatValue, invScalar);
122 _mm_store_ps(qBufferPtr, qFloatValue);
123
124 qBufferPtr += 4;
125 }
126
127 number = eighthPoints * 8;
128 for (; number < num_points; number++) {
129 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
130 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
131 }
132}
133#endif /* LV_HAVE_SSE4_1 */
134
135
136#ifdef LV_HAVE_SSE
137#include <xmmintrin.h>
138
139static inline void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float* iBuffer,
140 float* qBuffer,
141 const lv_8sc_t* complexVector,
142 const float scalar,
143 unsigned int num_points)
144{
145 float* iBufferPtr = iBuffer;
146 float* qBufferPtr = qBuffer;
147
148 unsigned int number = 0;
149 const unsigned int quarterPoints = num_points / 4;
150 __m128 cplxValue1, cplxValue2, iValue, qValue;
151
152 __m128 invScalar = _mm_set_ps1(1.0 / scalar);
153 int8_t* complexVectorPtr = (int8_t*)complexVector;
154
155 __VOLK_ATTR_ALIGNED(16) float floatBuffer[8];
156
157 for (; number < quarterPoints; number++) {
158 floatBuffer[0] = (float)(complexVectorPtr[0]);
159 floatBuffer[1] = (float)(complexVectorPtr[1]);
160 floatBuffer[2] = (float)(complexVectorPtr[2]);
161 floatBuffer[3] = (float)(complexVectorPtr[3]);
162
163 floatBuffer[4] = (float)(complexVectorPtr[4]);
164 floatBuffer[5] = (float)(complexVectorPtr[5]);
165 floatBuffer[6] = (float)(complexVectorPtr[6]);
166 floatBuffer[7] = (float)(complexVectorPtr[7]);
167
168 cplxValue1 = _mm_load_ps(&floatBuffer[0]);
169 cplxValue2 = _mm_load_ps(&floatBuffer[4]);
170
171 complexVectorPtr += 8;
172
173 cplxValue1 = _mm_mul_ps(cplxValue1, invScalar);
174 cplxValue2 = _mm_mul_ps(cplxValue2, invScalar);
175
176 // Arrange in i1i2i3i4 format
177 iValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(2, 0, 2, 0));
178 qValue = _mm_shuffle_ps(cplxValue1, cplxValue2, _MM_SHUFFLE(3, 1, 3, 1));
179
180 _mm_store_ps(iBufferPtr, iValue);
181 _mm_store_ps(qBufferPtr, qValue);
182
183 iBufferPtr += 4;
184 qBufferPtr += 4;
185 }
186
187 number = quarterPoints * 4;
188 complexVectorPtr = (int8_t*)&complexVector[number];
189 for (; number < num_points; number++) {
190 *iBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
191 *qBufferPtr++ = (float)(*complexVectorPtr++) / scalar;
192 }
193}
194#endif /* LV_HAVE_SSE */
195
196
197#ifdef LV_HAVE_AVX2
198#include <immintrin.h>
199
200static inline void volk_8ic_s32f_deinterleave_32f_x2_a_avx2(float* iBuffer,
201 float* qBuffer,
202 const lv_8sc_t* complexVector,
203 const float scalar,
204 unsigned int num_points)
205{
206 float* iBufferPtr = iBuffer;
207 float* qBufferPtr = qBuffer;
208
209 unsigned int number = 0;
210 const unsigned int sixteenthPoints = num_points / 16;
211 __m256 iFloatValue, qFloatValue;
212
213 const float iScalar = 1.0 / scalar;
214 __m256 invScalar = _mm256_set1_ps(iScalar);
215 __m256i complexVal, iIntVal, qIntVal, iComplexVal, qComplexVal;
216 int8_t* complexVectorPtr = (int8_t*)complexVector;
217
218 __m256i iMoveMask = _mm256_set_epi8(0x80,
219 0x80,
220 0x80,
221 0x80,
222 0x80,
223 0x80,
224 0x80,
225 0x80,
226 14,
227 12,
228 10,
229 8,
230 6,
231 4,
232 2,
233 0,
234 0x80,
235 0x80,
236 0x80,
237 0x80,
238 0x80,
239 0x80,
240 0x80,
241 0x80,
242 14,
243 12,
244 10,
245 8,
246 6,
247 4,
248 2,
249 0);
250 __m256i qMoveMask = _mm256_set_epi8(0x80,
251 0x80,
252 0x80,
253 0x80,
254 0x80,
255 0x80,
256 0x80,
257 0x80,
258 15,
259 13,
260 11,
261 9,
262 7,
263 5,
264 3,
265 1,
266 0x80,
267 0x80,
268 0x80,
269 0x80,
270 0x80,
271 0x80,
272 0x80,
273 0x80,
274 15,
275 13,
276 11,
277 9,
278 7,
279 5,
280 3,
281 1);
282
283 for (; number < sixteenthPoints; number++) {
284 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
285 complexVectorPtr += 32;
286 iComplexVal = _mm256_shuffle_epi8(complexVal, iMoveMask);
287 qComplexVal = _mm256_shuffle_epi8(complexVal, qMoveMask);
288
289 iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
290 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
291 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
292 _mm256_store_ps(iBufferPtr, iFloatValue);
293 iBufferPtr += 8;
294
295 iComplexVal = _mm256_permute4x64_epi64(iComplexVal, 0b11000110);
296 iIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(iComplexVal));
297 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
298 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
299 _mm256_store_ps(iBufferPtr, iFloatValue);
300 iBufferPtr += 8;
301
302 qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
303 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
304 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
305 _mm256_store_ps(qBufferPtr, qFloatValue);
306 qBufferPtr += 8;
307
308 qComplexVal = _mm256_permute4x64_epi64(qComplexVal, 0b11000110);
309 qIntVal = _mm256_cvtepi8_epi32(_mm256_castsi256_si128(qComplexVal));
310 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
311 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
312 _mm256_store_ps(qBufferPtr, qFloatValue);
313 qBufferPtr += 8;
314 }
315
316 number = sixteenthPoints * 16;
317 for (; number < num_points; number++) {
318 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
319 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
320 }
321}
322#endif /* LV_HAVE_AVX2 */
323
324
325#ifdef LV_HAVE_GENERIC
326
327static inline void
329 float* qBuffer,
330 const lv_8sc_t* complexVector,
331 const float scalar,
332 unsigned int num_points)
333{
334 const int8_t* complexVectorPtr = (const int8_t*)complexVector;
335 float* iBufferPtr = iBuffer;
336 float* qBufferPtr = qBuffer;
337 unsigned int number;
338 const float invScalar = 1.0 / scalar;
339 for (number = 0; number < num_points; number++) {
340 *iBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
341 *qBufferPtr++ = (float)(*complexVectorPtr++) * invScalar;
342 }
343}
344#endif /* LV_HAVE_GENERIC */
345
346
347#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_a_H */
348
349
350#ifndef INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
351#define INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H
352
353#include <inttypes.h>
354#include <stdio.h>
355#include <volk/volk_common.h>
356
357#ifdef LV_HAVE_AVX2
358#include <immintrin.h>
359
360static inline void volk_8ic_s32f_deinterleave_32f_x2_u_avx2(float* iBuffer,
361 float* qBuffer,
362 const lv_8sc_t* complexVector,
363 const float scalar,
364 unsigned int num_points)
365{
366 float* iBufferPtr = iBuffer;
367 float* qBufferPtr = qBuffer;
368
369 unsigned int number = 0;
370 const unsigned int sixteenthPoints = num_points / 16;
371 __m256 iFloatValue, qFloatValue;
372
373 const float iScalar = 1.0 / scalar;
374 __m256 invScalar = _mm256_set1_ps(iScalar);
375 __m256i complexVal, iIntVal, qIntVal;
376 __m128i iComplexVal, qComplexVal;
377 int8_t* complexVectorPtr = (int8_t*)complexVector;
378
379 __m256i MoveMask = _mm256_set_epi8(15,
380 13,
381 11,
382 9,
383 7,
384 5,
385 3,
386 1,
387 14,
388 12,
389 10,
390 8,
391 6,
392 4,
393 2,
394 0,
395 15,
396 13,
397 11,
398 9,
399 7,
400 5,
401 3,
402 1,
403 14,
404 12,
405 10,
406 8,
407 6,
408 4,
409 2,
410 0);
411
412 for (; number < sixteenthPoints; number++) {
413 complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
414 complexVectorPtr += 32;
415 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
416 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
417 iComplexVal = _mm256_extractf128_si256(complexVal, 0);
418 qComplexVal = _mm256_extractf128_si256(complexVal, 1);
419
420 iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
421 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
422 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
423 _mm256_storeu_ps(iBufferPtr, iFloatValue);
424 iBufferPtr += 8;
425
426 qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
427 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
428 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
429 _mm256_storeu_ps(qBufferPtr, qFloatValue);
430 qBufferPtr += 8;
431
432 complexVal = _mm256_srli_si256(complexVal, 8);
433 iComplexVal = _mm256_extractf128_si256(complexVal, 0);
434 qComplexVal = _mm256_extractf128_si256(complexVal, 1);
435
436 iIntVal = _mm256_cvtepi8_epi32(iComplexVal);
437 iFloatValue = _mm256_cvtepi32_ps(iIntVal);
438 iFloatValue = _mm256_mul_ps(iFloatValue, invScalar);
439 _mm256_storeu_ps(iBufferPtr, iFloatValue);
440 iBufferPtr += 8;
441
442 qIntVal = _mm256_cvtepi8_epi32(qComplexVal);
443 qFloatValue = _mm256_cvtepi32_ps(qIntVal);
444 qFloatValue = _mm256_mul_ps(qFloatValue, invScalar);
445 _mm256_storeu_ps(qBufferPtr, qFloatValue);
446 qBufferPtr += 8;
447 }
448
449 number = sixteenthPoints * 16;
450 for (; number < num_points; number++) {
451 *iBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
452 *qBufferPtr++ = (float)(*complexVectorPtr++) * iScalar;
453 }
454}
455#endif /* LV_HAVE_AVX2 */
456
457#endif /* INCLUDED_volk_8ic_s32f_deinterleave_32f_x2_u_H */
static void volk_8ic_s32f_deinterleave_32f_x2_a_sse(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:139
static void volk_8ic_s32f_deinterleave_32f_x2_generic(float *iBuffer, float *qBuffer, const lv_8sc_t *complexVector, const float scalar, unsigned int num_points)
Definition: volk_8ic_s32f_deinterleave_32f_x2.h:328
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61