Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_16i_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
55#define INCLUDED_volk_8ic_deinterleave_16i_x2_a_H
56
57#include <inttypes.h>
58#include <stdio.h>
59
60#ifdef LV_HAVE_AVX2
61#include <immintrin.h>
62
63static inline void volk_8ic_deinterleave_16i_x2_a_avx2(int16_t* iBuffer,
64 int16_t* qBuffer,
65 const lv_8sc_t* complexVector,
66 unsigned int num_points)
67{
68 unsigned int number = 0;
69 const int8_t* complexVectorPtr = (int8_t*)complexVector;
70 int16_t* iBufferPtr = iBuffer;
71 int16_t* qBufferPtr = qBuffer;
72 __m256i MoveMask = _mm256_set_epi8(15,
73 13,
74 11,
75 9,
76 7,
77 5,
78 3,
79 1,
80 14,
81 12,
82 10,
83 8,
84 6,
85 4,
86 2,
87 0,
88 15,
89 13,
90 11,
91 9,
92 7,
93 5,
94 3,
95 1,
96 14,
97 12,
98 10,
99 8,
100 6,
101 4,
102 2,
103 0);
104 __m256i complexVal, iOutputVal, qOutputVal;
105 __m128i iOutputVal0, qOutputVal0;
106
107 unsigned int sixteenthPoints = num_points / 16;
108
109 for (number = 0; number < sixteenthPoints; number++) {
110 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
111 complexVectorPtr += 32;
112
113 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
114 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
115
116 iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
117 qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
118
119 iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
120 iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
121
122 qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
123 qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
124
125 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
126 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
127
128 iBufferPtr += 16;
129 qBufferPtr += 16;
130 }
131
132 number = sixteenthPoints * 16;
133 for (; number < num_points; number++) {
134 *iBufferPtr++ =
135 ((int16_t)*complexVectorPtr++) *
136 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
137 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
138 }
139}
140#endif /* LV_HAVE_AVX2 */
141
142#ifdef LV_HAVE_SSE4_1
143#include <smmintrin.h>
144
145static inline void volk_8ic_deinterleave_16i_x2_a_sse4_1(int16_t* iBuffer,
146 int16_t* qBuffer,
147 const lv_8sc_t* complexVector,
148 unsigned int num_points)
149{
150 unsigned int number = 0;
151 const int8_t* complexVectorPtr = (int8_t*)complexVector;
152 int16_t* iBufferPtr = iBuffer;
153 int16_t* qBufferPtr = qBuffer;
154 __m128i iMoveMask = _mm_set_epi8(0x80,
155 0x80,
156 0x80,
157 0x80,
158 0x80,
159 0x80,
160 0x80,
161 0x80,
162 14,
163 12,
164 10,
165 8,
166 6,
167 4,
168 2,
169 0); // set 16 byte values
170 __m128i qMoveMask = _mm_set_epi8(
171 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
172 __m128i complexVal, iOutputVal, qOutputVal;
173
174 unsigned int eighthPoints = num_points / 8;
175
176 for (number = 0; number < eighthPoints; number++) {
177 complexVal = _mm_load_si128((__m128i*)complexVectorPtr);
178 complexVectorPtr += 16; // aligned load
179
180 iOutputVal = _mm_shuffle_epi8(complexVal,
181 iMoveMask); // shuffle 16 bytes of 128bit complexVal
182 qOutputVal = _mm_shuffle_epi8(complexVal, qMoveMask);
183
184 iOutputVal = _mm_cvtepi8_epi16(iOutputVal); // fills 2-byte sign extended versions
185 // of lower 8 bytes of input to output
186 iOutputVal =
187 _mm_slli_epi16(iOutputVal, 8); // shift in left by 8 bits, each of the 8
188 // 16-bit integers, shift in with zeros
189
190 qOutputVal = _mm_cvtepi8_epi16(qOutputVal);
191 qOutputVal = _mm_slli_epi16(qOutputVal, 8);
192
193 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal); // aligned store
194 _mm_store_si128((__m128i*)qBufferPtr, qOutputVal);
195
196 iBufferPtr += 8;
197 qBufferPtr += 8;
198 }
199
200 number = eighthPoints * 8;
201 for (; number < num_points; number++) {
202 *iBufferPtr++ =
203 ((int16_t)*complexVectorPtr++) *
204 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
205 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
206 }
207}
208#endif /* LV_HAVE_SSE4_1 */
209
210
211#ifdef LV_HAVE_AVX
212#include <immintrin.h>
213
214static inline void volk_8ic_deinterleave_16i_x2_a_avx(int16_t* iBuffer,
215 int16_t* qBuffer,
216 const lv_8sc_t* complexVector,
217 unsigned int num_points)
218{
219 unsigned int number = 0;
220 const int8_t* complexVectorPtr = (int8_t*)complexVector;
221 int16_t* iBufferPtr = iBuffer;
222 int16_t* qBufferPtr = qBuffer;
223 __m128i iMoveMask = _mm_set_epi8(0x80,
224 0x80,
225 0x80,
226 0x80,
227 0x80,
228 0x80,
229 0x80,
230 0x80,
231 14,
232 12,
233 10,
234 8,
235 6,
236 4,
237 2,
238 0); // set 16 byte values
239 __m128i qMoveMask = _mm_set_epi8(
240 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 15, 13, 11, 9, 7, 5, 3, 1);
241 __m256i complexVal, iOutputVal, qOutputVal;
242 __m128i complexVal1, complexVal0;
243 __m128i iOutputVal1, iOutputVal0, qOutputVal1, qOutputVal0;
244
245 unsigned int sixteenthPoints = num_points / 16;
246
247 for (number = 0; number < sixteenthPoints; number++) {
248 complexVal = _mm256_load_si256((__m256i*)complexVectorPtr);
249 complexVectorPtr += 32; // aligned load
250
251 // Extract from complexVal to iOutputVal and qOutputVal
252 complexVal1 = _mm256_extractf128_si256(complexVal, 1);
253 complexVal0 = _mm256_extractf128_si256(complexVal, 0);
254
255 iOutputVal1 = _mm_shuffle_epi8(
256 complexVal1, iMoveMask); // shuffle 16 bytes of 128bit complexVal
257 iOutputVal0 = _mm_shuffle_epi8(complexVal0, iMoveMask);
258 qOutputVal1 = _mm_shuffle_epi8(complexVal1, qMoveMask);
259 qOutputVal0 = _mm_shuffle_epi8(complexVal0, qMoveMask);
260
261 iOutputVal1 =
262 _mm_cvtepi8_epi16(iOutputVal1); // fills 2-byte sign extended versions of
263 // lower 8 bytes of input to output
264 iOutputVal1 =
265 _mm_slli_epi16(iOutputVal1, 8); // shift in left by 8 bits, each of the 8
266 // 16-bit integers, shift in with zeros
267 iOutputVal0 = _mm_cvtepi8_epi16(iOutputVal0);
268 iOutputVal0 = _mm_slli_epi16(iOutputVal0, 8);
269
270 qOutputVal1 = _mm_cvtepi8_epi16(qOutputVal1);
271 qOutputVal1 = _mm_slli_epi16(qOutputVal1, 8);
272 qOutputVal0 = _mm_cvtepi8_epi16(qOutputVal0);
273 qOutputVal0 = _mm_slli_epi16(qOutputVal0, 8);
274
275 // Pack iOutputVal0,1 to iOutputVal
276 __m256i dummy = _mm256_setzero_si256();
277 iOutputVal = _mm256_insertf128_si256(dummy, iOutputVal0, 0);
278 iOutputVal = _mm256_insertf128_si256(iOutputVal, iOutputVal1, 1);
279 qOutputVal = _mm256_insertf128_si256(dummy, qOutputVal0, 0);
280 qOutputVal = _mm256_insertf128_si256(qOutputVal, qOutputVal1, 1);
281
282 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal); // aligned store
283 _mm256_store_si256((__m256i*)qBufferPtr, qOutputVal);
284
285 iBufferPtr += 16;
286 qBufferPtr += 16;
287 }
288
289 number = sixteenthPoints * 16;
290 for (; number < num_points; number++) {
291 *iBufferPtr++ =
292 ((int16_t)*complexVectorPtr++) *
293 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
294 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
295 }
296}
297#endif /* LV_HAVE_AVX */
298
299
300#ifdef LV_HAVE_GENERIC
301
302static inline void volk_8ic_deinterleave_16i_x2_generic(int16_t* iBuffer,
303 int16_t* qBuffer,
304 const lv_8sc_t* complexVector,
305 unsigned int num_points)
306{
307 const int8_t* complexVectorPtr = (const int8_t*)complexVector;
308 int16_t* iBufferPtr = iBuffer;
309 int16_t* qBufferPtr = qBuffer;
310 unsigned int number;
311 for (number = 0; number < num_points; number++) {
312 *iBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
313 *qBufferPtr++ = (int16_t)(*complexVectorPtr++) * 256;
314 }
315}
316#endif /* LV_HAVE_GENERIC */
317
318
319#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_a_H */
320
321#ifndef INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
322#define INCLUDED_volk_8ic_deinterleave_16i_x2_u_H
323
324#include <inttypes.h>
325#include <stdio.h>
326
327#ifdef LV_HAVE_AVX2
328#include <immintrin.h>
329
330static inline void volk_8ic_deinterleave_16i_x2_u_avx2(int16_t* iBuffer,
331 int16_t* qBuffer,
332 const lv_8sc_t* complexVector,
333 unsigned int num_points)
334{
335 unsigned int number = 0;
336 const int8_t* complexVectorPtr = (int8_t*)complexVector;
337 int16_t* iBufferPtr = iBuffer;
338 int16_t* qBufferPtr = qBuffer;
339 __m256i MoveMask = _mm256_set_epi8(15,
340 13,
341 11,
342 9,
343 7,
344 5,
345 3,
346 1,
347 14,
348 12,
349 10,
350 8,
351 6,
352 4,
353 2,
354 0,
355 15,
356 13,
357 11,
358 9,
359 7,
360 5,
361 3,
362 1,
363 14,
364 12,
365 10,
366 8,
367 6,
368 4,
369 2,
370 0);
371 __m256i complexVal, iOutputVal, qOutputVal;
372 __m128i iOutputVal0, qOutputVal0;
373
374 unsigned int sixteenthPoints = num_points / 16;
375
376 for (number = 0; number < sixteenthPoints; number++) {
377 complexVal = _mm256_loadu_si256((__m256i*)complexVectorPtr);
378 complexVectorPtr += 32;
379
380 complexVal = _mm256_shuffle_epi8(complexVal, MoveMask);
381 complexVal = _mm256_permute4x64_epi64(complexVal, 0xd8);
382
383 iOutputVal0 = _mm256_extracti128_si256(complexVal, 0);
384 qOutputVal0 = _mm256_extracti128_si256(complexVal, 1);
385
386 iOutputVal = _mm256_cvtepi8_epi16(iOutputVal0);
387 iOutputVal = _mm256_slli_epi16(iOutputVal, 8);
388
389 qOutputVal = _mm256_cvtepi8_epi16(qOutputVal0);
390 qOutputVal = _mm256_slli_epi16(qOutputVal, 8);
391
392 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
393 _mm256_storeu_si256((__m256i*)qBufferPtr, qOutputVal);
394
395 iBufferPtr += 16;
396 qBufferPtr += 16;
397 }
398
399 number = sixteenthPoints * 16;
400 for (; number < num_points; number++) {
401 *iBufferPtr++ =
402 ((int16_t)*complexVectorPtr++) *
403 256; // load 8 bit Complexvector into 16 bit, shift left by 8 bits and store
404 *qBufferPtr++ = ((int16_t)*complexVectorPtr++) * 256;
405 }
406}
407#endif /* LV_HAVE_AVX2 */
408#endif /* INCLUDED_volk_8ic_deinterleave_16i_x2_u_H */
static void volk_8ic_deinterleave_16i_x2_generic(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:302
static void volk_8ic_deinterleave_16i_x2_a_avx(int16_t *iBuffer, int16_t *qBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_16i_x2.h:214
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61