Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
54#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_a_H
55#define INCLUDED_volk_16ic_deinterleave_real_8i_a_H
56
57#include <inttypes.h>
58#include <stdio.h>
59
60
61#ifdef LV_HAVE_AVX2
62#include <immintrin.h>
63
64static inline void volk_16ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
65 const lv_16sc_t* complexVector,
66 unsigned int num_points)
67{
68 unsigned int number = 0;
69 const int8_t* complexVectorPtr = (int8_t*)complexVector;
70 int8_t* iBufferPtr = iBuffer;
71 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
72 0x80,
73 0x80,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 13,
80 12,
81 9,
82 8,
83 5,
84 4,
85 1,
86 0,
87 0x80,
88 0x80,
89 0x80,
90 0x80,
91 0x80,
92 0x80,
93 0x80,
94 0x80,
95 13,
96 12,
97 9,
98 8,
99 5,
100 4,
101 1,
102 0);
103 __m256i iMoveMask2 = _mm256_set_epi8(13,
104 12,
105 9,
106 8,
107 5,
108 4,
109 1,
110 0,
111 0x80,
112 0x80,
113 0x80,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 13,
120 12,
121 9,
122 8,
123 5,
124 4,
125 1,
126 0,
127 0x80,
128 0x80,
129 0x80,
130 0x80,
131 0x80,
132 0x80,
133 0x80,
134 0x80);
135 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
136
137 unsigned int thirtysecondPoints = num_points / 32;
138
139 for (number = 0; number < thirtysecondPoints; number++) {
140 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
141 complexVectorPtr += 32;
142 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
143 complexVectorPtr += 32;
144
145 complexVal3 = _mm256_load_si256((__m256i*)complexVectorPtr);
146 complexVectorPtr += 32;
147 complexVal4 = _mm256_load_si256((__m256i*)complexVectorPtr);
148 complexVectorPtr += 32;
149
150 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
151 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
152
153 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
154 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
155
156 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
157 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
158
159 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
160 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
161
162 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
163 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
164
165 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
166 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
167
168 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
169
170 iBufferPtr += 32;
171 }
172
173 number = thirtysecondPoints * 32;
174 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
175 for (; number < num_points; number++) {
176 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
177 int16ComplexVectorPtr++;
178 }
179}
180#endif /* LV_HAVE_AVX2 */
181
182
183#ifdef LV_HAVE_SSSE3
184#include <tmmintrin.h>
185
186static inline void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
187 const lv_16sc_t* complexVector,
188 unsigned int num_points)
189{
190 unsigned int number = 0;
191 const int8_t* complexVectorPtr = (int8_t*)complexVector;
192 int8_t* iBufferPtr = iBuffer;
193 __m128i iMoveMask1 = _mm_set_epi8(
194 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
195 __m128i iMoveMask2 = _mm_set_epi8(
196 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
197 __m128i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
198
199 unsigned int sixteenthPoints = num_points / 16;
200
201 for (number = 0; number < sixteenthPoints; number++) {
202 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
203 complexVectorPtr += 16;
204 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
205 complexVectorPtr += 16;
206
207 complexVal3 = _mm_load_si128((__m128i*)complexVectorPtr);
208 complexVectorPtr += 16;
209 complexVal4 = _mm_load_si128((__m128i*)complexVectorPtr);
210 complexVectorPtr += 16;
211
212 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
213 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
214
215 complexVal1 = _mm_or_si128(complexVal1, complexVal2);
216
217 complexVal3 = _mm_shuffle_epi8(complexVal3, iMoveMask1);
218 complexVal4 = _mm_shuffle_epi8(complexVal4, iMoveMask2);
219
220 complexVal3 = _mm_or_si128(complexVal3, complexVal4);
221
222
223 complexVal1 = _mm_srai_epi16(complexVal1, 8);
224 complexVal3 = _mm_srai_epi16(complexVal3, 8);
225
226 iOutputVal = _mm_packs_epi16(complexVal1, complexVal3);
227
228 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
229
230 iBufferPtr += 16;
231 }
232
233 number = sixteenthPoints * 16;
234 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
235 for (; number < num_points; number++) {
236 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
237 int16ComplexVectorPtr++;
238 }
239}
240#endif /* LV_HAVE_SSSE3 */
241
242#ifdef LV_HAVE_GENERIC
243
244static inline void volk_16ic_deinterleave_real_8i_generic(int8_t* iBuffer,
245 const lv_16sc_t* complexVector,
246 unsigned int num_points)
247{
248 unsigned int number = 0;
249 int16_t* complexVectorPtr = (int16_t*)complexVector;
250 int8_t* iBufferPtr = iBuffer;
251 for (number = 0; number < num_points; number++) {
252 *iBufferPtr++ = ((int8_t)(*complexVectorPtr++ >> 8));
253 complexVectorPtr++;
254 }
255}
256#endif /* LV_HAVE_GENERIC */
257
258#ifdef LV_HAVE_NEON
259#include <arm_neon.h>
260
/*
 * NEON kernel: for every complex sample, take the first (real) 16-bit
 * component and store its upper byte as an int8_t.
 *
 * iBuffer        Output; receives num_points bytes.
 * complexVector  Input; read as consecutive pairs of int16_t of which only the
 *                first of each pair is used.
 * num_points     Number of complex samples to convert.
 *
 * Eight samples are handled per vector iteration; the remainder is handled by
 * a scalar loop that computes (int8_t)(real >> 8) per sample.
 */
static inline void volk_16ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                       const lv_16sc_t* complexVector,
                                                       unsigned int num_points)
{
    const unsigned int vectorIterations = num_points / 8;
    const int16_t* src = (const int16_t*)complexVector;
    int8_t* dst = iBuffer;
    unsigned int i;

    for (i = 0; i < vectorIterations; i++) {
        /* De-interleaving load: val[0] holds the even-indexed int16 values
         * (the reals), val[1] the odd-indexed ones (unused here). */
        const int16x8x2_t samples = vld2q_s16(src);

        /* Shift right by 8 and narrow each 16-bit lane to 8 bits. */
        vst1_s8(dst, vshrn_n_s16(samples.val[0], 8));

        src += 16;
        dst += 8;
    }

    /* Scalar tail for the samples that do not fill a whole vector iteration. */
    for (i = vectorIterations * 8; i < num_points; i++) {
        *dst++ = (int8_t)(src[0] >> 8);
        src += 2;
    }
}
285#endif
286
287#ifdef LV_HAVE_ORC
288
289extern void volk_16ic_deinterleave_real_8i_a_orc_impl(int8_t* iBuffer,
290 const lv_16sc_t* complexVector,
291 unsigned int num_points);
292
293static inline void volk_16ic_deinterleave_real_8i_u_orc(int8_t* iBuffer,
294 const lv_16sc_t* complexVector,
295 unsigned int num_points)
296{
297 volk_16ic_deinterleave_real_8i_a_orc_impl(iBuffer, complexVector, num_points);
298}
299#endif /* LV_HAVE_ORC */
300
301
302#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_a_H */
303
304#ifndef INCLUDED_volk_16ic_deinterleave_real_8i_u_H
305#define INCLUDED_volk_16ic_deinterleave_real_8i_u_H
306
307#include <inttypes.h>
308#include <stdio.h>
309
310
311#ifdef LV_HAVE_AVX2
312#include <immintrin.h>
313
314static inline void volk_16ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
315 const lv_16sc_t* complexVector,
316 unsigned int num_points)
317{
318 unsigned int number = 0;
319 const int8_t* complexVectorPtr = (int8_t*)complexVector;
320 int8_t* iBufferPtr = iBuffer;
321 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
322 0x80,
323 0x80,
324 0x80,
325 0x80,
326 0x80,
327 0x80,
328 0x80,
329 13,
330 12,
331 9,
332 8,
333 5,
334 4,
335 1,
336 0,
337 0x80,
338 0x80,
339 0x80,
340 0x80,
341 0x80,
342 0x80,
343 0x80,
344 0x80,
345 13,
346 12,
347 9,
348 8,
349 5,
350 4,
351 1,
352 0);
353 __m256i iMoveMask2 = _mm256_set_epi8(13,
354 12,
355 9,
356 8,
357 5,
358 4,
359 1,
360 0,
361 0x80,
362 0x80,
363 0x80,
364 0x80,
365 0x80,
366 0x80,
367 0x80,
368 0x80,
369 13,
370 12,
371 9,
372 8,
373 5,
374 4,
375 1,
376 0,
377 0x80,
378 0x80,
379 0x80,
380 0x80,
381 0x80,
382 0x80,
383 0x80,
384 0x80);
385 __m256i complexVal1, complexVal2, complexVal3, complexVal4, iOutputVal;
386
387 unsigned int thirtysecondPoints = num_points / 32;
388
389 for (number = 0; number < thirtysecondPoints; number++) {
390 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
391 complexVectorPtr += 32;
392 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
393 complexVectorPtr += 32;
394
395 complexVal3 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
396 complexVectorPtr += 32;
397 complexVal4 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
398 complexVectorPtr += 32;
399
400 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
401 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
402
403 complexVal1 = _mm256_or_si256(complexVal1, complexVal2);
404 complexVal1 = _mm256_permute4x64_epi64(complexVal1, 0xd8);
405
406 complexVal3 = _mm256_shuffle_epi8(complexVal3, iMoveMask1);
407 complexVal4 = _mm256_shuffle_epi8(complexVal4, iMoveMask2);
408
409 complexVal3 = _mm256_or_si256(complexVal3, complexVal4);
410 complexVal3 = _mm256_permute4x64_epi64(complexVal3, 0xd8);
411
412 complexVal1 = _mm256_srai_epi16(complexVal1, 8);
413 complexVal3 = _mm256_srai_epi16(complexVal3, 8);
414
415 iOutputVal = _mm256_packs_epi16(complexVal1, complexVal3);
416 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
417
418 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
419
420 iBufferPtr += 32;
421 }
422
423 number = thirtysecondPoints * 32;
424 int16_t* int16ComplexVectorPtr = (int16_t*)complexVectorPtr;
425 for (; number < num_points; number++) {
426 *iBufferPtr++ = ((int8_t)(*int16ComplexVectorPtr++ >> 8));
427 int16ComplexVectorPtr++;
428 }
429}
430#endif /* LV_HAVE_AVX2 */
431#endif /* INCLUDED_volk_16ic_deinterleave_real_8i_u_H */
static void volk_16ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:244
static void volk_16ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:261
static void volk_16ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_8i.h:186
short complex lv_16sc_t
Definition: volk_complex.h:62