Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_8ic_deinterleave_real_8i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
53#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
54#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H
55
56#include <inttypes.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_AVX2
60#include <immintrin.h>
61
62static inline void volk_8ic_deinterleave_real_8i_a_avx2(int8_t* iBuffer,
63 const lv_8sc_t* complexVector,
64 unsigned int num_points)
65{
66 unsigned int number = 0;
67 const int8_t* complexVectorPtr = (int8_t*)complexVector;
68 int8_t* iBufferPtr = iBuffer;
69 __m256i moveMask1 = _mm256_set_epi8(0x80,
70 0x80,
71 0x80,
72 0x80,
73 0x80,
74 0x80,
75 0x80,
76 0x80,
77 14,
78 12,
79 10,
80 8,
81 6,
82 4,
83 2,
84 0,
85 0x80,
86 0x80,
87 0x80,
88 0x80,
89 0x80,
90 0x80,
91 0x80,
92 0x80,
93 14,
94 12,
95 10,
96 8,
97 6,
98 4,
99 2,
100 0);
101 __m256i moveMask2 = _mm256_set_epi8(14,
102 12,
103 10,
104 8,
105 6,
106 4,
107 2,
108 0,
109 0x80,
110 0x80,
111 0x80,
112 0x80,
113 0x80,
114 0x80,
115 0x80,
116 0x80,
117 14,
118 12,
119 10,
120 8,
121 6,
122 4,
123 2,
124 0,
125 0x80,
126 0x80,
127 0x80,
128 0x80,
129 0x80,
130 0x80,
131 0x80,
132 0x80);
133 __m256i complexVal1, complexVal2, outputVal;
134
135 unsigned int thirtysecondPoints = num_points / 32;
136
137 for (number = 0; number < thirtysecondPoints; number++) {
138
139 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
140 complexVectorPtr += 32;
141 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
142 complexVectorPtr += 32;
143
144 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
145 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
146 outputVal = _mm256_or_si256(complexVal1, complexVal2);
147 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
148
149 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
150 iBufferPtr += 32;
151 }
152
153 number = thirtysecondPoints * 32;
154 for (; number < num_points; number++) {
155 *iBufferPtr++ = *complexVectorPtr++;
156 complexVectorPtr++;
157 }
158}
159#endif /* LV_HAVE_AVX2 */
160
161
162#ifdef LV_HAVE_SSSE3
163#include <tmmintrin.h>
164
165static inline void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t* iBuffer,
166 const lv_8sc_t* complexVector,
167 unsigned int num_points)
168{
169 unsigned int number = 0;
170 const int8_t* complexVectorPtr = (int8_t*)complexVector;
171 int8_t* iBufferPtr = iBuffer;
172 __m128i moveMask1 = _mm_set_epi8(
173 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
174 __m128i moveMask2 = _mm_set_epi8(
175 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
176 __m128i complexVal1, complexVal2, outputVal;
177
178 unsigned int sixteenthPoints = num_points / 16;
179
180 for (number = 0; number < sixteenthPoints; number++) {
181 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
182 complexVectorPtr += 16;
183 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
184 complexVectorPtr += 16;
185
186 complexVal1 = _mm_shuffle_epi8(complexVal1, moveMask1);
187 complexVal2 = _mm_shuffle_epi8(complexVal2, moveMask2);
188
189 outputVal = _mm_or_si128(complexVal1, complexVal2);
190
191 _mm_store_si128((__m128i*)iBufferPtr, outputVal);
192 iBufferPtr += 16;
193 }
194
195 number = sixteenthPoints * 16;
196 for (; number < num_points; number++) {
197 *iBufferPtr++ = *complexVectorPtr++;
198 complexVectorPtr++;
199 }
200}
201#endif /* LV_HAVE_SSSE3 */
202
203
204#ifdef LV_HAVE_AVX
205#include <immintrin.h>
206
207static inline void volk_8ic_deinterleave_real_8i_a_avx(int8_t* iBuffer,
208 const lv_8sc_t* complexVector,
209 unsigned int num_points)
210{
211 unsigned int number = 0;
212 const int8_t* complexVectorPtr = (int8_t*)complexVector;
213 int8_t* iBufferPtr = iBuffer;
214 __m128i moveMaskL = _mm_set_epi8(
215 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 14, 12, 10, 8, 6, 4, 2, 0);
216 __m128i moveMaskH = _mm_set_epi8(
217 14, 12, 10, 8, 6, 4, 2, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
218 __m256i complexVal1, complexVal2, outputVal;
219 __m128i complexVal1H, complexVal1L, complexVal2H, complexVal2L, outputVal1,
220 outputVal2;
221
222 unsigned int thirtysecondPoints = num_points / 32;
223
224 for (number = 0; number < thirtysecondPoints; number++) {
225
226 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
227 complexVectorPtr += 32;
228 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
229 complexVectorPtr += 32;
230
231 complexVal1H = _mm256_extractf128_si256(complexVal1, 1);
232 complexVal1L = _mm256_extractf128_si256(complexVal1, 0);
233 complexVal2H = _mm256_extractf128_si256(complexVal2, 1);
234 complexVal2L = _mm256_extractf128_si256(complexVal2, 0);
235
236 complexVal1H = _mm_shuffle_epi8(complexVal1H, moveMaskH);
237 complexVal1L = _mm_shuffle_epi8(complexVal1L, moveMaskL);
238 outputVal1 = _mm_or_si128(complexVal1H, complexVal1L);
239
240
241 complexVal2H = _mm_shuffle_epi8(complexVal2H, moveMaskH);
242 complexVal2L = _mm_shuffle_epi8(complexVal2L, moveMaskL);
243 outputVal2 = _mm_or_si128(complexVal2H, complexVal2L);
244
245 __m256i dummy = _mm256_setzero_si256();
246 outputVal = _mm256_insertf128_si256(dummy, outputVal1, 0);
247 outputVal = _mm256_insertf128_si256(outputVal, outputVal2, 1);
248
249
250 _mm256_store_si256((__m256i*)iBufferPtr, outputVal);
251 iBufferPtr += 32;
252 }
253
254 number = thirtysecondPoints * 32;
255 for (; number < num_points; number++) {
256 *iBufferPtr++ = *complexVectorPtr++;
257 complexVectorPtr++;
258 }
259}
260#endif /* LV_HAVE_AVX */
261
262
263#ifdef LV_HAVE_GENERIC
264
265static inline void volk_8ic_deinterleave_real_8i_generic(int8_t* iBuffer,
266 const lv_8sc_t* complexVector,
267 unsigned int num_points)
268{
269 unsigned int number = 0;
270 const int8_t* complexVectorPtr = (int8_t*)complexVector;
271 int8_t* iBufferPtr = iBuffer;
272 for (number = 0; number < num_points; number++) {
273 *iBufferPtr++ = *complexVectorPtr++;
274 complexVectorPtr++;
275 }
276}
277#endif /* LV_HAVE_GENERIC */
278
279
280#ifdef LV_HAVE_NEON
281#include <arm_neon.h>
282
/*!
 * \brief NEON kernel: copies the first byte of every two-byte complex sample
 *        into iBuffer.
 *
 * Sixteen samples are handled per vector iteration using a two-way
 * de-interleaving load; any remaining samples are copied one at a time.
 *
 * \param iBuffer       Output; receives num_points bytes.
 * \param complexVector Input holding num_points interleaved byte pairs.
 * \param num_points    Number of complex samples to process.
 */
static inline void volk_8ic_deinterleave_real_8i_neon(int8_t* iBuffer,
                                                      const lv_8sc_t* complexVector,
                                                      unsigned int num_points)
{
    const unsigned int vector_iterations = num_points / 16;
    const lv_8sc_t* in = complexVector;
    int8_t* out = iBuffer;
    const int8_t* tail_in;
    unsigned int idx;

    for (idx = 0; idx < vector_iterations; ++idx) {
        /* val[0] is the de-interleaved half that gets written out; val[1] is
         * discarded. */
        const int8x16x2_t lanes = vld2q_s8((const int8_t*)in);
        vst1q_s8(out, lanes.val[0]);
        in += 16; /* 16 complex samples */
        out += 16;
    }

    /* Scalar tail: keep one byte, skip the next. */
    tail_in = (const int8_t*)in;
    for (idx = vector_iterations * 16; idx < num_points; ++idx) {
        *out++ = *tail_in;
        tail_in += 2;
    }
}
305#endif /* LV_HAVE_NEON */
306
307
308#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_ALIGNED8_H */
309
310#ifndef INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
311#define INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H
312
313#include <inttypes.h>
314#include <stdio.h>
315
316#ifdef LV_HAVE_AVX2
317#include <immintrin.h>
318
319static inline void volk_8ic_deinterleave_real_8i_u_avx2(int8_t* iBuffer,
320 const lv_8sc_t* complexVector,
321 unsigned int num_points)
322{
323 unsigned int number = 0;
324 const int8_t* complexVectorPtr = (int8_t*)complexVector;
325 int8_t* iBufferPtr = iBuffer;
326 __m256i moveMask1 = _mm256_set_epi8(0x80,
327 0x80,
328 0x80,
329 0x80,
330 0x80,
331 0x80,
332 0x80,
333 0x80,
334 14,
335 12,
336 10,
337 8,
338 6,
339 4,
340 2,
341 0,
342 0x80,
343 0x80,
344 0x80,
345 0x80,
346 0x80,
347 0x80,
348 0x80,
349 0x80,
350 14,
351 12,
352 10,
353 8,
354 6,
355 4,
356 2,
357 0);
358 __m256i moveMask2 = _mm256_set_epi8(14,
359 12,
360 10,
361 8,
362 6,
363 4,
364 2,
365 0,
366 0x80,
367 0x80,
368 0x80,
369 0x80,
370 0x80,
371 0x80,
372 0x80,
373 0x80,
374 14,
375 12,
376 10,
377 8,
378 6,
379 4,
380 2,
381 0,
382 0x80,
383 0x80,
384 0x80,
385 0x80,
386 0x80,
387 0x80,
388 0x80,
389 0x80);
390 __m256i complexVal1, complexVal2, outputVal;
391
392 unsigned int thirtysecondPoints = num_points / 32;
393
394 for (number = 0; number < thirtysecondPoints; number++) {
395
396 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
397 complexVectorPtr += 32;
398 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
399 complexVectorPtr += 32;
400
401 complexVal1 = _mm256_shuffle_epi8(complexVal1, moveMask1);
402 complexVal2 = _mm256_shuffle_epi8(complexVal2, moveMask2);
403 outputVal = _mm256_or_si256(complexVal1, complexVal2);
404 outputVal = _mm256_permute4x64_epi64(outputVal, 0xd8);
405
406 _mm256_storeu_si256((__m256i*)iBufferPtr, outputVal);
407 iBufferPtr += 32;
408 }
409
410 number = thirtysecondPoints * 32;
411 for (; number < num_points; number++) {
412 *iBufferPtr++ = *complexVectorPtr++;
413 complexVectorPtr++;
414 }
415}
416#endif /* LV_HAVE_AVX2 */
417
418#endif /* INCLUDED_VOLK_8sc_DEINTERLEAVE_REAL_8s_UNALIGNED8_H */
static void volk_8ic_deinterleave_real_8i_a_ssse3(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:165
static void volk_8ic_deinterleave_real_8i_a_avx(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:207
static void volk_8ic_deinterleave_real_8i_neon(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:283
static void volk_8ic_deinterleave_real_8i_generic(int8_t *iBuffer, const lv_8sc_t *complexVector, unsigned int num_points)
Definition: volk_8ic_deinterleave_real_8i.h:265
char complex lv_8sc_t
Provide typedefs and operators for all complex types in C and C++.
Definition: volk_complex.h:61