Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_deinterleave_real_16i.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
54#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_a_H
55#define INCLUDED_volk_16ic_deinterleave_real_16i_a_H
56
57#include <inttypes.h>
58#include <stdio.h>
59
60
61#ifdef LV_HAVE_AVX2
62#include <immintrin.h>
63
64static inline void volk_16ic_deinterleave_real_16i_a_avx2(int16_t* iBuffer,
65 const lv_16sc_t* complexVector,
66 unsigned int num_points)
67{
68 unsigned int number = 0;
69 const int16_t* complexVectorPtr = (int16_t*)complexVector;
70 int16_t* iBufferPtr = iBuffer;
71
72 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
73 0x80,
74 0x80,
75 0x80,
76 0x80,
77 0x80,
78 0x80,
79 0x80,
80 13,
81 12,
82 9,
83 8,
84 5,
85 4,
86 1,
87 0,
88 0x80,
89 0x80,
90 0x80,
91 0x80,
92 0x80,
93 0x80,
94 0x80,
95 0x80,
96 13,
97 12,
98 9,
99 8,
100 5,
101 4,
102 1,
103 0);
104 __m256i iMoveMask2 = _mm256_set_epi8(13,
105 12,
106 9,
107 8,
108 5,
109 4,
110 1,
111 0,
112 0x80,
113 0x80,
114 0x80,
115 0x80,
116 0x80,
117 0x80,
118 0x80,
119 0x80,
120 13,
121 12,
122 9,
123 8,
124 5,
125 4,
126 1,
127 0,
128 0x80,
129 0x80,
130 0x80,
131 0x80,
132 0x80,
133 0x80,
134 0x80,
135 0x80);
136
137 __m256i complexVal1, complexVal2, iOutputVal;
138
139 unsigned int sixteenthPoints = num_points / 16;
140
141 for (number = 0; number < sixteenthPoints; number++) {
142 complexVal1 = _mm256_load_si256((__m256i*)complexVectorPtr);
143 complexVectorPtr += 16;
144 complexVal2 = _mm256_load_si256((__m256i*)complexVectorPtr);
145 complexVectorPtr += 16;
146
147 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
148 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
149
150 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
151 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
152
153 _mm256_store_si256((__m256i*)iBufferPtr, iOutputVal);
154
155 iBufferPtr += 16;
156 }
157
158 number = sixteenthPoints * 16;
159 for (; number < num_points; number++) {
160 *iBufferPtr++ = *complexVectorPtr++;
161 complexVectorPtr++;
162 }
163}
164#endif /* LV_HAVE_AVX2 */
165
166#ifdef LV_HAVE_SSSE3
167#include <tmmintrin.h>
168
169static inline void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t* iBuffer,
170 const lv_16sc_t* complexVector,
171 unsigned int num_points)
172{
173 unsigned int number = 0;
174 const int16_t* complexVectorPtr = (int16_t*)complexVector;
175 int16_t* iBufferPtr = iBuffer;
176
177 __m128i iMoveMask1 = _mm_set_epi8(
178 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 13, 12, 9, 8, 5, 4, 1, 0);
179 __m128i iMoveMask2 = _mm_set_epi8(
180 13, 12, 9, 8, 5, 4, 1, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80);
181
182 __m128i complexVal1, complexVal2, iOutputVal;
183
184 unsigned int eighthPoints = num_points / 8;
185
186 for (number = 0; number < eighthPoints; number++) {
187 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
188 complexVectorPtr += 8;
189 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
190 complexVectorPtr += 8;
191
192 complexVal1 = _mm_shuffle_epi8(complexVal1, iMoveMask1);
193 complexVal2 = _mm_shuffle_epi8(complexVal2, iMoveMask2);
194
195 iOutputVal = _mm_or_si128(complexVal1, complexVal2);
196
197 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
198
199 iBufferPtr += 8;
200 }
201
202 number = eighthPoints * 8;
203 for (; number < num_points; number++) {
204 *iBufferPtr++ = *complexVectorPtr++;
205 complexVectorPtr++;
206 }
207}
208#endif /* LV_HAVE_SSSE3 */
209
210
211#ifdef LV_HAVE_SSE2
212#include <emmintrin.h>
213
214static inline void volk_16ic_deinterleave_real_16i_a_sse2(int16_t* iBuffer,
215 const lv_16sc_t* complexVector,
216 unsigned int num_points)
217{
218 unsigned int number = 0;
219 const int16_t* complexVectorPtr = (int16_t*)complexVector;
220 int16_t* iBufferPtr = iBuffer;
221 __m128i complexVal1, complexVal2, iOutputVal;
222 __m128i lowMask = _mm_set_epi32(0x0, 0x0, 0xFFFFFFFF, 0xFFFFFFFF);
223 __m128i highMask = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x0, 0x0);
224
225 unsigned int eighthPoints = num_points / 8;
226
227 for (number = 0; number < eighthPoints; number++) {
228 complexVal1 = _mm_load_si128((__m128i*)complexVectorPtr);
229 complexVectorPtr += 8;
230 complexVal2 = _mm_load_si128((__m128i*)complexVectorPtr);
231 complexVectorPtr += 8;
232
233 complexVal1 = _mm_shufflelo_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
234
235 complexVal1 = _mm_shufflehi_epi16(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
236
237 complexVal1 = _mm_shuffle_epi32(complexVal1, _MM_SHUFFLE(3, 1, 2, 0));
238
239 complexVal2 = _mm_shufflelo_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
240
241 complexVal2 = _mm_shufflehi_epi16(complexVal2, _MM_SHUFFLE(3, 1, 2, 0));
242
243 complexVal2 = _mm_shuffle_epi32(complexVal2, _MM_SHUFFLE(2, 0, 3, 1));
244
245 iOutputVal = _mm_or_si128(_mm_and_si128(complexVal1, lowMask),
246 _mm_and_si128(complexVal2, highMask));
247
248 _mm_store_si128((__m128i*)iBufferPtr, iOutputVal);
249
250 iBufferPtr += 8;
251 }
252
253 number = eighthPoints * 8;
254 for (; number < num_points; number++) {
255 *iBufferPtr++ = *complexVectorPtr++;
256 complexVectorPtr++;
257 }
258}
259#endif /* LV_HAVE_SSE2 */
260
261#ifdef LV_HAVE_GENERIC
262
263static inline void volk_16ic_deinterleave_real_16i_generic(int16_t* iBuffer,
264 const lv_16sc_t* complexVector,
265 unsigned int num_points)
266{
267 unsigned int number = 0;
268 const int16_t* complexVectorPtr = (int16_t*)complexVector;
269 int16_t* iBufferPtr = iBuffer;
270 for (number = 0; number < num_points; number++) {
271 *iBufferPtr++ = *complexVectorPtr++;
272 complexVectorPtr++;
273 }
274}
275#endif /* LV_HAVE_GENERIC */
276
277
278#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_a_H */
279
280
281#ifndef INCLUDED_volk_16ic_deinterleave_real_16i_u_H
282#define INCLUDED_volk_16ic_deinterleave_real_16i_u_H
283
284#include <inttypes.h>
285#include <stdio.h>
286
287
288#ifdef LV_HAVE_AVX2
289#include <immintrin.h>
290
291static inline void volk_16ic_deinterleave_real_16i_u_avx2(int16_t* iBuffer,
292 const lv_16sc_t* complexVector,
293 unsigned int num_points)
294{
295 unsigned int number = 0;
296 const int16_t* complexVectorPtr = (int16_t*)complexVector;
297 int16_t* iBufferPtr = iBuffer;
298
299 __m256i iMoveMask1 = _mm256_set_epi8(0x80,
300 0x80,
301 0x80,
302 0x80,
303 0x80,
304 0x80,
305 0x80,
306 0x80,
307 13,
308 12,
309 9,
310 8,
311 5,
312 4,
313 1,
314 0,
315 0x80,
316 0x80,
317 0x80,
318 0x80,
319 0x80,
320 0x80,
321 0x80,
322 0x80,
323 13,
324 12,
325 9,
326 8,
327 5,
328 4,
329 1,
330 0);
331 __m256i iMoveMask2 = _mm256_set_epi8(13,
332 12,
333 9,
334 8,
335 5,
336 4,
337 1,
338 0,
339 0x80,
340 0x80,
341 0x80,
342 0x80,
343 0x80,
344 0x80,
345 0x80,
346 0x80,
347 13,
348 12,
349 9,
350 8,
351 5,
352 4,
353 1,
354 0,
355 0x80,
356 0x80,
357 0x80,
358 0x80,
359 0x80,
360 0x80,
361 0x80,
362 0x80);
363
364 __m256i complexVal1, complexVal2, iOutputVal;
365
366 unsigned int sixteenthPoints = num_points / 16;
367
368 for (number = 0; number < sixteenthPoints; number++) {
369 complexVal1 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
370 complexVectorPtr += 16;
371 complexVal2 = _mm256_loadu_si256((__m256i*)complexVectorPtr);
372 complexVectorPtr += 16;
373
374 complexVal1 = _mm256_shuffle_epi8(complexVal1, iMoveMask1);
375 complexVal2 = _mm256_shuffle_epi8(complexVal2, iMoveMask2);
376
377 iOutputVal = _mm256_or_si256(complexVal1, complexVal2);
378 iOutputVal = _mm256_permute4x64_epi64(iOutputVal, 0xd8);
379
380 _mm256_storeu_si256((__m256i*)iBufferPtr, iOutputVal);
381
382 iBufferPtr += 16;
383 }
384
385 number = sixteenthPoints * 16;
386 for (; number < num_points; number++) {
387 *iBufferPtr++ = *complexVectorPtr++;
388 complexVectorPtr++;
389 }
390}
391#endif /* LV_HAVE_AVX2 */
392
393#endif /* INCLUDED_volk_16ic_deinterleave_real_16i_u_H */
static void volk_16ic_deinterleave_real_16i_generic(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:263
static void volk_16ic_deinterleave_real_16i_a_sse2(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:214
static void volk_16ic_deinterleave_real_16i_a_ssse3(int16_t *iBuffer, const lv_16sc_t *complexVector, unsigned int num_points)
Definition: volk_16ic_deinterleave_real_16i.h:169
short complex lv_16sc_t
Definition: volk_complex.h:62