Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_x2_s32f_interleave_16ic.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
75#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
76#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H
77
78#include <inttypes.h>
79#include <stdio.h>
80#include <volk/volk_common.h>
81
82#ifdef LV_HAVE_AVX2
83#include <immintrin.h>
84
85static inline void volk_32f_x2_s32f_interleave_16ic_a_avx2(lv_16sc_t* complexVector,
86 const float* iBuffer,
87 const float* qBuffer,
88 const float scalar,
89 unsigned int num_points)
90{
91 unsigned int number = 0;
92 const float* iBufferPtr = iBuffer;
93 const float* qBufferPtr = qBuffer;
94
95 __m256 vScalar = _mm256_set1_ps(scalar);
96
97 const unsigned int eighthPoints = num_points / 8;
98
99 __m256 iValue, qValue, cplxValue1, cplxValue2;
100 __m256i intValue1, intValue2;
101
102 int16_t* complexVectorPtr = (int16_t*)complexVector;
103
104 for (; number < eighthPoints; number++) {
105 iValue = _mm256_load_ps(iBufferPtr);
106 qValue = _mm256_load_ps(qBufferPtr);
107
108 // Interleaves the lower two values in the i and q variables into one buffer
109 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
110 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
111
112 // Interleaves the upper two values in the i and q variables into one buffer
113 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
114 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
115
116 intValue1 = _mm256_cvtps_epi32(cplxValue1);
117 intValue2 = _mm256_cvtps_epi32(cplxValue2);
118
119 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
120
121 _mm256_store_si256((__m256i*)complexVectorPtr, intValue1);
122 complexVectorPtr += 16;
123
124 iBufferPtr += 8;
125 qBufferPtr += 8;
126 }
127
128 number = eighthPoints * 8;
129 complexVectorPtr = (int16_t*)(&complexVector[number]);
130 for (; number < num_points; number++) {
131 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
132 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
133 }
134}
135#endif /* LV_HAVE_AVX2 */
136
137
138#ifdef LV_HAVE_SSE2
139#include <emmintrin.h>
140
141static inline void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t* complexVector,
142 const float* iBuffer,
143 const float* qBuffer,
144 const float scalar,
145 unsigned int num_points)
146{
147 unsigned int number = 0;
148 const float* iBufferPtr = iBuffer;
149 const float* qBufferPtr = qBuffer;
150
151 __m128 vScalar = _mm_set_ps1(scalar);
152
153 const unsigned int quarterPoints = num_points / 4;
154
155 __m128 iValue, qValue, cplxValue1, cplxValue2;
156 __m128i intValue1, intValue2;
157
158 int16_t* complexVectorPtr = (int16_t*)complexVector;
159
160 for (; number < quarterPoints; number++) {
161 iValue = _mm_load_ps(iBufferPtr);
162 qValue = _mm_load_ps(qBufferPtr);
163
164 // Interleaves the lower two values in the i and q variables into one buffer
165 cplxValue1 = _mm_unpacklo_ps(iValue, qValue);
166 cplxValue1 = _mm_mul_ps(cplxValue1, vScalar);
167
168 // Interleaves the upper two values in the i and q variables into one buffer
169 cplxValue2 = _mm_unpackhi_ps(iValue, qValue);
170 cplxValue2 = _mm_mul_ps(cplxValue2, vScalar);
171
172 intValue1 = _mm_cvtps_epi32(cplxValue1);
173 intValue2 = _mm_cvtps_epi32(cplxValue2);
174
175 intValue1 = _mm_packs_epi32(intValue1, intValue2);
176
177 _mm_store_si128((__m128i*)complexVectorPtr, intValue1);
178 complexVectorPtr += 8;
179
180 iBufferPtr += 4;
181 qBufferPtr += 4;
182 }
183
184 number = quarterPoints * 4;
185 complexVectorPtr = (int16_t*)(&complexVector[number]);
186 for (; number < num_points; number++) {
187 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
188 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
189 }
190}
191#endif /* LV_HAVE_SSE2 */
192
193
194#ifdef LV_HAVE_SSE
195#include <xmmintrin.h>
196
197static inline void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t* complexVector,
198 const float* iBuffer,
199 const float* qBuffer,
200 const float scalar,
201 unsigned int num_points)
202{
203 unsigned int number = 0;
204 const float* iBufferPtr = iBuffer;
205 const float* qBufferPtr = qBuffer;
206
207 __m128 vScalar = _mm_set_ps1(scalar);
208
209 const unsigned int quarterPoints = num_points / 4;
210
211 __m128 iValue, qValue, cplxValue;
212
213 int16_t* complexVectorPtr = (int16_t*)complexVector;
214
215 __VOLK_ATTR_ALIGNED(16) float floatBuffer[4];
216
217 for (; number < quarterPoints; number++) {
218 iValue = _mm_load_ps(iBufferPtr);
219 qValue = _mm_load_ps(qBufferPtr);
220
221 // Interleaves the lower two values in the i and q variables into one buffer
222 cplxValue = _mm_unpacklo_ps(iValue, qValue);
223 cplxValue = _mm_mul_ps(cplxValue, vScalar);
224
225 _mm_store_ps(floatBuffer, cplxValue);
226
227 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
228 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
229 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
230 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
231
232 // Interleaves the upper two values in the i and q variables into one buffer
233 cplxValue = _mm_unpackhi_ps(iValue, qValue);
234 cplxValue = _mm_mul_ps(cplxValue, vScalar);
235
236 _mm_store_ps(floatBuffer, cplxValue);
237
238 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[0]);
239 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[1]);
240 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[2]);
241 *complexVectorPtr++ = (int16_t)rintf(floatBuffer[3]);
242
243 iBufferPtr += 4;
244 qBufferPtr += 4;
245 }
246
247 number = quarterPoints * 4;
248 complexVectorPtr = (int16_t*)(&complexVector[number]);
249 for (; number < num_points; number++) {
250 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
251 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
252 }
253}
254#endif /* LV_HAVE_SSE */
255
256
257#ifdef LV_HAVE_GENERIC
258
259static inline void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t* complexVector,
260 const float* iBuffer,
261 const float* qBuffer,
262 const float scalar,
263 unsigned int num_points)
264{
265 int16_t* complexVectorPtr = (int16_t*)complexVector;
266 const float* iBufferPtr = iBuffer;
267 const float* qBufferPtr = qBuffer;
268 unsigned int number = 0;
269
270 for (number = 0; number < num_points; number++) {
271 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
272 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
273 }
274}
275#endif /* LV_HAVE_GENERIC */
276
277
278#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_a_H */
279
280#ifndef INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
281#define INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H
282
283#include <inttypes.h>
284#include <stdio.h>
285#include <volk/volk_common.h>
286
287#ifdef LV_HAVE_AVX2
288#include <immintrin.h>
289
290static inline void volk_32f_x2_s32f_interleave_16ic_u_avx2(lv_16sc_t* complexVector,
291 const float* iBuffer,
292 const float* qBuffer,
293 const float scalar,
294 unsigned int num_points)
295{
296 unsigned int number = 0;
297 const float* iBufferPtr = iBuffer;
298 const float* qBufferPtr = qBuffer;
299
300 __m256 vScalar = _mm256_set1_ps(scalar);
301
302 const unsigned int eighthPoints = num_points / 8;
303
304 __m256 iValue, qValue, cplxValue1, cplxValue2;
305 __m256i intValue1, intValue2;
306
307 int16_t* complexVectorPtr = (int16_t*)complexVector;
308
309 for (; number < eighthPoints; number++) {
310 iValue = _mm256_loadu_ps(iBufferPtr);
311 qValue = _mm256_loadu_ps(qBufferPtr);
312
313 // Interleaves the lower two values in the i and q variables into one buffer
314 cplxValue1 = _mm256_unpacklo_ps(iValue, qValue);
315 cplxValue1 = _mm256_mul_ps(cplxValue1, vScalar);
316
317 // Interleaves the upper two values in the i and q variables into one buffer
318 cplxValue2 = _mm256_unpackhi_ps(iValue, qValue);
319 cplxValue2 = _mm256_mul_ps(cplxValue2, vScalar);
320
321 intValue1 = _mm256_cvtps_epi32(cplxValue1);
322 intValue2 = _mm256_cvtps_epi32(cplxValue2);
323
324 intValue1 = _mm256_packs_epi32(intValue1, intValue2);
325
326 _mm256_storeu_si256((__m256i*)complexVectorPtr, intValue1);
327 complexVectorPtr += 16;
328
329 iBufferPtr += 8;
330 qBufferPtr += 8;
331 }
332
333 number = eighthPoints * 8;
334 complexVectorPtr = (int16_t*)(&complexVector[number]);
335 for (; number < num_points; number++) {
336 *complexVectorPtr++ = (int16_t)rintf(*iBufferPtr++ * scalar);
337 *complexVectorPtr++ = (int16_t)rintf(*qBufferPtr++ * scalar);
338 }
339}
340#endif /* LV_HAVE_AVX2 */
341
342
343#endif /* INCLUDED_volk_32f_x2_s32f_interleave_16ic_u_H */
static float rintf(float x)
Definition: config.h:37
static void volk_32f_x2_s32f_interleave_16ic_a_sse2(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:141
static void volk_32f_x2_s32f_interleave_16ic_a_sse(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:197
static void volk_32f_x2_s32f_interleave_16ic_generic(lv_16sc_t *complexVector, const float *iBuffer, const float *qBuffer, const float scalar, unsigned int num_points)
Definition: volk_32f_x2_s32f_interleave_16ic.h:259
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
short complex lv_16sc_t
Definition: volk_complex.h:62