Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_8i_convert_16i.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
53#ifndef INCLUDED_volk_8i_convert_16i_u_H
54#define INCLUDED_volk_8i_convert_16i_u_H
55
56#include <inttypes.h>
57#include <stdio.h>
58
59#ifdef LV_HAVE_AVX2
60#include <immintrin.h>
61
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (i.e. value << 8). AVX2 variant built on unaligned loads
 * and stores, so neither buffer needs any particular alignment.
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;

    /* Each pass sign-extends 16 input bytes to 16 words (32 output bytes). */
    for (; number < sixteenthPoints; number++) {
        const __m128i inputVal = _mm_loadu_si128(inputVectorPtr);
        __m256i ret = _mm256_cvtepi8_epi16(inputVal);
        ret = _mm256_slli_epi16(ret, 8); /* multiply by 256 */
        _mm256_storeu_si256(outputVectorPtr, ret);

        outputVectorPtr++;
        inputVectorPtr++;
    }

    /* Scalar tail: the num_points % 16 samples the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
89#endif /* LV_HAVE_AVX2 */
90
91
92#ifdef LV_HAVE_SSE4_1
93#include <smmintrin.h>
94
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (i.e. value << 8). SSE4.1 variant built on unaligned loads
 * and stores, so neither buffer needs any particular alignment.
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_u_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;

    /* One 16-byte load feeds two 8-sample conversions per pass. */
    for (; number < sixteenthPoints; number++) {
        __m128i inputVal = _mm_loadu_si128(inputVectorPtr);
        __m128i ret;

        /* Lower eight samples: sign-extend to 16 bits, then multiply by 256. */
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_storeu_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        /* Byte-shift the upper eight samples into the low half and repeat. */
        inputVal = _mm_srli_si128(inputVal, 8);
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_storeu_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        inputVectorPtr++;
    }

    /* Scalar tail: the num_points % 16 samples the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
130#endif /* LV_HAVE_SSE4_1 */
131
132
133#ifdef LV_HAVE_GENERIC
134
/*
 * Portable reference kernel: converts num_points signed 8-bit samples to
 * signed 16-bit samples, scaling each one by 256.
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_generic(int16_t* outputVector,
                                               const int8_t* inputVector,
                                               unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        /* The product spans [-32768, 32512], so the narrowing is lossless. */
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
147#endif /* LV_HAVE_GENERIC */
148
149
150#endif /* INCLUDED_volk_8i_convert_16i_u_H */
151
152
153#ifndef INCLUDED_volk_8i_convert_16i_a_H
154#define INCLUDED_volk_8i_convert_16i_a_H
155
156#include <inttypes.h>
157#include <stdio.h>
158
159#ifdef LV_HAVE_AVX2
160#include <immintrin.h>
161
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (i.e. value << 8). AVX2 variant built on aligned loads and
 * stores: inputVector must be 16-byte aligned (_mm_load_si128) and
 * outputVector must be 32-byte aligned (_mm256_store_si256).
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_avx2(int16_t* outputVector,
                                              const int8_t* inputVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m256i* outputVectorPtr = (__m256i*)outputVector;

    /* Each pass sign-extends 16 input bytes to 16 words (32 output bytes). */
    for (; number < sixteenthPoints; number++) {
        const __m128i inputVal = _mm_load_si128(inputVectorPtr);
        __m256i ret = _mm256_cvtepi8_epi16(inputVal);
        ret = _mm256_slli_epi16(ret, 8); /* multiply by 256 */
        _mm256_store_si256(outputVectorPtr, ret);

        outputVectorPtr++;
        inputVectorPtr++;
    }

    /* Scalar tail: the num_points % 16 samples the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
189#endif /* LV_HAVE_AVX2 */
190
191
192#ifdef LV_HAVE_SSE4_1
193#include <smmintrin.h>
194
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (i.e. value << 8). SSE4.1 variant built on aligned loads
 * and stores: both inputVector and outputVector must be 16-byte aligned
 * (_mm_load_si128 / _mm_store_si128).
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_sse4_1(int16_t* outputVector,
                                                const int8_t* inputVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    const __m128i* inputVectorPtr = (const __m128i*)inputVector;
    __m128i* outputVectorPtr = (__m128i*)outputVector;

    /* One 16-byte load feeds two 8-sample conversions per pass. */
    for (; number < sixteenthPoints; number++) {
        __m128i inputVal = _mm_load_si128(inputVectorPtr);
        __m128i ret;

        /* Lower eight samples: sign-extend to 16 bits, then multiply by 256. */
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_store_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        /* Byte-shift the upper eight samples into the low half and repeat. */
        inputVal = _mm_srli_si128(inputVal, 8);
        ret = _mm_cvtepi8_epi16(inputVal);
        ret = _mm_slli_epi16(ret, 8);
        _mm_store_si128(outputVectorPtr, ret);
        outputVectorPtr++;

        inputVectorPtr++;
    }

    /* Scalar tail: the num_points % 16 samples the vector loop did not cover. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
230#endif /* LV_HAVE_SSE4_1 */
231
232
233#ifdef LV_HAVE_GENERIC
234
/*
 * Portable reference kernel exposed under the aligned ("_a") name: converts
 * num_points signed 8-bit samples to signed 16-bit samples, scaling each one
 * by 256. The scalar loop itself places no alignment demands on the buffers.
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_a_generic(int16_t* outputVector,
                                                 const int8_t* inputVector,
                                                 unsigned int num_points)
{
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        /* The product spans [-32768, 32512], so the narrowing is lossless. */
        outputVector[number] = (int16_t)(inputVector[number] * 256);
    }
}
247#endif /* LV_HAVE_GENERIC */
248
249
250#ifdef LV_HAVE_NEON
251#include <arm_neon.h>
252
/*
 * Converts num_points signed 8-bit samples to signed 16-bit samples, scaling
 * each one by 256 (i.e. value << 8). NEON variant processing eight samples
 * per pass, followed by a scalar tail.
 *
 * outputVector: destination; num_points int16_t values are written.
 * inputVector:  source; num_points int8_t values are read.
 * num_points:   number of samples to convert.
 */
static inline void volk_8i_convert_16i_neon(int16_t* outputVector,
                                            const int8_t* inputVector,
                                            unsigned int num_points)
{
    int16_t* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;
    unsigned int number;
    const unsigned int eighth_points = num_points / 8;

    // NEON doesn't have a concept of 8 bit registers, so we are really
    // dealing with the low half of 16-bit registers. Since this requires
    // a move instruction we likely do better with ASM here.
    for (number = 0; number < eighth_points; ++number) {
        const int8x8_t input_vec = vld1_s8(inputVectorPtr);
        int16x8_t converted_vec = vmovl_s8(input_vec); // widen 8 -> 16 bits
        converted_vec = vshlq_n_s16(converted_vec, 8); // multiply by 256
        vst1q_s16(outputVectorPtr, converted_vec);

        inputVectorPtr += 8;
        outputVectorPtr += 8;
    }

    // Scalar tail: the num_points % 8 samples the vector loop did not cover.
    for (number = eighth_points * 8; number < num_points; number++) {
        *outputVectorPtr++ = (int16_t)(*inputVectorPtr++ * 256);
    }
}
283#endif /* LV_HAVE_NEON */
284
285
286#ifdef LV_HAVE_ORC
/*
 * ORC kernel entry point. Only declared here; the definition lives outside
 * this header.
 */
extern void volk_8i_convert_16i_a_orc_impl(int16_t* outputVector,
                                           const int8_t* inputVector,
                                           unsigned int num_points);

/*
 * Thin wrapper that forwards its three arguments unchanged to
 * volk_8i_convert_16i_a_orc_impl.
 *
 * outputVector: destination buffer handed to the ORC kernel.
 * inputVector:  source buffer handed to the ORC kernel.
 * num_points:   number of samples handed to the ORC kernel.
 */
static inline void volk_8i_convert_16i_u_orc(int16_t* outputVector,
                                             const int8_t* inputVector,
                                             unsigned int num_points)
{
    volk_8i_convert_16i_a_orc_impl(outputVector, inputVector, num_points);
}
297#endif /* LV_HAVE_ORC */
298
299
300#endif /* INCLUDED_volk_8i_convert_16i_a_H */
static void volk_8i_convert_16i_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:135
static void volk_8i_convert_16i_a_generic(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:235
static void volk_8i_convert_16i_neon(int16_t *outputVector, const int8_t *inputVector, unsigned int num_points)
Definition: volk_8i_convert_16i.h:253