54#ifndef INCLUDED_volk_8i_s32f_convert_32f_u_H
55#define INCLUDED_volk_8i_s32f_convert_32f_u_H
63static inline void volk_8i_s32f_convert_32f_u_avx2(
float* outputVector,
64 const int8_t* inputVector,
66 unsigned int num_points)
68 unsigned int number = 0;
69 const unsigned int sixteenthPoints = num_points / 16;
71 float* outputVectorPtr = outputVector;
72 const float iScalar = 1.0 / scalar;
73 __m256 invScalar = _mm256_set1_ps(iScalar);
74 const int8_t* inputVectorPtr = inputVector;
79 for (; number < sixteenthPoints; number++) {
80 inputVal128 = _mm_loadu_si128((__m128i*)inputVectorPtr);
82 interimVal = _mm256_cvtepi8_epi32(inputVal128);
83 ret = _mm256_cvtepi32_ps(interimVal);
84 ret = _mm256_mul_ps(ret, invScalar);
85 _mm256_storeu_ps(outputVectorPtr, ret);
88 inputVal128 = _mm_srli_si128(inputVal128, 8);
89 interimVal = _mm256_cvtepi8_epi32(inputVal128);
90 ret = _mm256_cvtepi32_ps(interimVal);
91 ret = _mm256_mul_ps(ret, invScalar);
92 _mm256_storeu_ps(outputVectorPtr, ret);
98 number = sixteenthPoints * 16;
99 for (; number < num_points; number++) {
100 outputVector[number] = (float)(inputVector[number]) * iScalar;
107#include <smmintrin.h>
109static inline void volk_8i_s32f_convert_32f_u_sse4_1(
float* outputVector,
110 const int8_t* inputVector,
112 unsigned int num_points)
114 unsigned int number = 0;
115 const unsigned int sixteenthPoints = num_points / 16;
117 float* outputVectorPtr = outputVector;
118 const float iScalar = 1.0 / scalar;
119 __m128 invScalar = _mm_set_ps1(iScalar);
120 const int8_t* inputVectorPtr = inputVector;
125 for (; number < sixteenthPoints; number++) {
126 inputVal = _mm_loadu_si128((__m128i*)inputVectorPtr);
128 interimVal = _mm_cvtepi8_epi32(inputVal);
129 ret = _mm_cvtepi32_ps(interimVal);
130 ret = _mm_mul_ps(ret, invScalar);
131 _mm_storeu_ps(outputVectorPtr, ret);
132 outputVectorPtr += 4;
134 inputVal = _mm_srli_si128(inputVal, 4);
135 interimVal = _mm_cvtepi8_epi32(inputVal);
136 ret = _mm_cvtepi32_ps(interimVal);
137 ret = _mm_mul_ps(ret, invScalar);
138 _mm_storeu_ps(outputVectorPtr, ret);
139 outputVectorPtr += 4;
141 inputVal = _mm_srli_si128(inputVal, 4);
142 interimVal = _mm_cvtepi8_epi32(inputVal);
143 ret = _mm_cvtepi32_ps(interimVal);
144 ret = _mm_mul_ps(ret, invScalar);
145 _mm_storeu_ps(outputVectorPtr, ret);
146 outputVectorPtr += 4;
148 inputVal = _mm_srli_si128(inputVal, 4);
149 interimVal = _mm_cvtepi8_epi32(inputVal);
150 ret = _mm_cvtepi32_ps(interimVal);
151 ret = _mm_mul_ps(ret, invScalar);
152 _mm_storeu_ps(outputVectorPtr, ret);
153 outputVectorPtr += 4;
155 inputVectorPtr += 16;
158 number = sixteenthPoints * 16;
159 for (; number < num_points; number++) {
160 outputVector[number] = (float)(inputVector[number]) * iScalar;
165#ifdef LV_HAVE_GENERIC
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar
 *        (portable C implementation).
 *
 * Every output element is (float)inputVector[i] * (1.0 / scalar).
 *
 * \param outputVector Destination for num_points floats.
 * \param inputVector  Source of num_points int8_t samples.
 * \param scalar       Divisor applied to every converted sample.
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_generic(float* outputVector,
                                                    const int8_t* inputVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    /* Compute the reciprocal once so the loop multiplies instead of divides. */
    const float iScalar = 1.0 / scalar;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = ((float)inputVector[number]) * iScalar;
    }
}
186#ifndef INCLUDED_volk_8i_s32f_convert_32f_a_H
187#define INCLUDED_volk_8i_s32f_convert_32f_a_H
193#include <immintrin.h>
195static inline void volk_8i_s32f_convert_32f_a_avx2(
float* outputVector,
196 const int8_t* inputVector,
198 unsigned int num_points)
200 unsigned int number = 0;
201 const unsigned int sixteenthPoints = num_points / 16;
203 float* outputVectorPtr = outputVector;
204 const float iScalar = 1.0 / scalar;
205 __m256 invScalar = _mm256_set1_ps(iScalar);
206 const int8_t* inputVectorPtr = inputVector;
211 for (; number < sixteenthPoints; number++) {
212 inputVal128 = _mm_load_si128((__m128i*)inputVectorPtr);
214 interimVal = _mm256_cvtepi8_epi32(inputVal128);
215 ret = _mm256_cvtepi32_ps(interimVal);
216 ret = _mm256_mul_ps(ret, invScalar);
217 _mm256_store_ps(outputVectorPtr, ret);
218 outputVectorPtr += 8;
220 inputVal128 = _mm_srli_si128(inputVal128, 8);
221 interimVal = _mm256_cvtepi8_epi32(inputVal128);
222 ret = _mm256_cvtepi32_ps(interimVal);
223 ret = _mm256_mul_ps(ret, invScalar);
224 _mm256_store_ps(outputVectorPtr, ret);
225 outputVectorPtr += 8;
227 inputVectorPtr += 16;
230 number = sixteenthPoints * 16;
231 for (; number < num_points; number++) {
232 outputVector[number] = (float)(inputVector[number]) * iScalar;
238#include <smmintrin.h>
240static inline void volk_8i_s32f_convert_32f_a_sse4_1(
float* outputVector,
241 const int8_t* inputVector,
243 unsigned int num_points)
245 unsigned int number = 0;
246 const unsigned int sixteenthPoints = num_points / 16;
248 float* outputVectorPtr = outputVector;
249 const float iScalar = 1.0 / scalar;
250 __m128 invScalar = _mm_set_ps1(iScalar);
251 const int8_t* inputVectorPtr = inputVector;
256 for (; number < sixteenthPoints; number++) {
257 inputVal = _mm_load_si128((__m128i*)inputVectorPtr);
259 interimVal = _mm_cvtepi8_epi32(inputVal);
260 ret = _mm_cvtepi32_ps(interimVal);
261 ret = _mm_mul_ps(ret, invScalar);
262 _mm_store_ps(outputVectorPtr, ret);
263 outputVectorPtr += 4;
265 inputVal = _mm_srli_si128(inputVal, 4);
266 interimVal = _mm_cvtepi8_epi32(inputVal);
267 ret = _mm_cvtepi32_ps(interimVal);
268 ret = _mm_mul_ps(ret, invScalar);
269 _mm_store_ps(outputVectorPtr, ret);
270 outputVectorPtr += 4;
272 inputVal = _mm_srli_si128(inputVal, 4);
273 interimVal = _mm_cvtepi8_epi32(inputVal);
274 ret = _mm_cvtepi32_ps(interimVal);
275 ret = _mm_mul_ps(ret, invScalar);
276 _mm_store_ps(outputVectorPtr, ret);
277 outputVectorPtr += 4;
279 inputVal = _mm_srli_si128(inputVal, 4);
280 interimVal = _mm_cvtepi8_epi32(inputVal);
281 ret = _mm_cvtepi32_ps(interimVal);
282 ret = _mm_mul_ps(ret, invScalar);
283 _mm_store_ps(outputVectorPtr, ret);
284 outputVectorPtr += 4;
286 inputVectorPtr += 16;
289 number = sixteenthPoints * 16;
290 for (; number < num_points; number++) {
291 outputVector[number] = (float)(inputVector[number]) * iScalar;
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar (NEON).
 *
 * Every output element is (float)inputVector[i] * (1.0 / scalar).
 *
 * \param outputVector Destination for num_points floats.
 * \param inputVector  Source of num_points int8_t samples.
 * \param scalar       Divisor applied to every converted sample.
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_neon(float* outputVector,
                                                 const int8_t* inputVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    float* outputVectorPtr = outputVector;
    const int8_t* inputVectorPtr = inputVector;

    const float iScalar = 1.0 / scalar;
    const float32x4_t qiScalar = vdupq_n_f32(iScalar);

    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number;

    for (number = 0; number < sixteenthPoints; number++) {
        const int8x16_t inputVal = vld1q_s8(inputVectorPtr);
        inputVectorPtr += 16;

        /* Widen the 16 bytes into two vectors of eight int16. */
        const int16x8_t lower = vmovl_s8(vget_low_s8(inputVal));
        const int16x8_t higher = vmovl_s8(vget_high_s8(inputVal));
        float32x4_t outputFloat;

        /* Each quarter: widen to int32, convert to float, scale, store. */
        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(lower))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;

        /* The product must be assigned; otherwise the third quarter's
         * values would be stored a second time. */
        outputFloat =
            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(higher))), qiScalar);
        vst1q_f32(outputVectorPtr, outputFloat);
        outputVectorPtr += 4;
    }

    /* Scalar tail for the remaining num_points % 16 samples. */
    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *outputVectorPtr++ = ((float)(*inputVectorPtr++)) * iScalar;
    }
}
350#ifdef LV_HAVE_GENERIC
/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar
 *        (portable C implementation, aligned-kernel entry point).
 *
 * Every output element is (float)inputVector[i] * (1.0 / scalar).
 * The body uses plain scalar accesses only.
 *
 * \param outputVector Destination for num_points floats.
 * \param inputVector  Source of num_points int8_t samples.
 * \param scalar       Divisor applied to every converted sample.
 * \param num_points   Number of samples to convert.
 */
static inline void volk_8i_s32f_convert_32f_a_generic(float* outputVector,
                                                      const int8_t* inputVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    /* Compute the reciprocal once so the loop multiplies instead of divides. */
    const float iScalar = 1.0 / scalar;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        outputVector[number] = ((float)inputVector[number]) * iScalar;
    }
}
/*
 * ORC implementation, defined outside this header. The wrapper below passes
 * it 1/scalar as the third argument, so that argument acts as a multiplier
 * here despite its name.
 */
extern void volk_8i_s32f_convert_32f_a_orc_impl(float* outputVector,
                                                const int8_t* inputVector,
                                                const float scalar,
                                                unsigned int num_points);

/*!
 * \brief Converts signed 8-bit samples to floats scaled by 1/scalar by
 *        delegating to the ORC implementation.
 *
 * \param outputVector Destination buffer, forwarded unchanged.
 * \param inputVector  Source buffer, forwarded unchanged.
 * \param scalar       Divisor; its reciprocal is what gets forwarded.
 * \param num_points   Number of samples, forwarded unchanged.
 */
static inline void volk_8i_s32f_convert_32f_u_orc(float* outputVector,
                                                  const int8_t* inputVector,
                                                  const float scalar,
                                                  unsigned int num_points)
{
    float invscalar = 1.0 / scalar;
    volk_8i_s32f_convert_32f_a_orc_impl(outputVector, inputVector, invscalar, num_points);
}
static void volk_8i_s32f_convert_32f_a_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:352
static void volk_8i_s32f_convert_32f_generic(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:167
static void volk_8i_s32f_convert_32f_neon(float *outputVector, const int8_t *inputVector, const float scalar, unsigned int num_points)
Definition: volk_8i_s32f_convert_32f.h:299