68#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
69#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
79static inline void volk_32f_s32f_stddev_32f_a_sse4_1(
float* stddev,
80 const float* inputBuffer,
82 unsigned int num_points)
84 float returnValue = 0;
86 unsigned int number = 0;
87 const unsigned int sixteenthPoints = num_points / 16;
89 const float* aPtr = inputBuffer;
93 __m128 squareAccumulator = _mm_setzero_ps();
94 __m128 aVal1, aVal2, aVal3, aVal4;
95 __m128 cVal1, cVal2, cVal3, cVal4;
96 for (; number < sixteenthPoints; number++) {
97 aVal1 = _mm_load_ps(aPtr);
99 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
101 aVal2 = _mm_load_ps(aPtr);
103 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
105 aVal3 = _mm_load_ps(aPtr);
107 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
109 aVal4 = _mm_load_ps(aPtr);
111 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
113 cVal1 = _mm_or_ps(cVal1, cVal2);
114 cVal3 = _mm_or_ps(cVal3, cVal4);
115 cVal1 = _mm_or_ps(cVal1, cVal3);
118 _mm_add_ps(squareAccumulator, cVal1);
120 _mm_store_ps(squareBuffer,
122 returnValue = squareBuffer[0];
123 returnValue += squareBuffer[1];
124 returnValue += squareBuffer[2];
125 returnValue += squareBuffer[3];
127 number = sixteenthPoints * 16;
128 for (; number < num_points; number++) {
129 returnValue += (*aPtr) * (*aPtr);
132 returnValue /= num_points;
133 returnValue -= (mean * mean);
134 returnValue = sqrtf(returnValue);
136 *stddev = returnValue;
142#include <xmmintrin.h>
145 const float* inputBuffer,
147 unsigned int num_points)
149 float returnValue = 0;
150 if (num_points > 0) {
151 unsigned int number = 0;
152 const unsigned int quarterPoints = num_points / 4;
154 const float* aPtr = inputBuffer;
158 __m128 squareAccumulator = _mm_setzero_ps();
159 __m128 aVal = _mm_setzero_ps();
160 for (; number < quarterPoints; number++) {
161 aVal = _mm_load_ps(aPtr);
162 aVal = _mm_mul_ps(aVal, aVal);
163 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
166 _mm_store_ps(squareBuffer,
168 returnValue = squareBuffer[0];
169 returnValue += squareBuffer[1];
170 returnValue += squareBuffer[2];
171 returnValue += squareBuffer[3];
173 number = quarterPoints * 4;
174 for (; number < num_points; number++) {
175 returnValue += (*aPtr) * (*aPtr);
178 returnValue /= num_points;
179 returnValue -= (mean * mean);
180 returnValue = sqrtf(returnValue);
182 *stddev = returnValue;
188#include <immintrin.h>
191 const float* inputBuffer,
193 unsigned int num_points)
196 if (num_points > 0) {
197 unsigned int number = 0;
198 const unsigned int thirtySecondthPoints = num_points / 32;
200 const float* aPtr = inputBuffer;
203 __m256 squareAccumulator = _mm256_setzero_ps();
204 __m256 aVal1, aVal2, aVal3, aVal4;
205 __m256 cVal1, cVal2, cVal3, cVal4;
206 for (; number < thirtySecondthPoints; number++) {
207 aVal1 = _mm256_load_ps(aPtr);
209 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
211 aVal2 = _mm256_load_ps(aPtr);
213 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
215 aVal3 = _mm256_load_ps(aPtr);
217 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
219 aVal4 = _mm256_load_ps(aPtr);
221 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
223 cVal1 = _mm256_or_ps(cVal1, cVal2);
224 cVal3 = _mm256_or_ps(cVal3, cVal4);
225 cVal1 = _mm256_or_ps(cVal1, cVal3);
228 _mm256_add_ps(squareAccumulator, cVal1);
230 _mm256_store_ps(squareBuffer,
232 stdDev = squareBuffer[0];
233 stdDev += squareBuffer[1];
234 stdDev += squareBuffer[2];
235 stdDev += squareBuffer[3];
236 stdDev += squareBuffer[4];
237 stdDev += squareBuffer[5];
238 stdDev += squareBuffer[6];
239 stdDev += squareBuffer[7];
241 number = thirtySecondthPoints * 32;
242 for (; number < num_points; number++) {
243 stdDev += (*aPtr) * (*aPtr);
246 stdDev /= num_points;
247 stdDev -= (mean * mean);
248 stdDev = sqrtf(stdDev);
255#ifdef LV_HAVE_GENERIC
258 const float* inputBuffer,
260 unsigned int num_points)
262 float returnValue = 0;
263 if (num_points > 0) {
264 const float* aPtr = inputBuffer;
265 unsigned int number = 0;
267 for (number = 0; number < num_points; number++) {
268 returnValue += (*aPtr) * (*aPtr);
272 returnValue /= num_points;
273 returnValue -= (mean * mean);
274 returnValue = sqrtf(returnValue);
276 *stddev = returnValue;
284#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
285#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
293#include <immintrin.h>
296 const float* inputBuffer,
298 unsigned int num_points)
301 if (num_points > 0) {
302 unsigned int number = 0;
303 const unsigned int thirtySecondthPoints = num_points / 32;
305 const float* aPtr = inputBuffer;
308 __m256 squareAccumulator = _mm256_setzero_ps();
309 __m256 aVal1, aVal2, aVal3, aVal4;
310 __m256 cVal1, cVal2, cVal3, cVal4;
311 for (; number < thirtySecondthPoints; number++) {
312 aVal1 = _mm256_loadu_ps(aPtr);
314 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
316 aVal2 = _mm256_loadu_ps(aPtr);
318 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
320 aVal3 = _mm256_loadu_ps(aPtr);
322 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
324 aVal4 = _mm256_loadu_ps(aPtr);
326 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
328 cVal1 = _mm256_or_ps(cVal1, cVal2);
329 cVal3 = _mm256_or_ps(cVal3, cVal4);
330 cVal1 = _mm256_or_ps(cVal1, cVal3);
333 _mm256_add_ps(squareAccumulator, cVal1);
338 stdDev = squareBuffer[0];
339 stdDev += squareBuffer[1];
340 stdDev += squareBuffer[2];
341 stdDev += squareBuffer[3];
342 stdDev += squareBuffer[4];
343 stdDev += squareBuffer[5];
344 stdDev += squareBuffer[6];
345 stdDev += squareBuffer[7];
347 number = thirtySecondthPoints * 32;
348 for (; number < num_points; number++) {
349 stdDev += (*aPtr) * (*aPtr);
352 stdDev /= num_points;
353 stdDev -= (mean * mean);
354 stdDev = sqrtf(stdDev);
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:190
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:144
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:295
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:257
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56