Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_s32f_stddev_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
68#ifndef INCLUDED_volk_32f_s32f_stddev_32f_a_H
69#define INCLUDED_volk_32f_s32f_stddev_32f_a_H
70
71#include <inttypes.h>
72#include <math.h>
73#include <stdio.h>
74#include <volk/volk_common.h>
75
76#ifdef LV_HAVE_SSE4_1
77#include <smmintrin.h>
78
79static inline void volk_32f_s32f_stddev_32f_a_sse4_1(float* stddev,
80 const float* inputBuffer,
81 const float mean,
82 unsigned int num_points)
83{
84 float returnValue = 0;
85 if (num_points > 0) {
86 unsigned int number = 0;
87 const unsigned int sixteenthPoints = num_points / 16;
88
89 const float* aPtr = inputBuffer;
90
91 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
92
93 __m128 squareAccumulator = _mm_setzero_ps();
94 __m128 aVal1, aVal2, aVal3, aVal4;
95 __m128 cVal1, cVal2, cVal3, cVal4;
96 for (; number < sixteenthPoints; number++) {
97 aVal1 = _mm_load_ps(aPtr);
98 aPtr += 4;
99 cVal1 = _mm_dp_ps(aVal1, aVal1, 0xF1);
100
101 aVal2 = _mm_load_ps(aPtr);
102 aPtr += 4;
103 cVal2 = _mm_dp_ps(aVal2, aVal2, 0xF2);
104
105 aVal3 = _mm_load_ps(aPtr);
106 aPtr += 4;
107 cVal3 = _mm_dp_ps(aVal3, aVal3, 0xF4);
108
109 aVal4 = _mm_load_ps(aPtr);
110 aPtr += 4;
111 cVal4 = _mm_dp_ps(aVal4, aVal4, 0xF8);
112
113 cVal1 = _mm_or_ps(cVal1, cVal2);
114 cVal3 = _mm_or_ps(cVal3, cVal4);
115 cVal1 = _mm_or_ps(cVal1, cVal3);
116
117 squareAccumulator =
118 _mm_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
119 }
120 _mm_store_ps(squareBuffer,
121 squareAccumulator); // Store the results back into the C container
122 returnValue = squareBuffer[0];
123 returnValue += squareBuffer[1];
124 returnValue += squareBuffer[2];
125 returnValue += squareBuffer[3];
126
127 number = sixteenthPoints * 16;
128 for (; number < num_points; number++) {
129 returnValue += (*aPtr) * (*aPtr);
130 aPtr++;
131 }
132 returnValue /= num_points;
133 returnValue -= (mean * mean);
134 returnValue = sqrtf(returnValue);
135 }
136 *stddev = returnValue;
137}
138
139#endif /* LV_HAVE_SSE4_1 */
140
141#ifdef LV_HAVE_SSE
142#include <xmmintrin.h>
143
144static inline void volk_32f_s32f_stddev_32f_a_sse(float* stddev,
145 const float* inputBuffer,
146 const float mean,
147 unsigned int num_points)
148{
149 float returnValue = 0;
150 if (num_points > 0) {
151 unsigned int number = 0;
152 const unsigned int quarterPoints = num_points / 4;
153
154 const float* aPtr = inputBuffer;
155
156 __VOLK_ATTR_ALIGNED(16) float squareBuffer[4];
157
158 __m128 squareAccumulator = _mm_setzero_ps();
159 __m128 aVal = _mm_setzero_ps();
160 for (; number < quarterPoints; number++) {
161 aVal = _mm_load_ps(aPtr); // aVal = x
162 aVal = _mm_mul_ps(aVal, aVal); // squareAccumulator += x^2
163 squareAccumulator = _mm_add_ps(squareAccumulator, aVal);
164 aPtr += 4;
165 }
166 _mm_store_ps(squareBuffer,
167 squareAccumulator); // Store the results back into the C container
168 returnValue = squareBuffer[0];
169 returnValue += squareBuffer[1];
170 returnValue += squareBuffer[2];
171 returnValue += squareBuffer[3];
172
173 number = quarterPoints * 4;
174 for (; number < num_points; number++) {
175 returnValue += (*aPtr) * (*aPtr);
176 aPtr++;
177 }
178 returnValue /= num_points;
179 returnValue -= (mean * mean);
180 returnValue = sqrtf(returnValue);
181 }
182 *stddev = returnValue;
183}
184#endif /* LV_HAVE_SSE */
185
186
187#ifdef LV_HAVE_AVX
188#include <immintrin.h>
189
190static inline void volk_32f_s32f_stddev_32f_a_avx(float* stddev,
191 const float* inputBuffer,
192 const float mean,
193 unsigned int num_points)
194{
195 float stdDev = 0;
196 if (num_points > 0) {
197 unsigned int number = 0;
198 const unsigned int thirtySecondthPoints = num_points / 32;
199
200 const float* aPtr = inputBuffer;
201 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
202
203 __m256 squareAccumulator = _mm256_setzero_ps();
204 __m256 aVal1, aVal2, aVal3, aVal4;
205 __m256 cVal1, cVal2, cVal3, cVal4;
206 for (; number < thirtySecondthPoints; number++) {
207 aVal1 = _mm256_load_ps(aPtr);
208 aPtr += 8;
209 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
210
211 aVal2 = _mm256_load_ps(aPtr);
212 aPtr += 8;
213 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
214
215 aVal3 = _mm256_load_ps(aPtr);
216 aPtr += 8;
217 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
218
219 aVal4 = _mm256_load_ps(aPtr);
220 aPtr += 8;
221 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
222
223 cVal1 = _mm256_or_ps(cVal1, cVal2);
224 cVal3 = _mm256_or_ps(cVal3, cVal4);
225 cVal1 = _mm256_or_ps(cVal1, cVal3);
226
227 squareAccumulator =
228 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
229 }
230 _mm256_store_ps(squareBuffer,
231 squareAccumulator); // Store the results back into the C container
232 stdDev = squareBuffer[0];
233 stdDev += squareBuffer[1];
234 stdDev += squareBuffer[2];
235 stdDev += squareBuffer[3];
236 stdDev += squareBuffer[4];
237 stdDev += squareBuffer[5];
238 stdDev += squareBuffer[6];
239 stdDev += squareBuffer[7];
240
241 number = thirtySecondthPoints * 32;
242 for (; number < num_points; number++) {
243 stdDev += (*aPtr) * (*aPtr);
244 aPtr++;
245 }
246 stdDev /= num_points;
247 stdDev -= (mean * mean);
248 stdDev = sqrtf(stdDev);
249 }
250 *stddev = stdDev;
251}
252#endif /* LV_HAVE_AVX */
253
254
255#ifdef LV_HAVE_GENERIC
256
257static inline void volk_32f_s32f_stddev_32f_generic(float* stddev,
258 const float* inputBuffer,
259 const float mean,
260 unsigned int num_points)
261{
262 float returnValue = 0;
263 if (num_points > 0) {
264 const float* aPtr = inputBuffer;
265 unsigned int number = 0;
266
267 for (number = 0; number < num_points; number++) {
268 returnValue += (*aPtr) * (*aPtr);
269 aPtr++;
270 }
271
272 returnValue /= num_points;
273 returnValue -= (mean * mean);
274 returnValue = sqrtf(returnValue);
275 }
276 *stddev = returnValue;
277}
278
279#endif /* LV_HAVE_GENERIC */
280
281
282#endif /* INCLUDED_volk_32f_s32f_stddev_32f_a_H */
283
284#ifndef INCLUDED_volk_32f_s32f_stddev_32f_u_H
285#define INCLUDED_volk_32f_s32f_stddev_32f_u_H
286
287#include <inttypes.h>
288#include <math.h>
289#include <stdio.h>
290#include <volk/volk_common.h>
291
292#ifdef LV_HAVE_AVX
293#include <immintrin.h>
294
295static inline void volk_32f_s32f_stddev_32f_u_avx(float* stddev,
296 const float* inputBuffer,
297 const float mean,
298 unsigned int num_points)
299{
300 float stdDev = 0;
301 if (num_points > 0) {
302 unsigned int number = 0;
303 const unsigned int thirtySecondthPoints = num_points / 32;
304
305 const float* aPtr = inputBuffer;
306 __VOLK_ATTR_ALIGNED(32) float squareBuffer[8];
307
308 __m256 squareAccumulator = _mm256_setzero_ps();
309 __m256 aVal1, aVal2, aVal3, aVal4;
310 __m256 cVal1, cVal2, cVal3, cVal4;
311 for (; number < thirtySecondthPoints; number++) {
312 aVal1 = _mm256_loadu_ps(aPtr);
313 aPtr += 8;
314 cVal1 = _mm256_dp_ps(aVal1, aVal1, 0xF1);
315
316 aVal2 = _mm256_loadu_ps(aPtr);
317 aPtr += 8;
318 cVal2 = _mm256_dp_ps(aVal2, aVal2, 0xF2);
319
320 aVal3 = _mm256_loadu_ps(aPtr);
321 aPtr += 8;
322 cVal3 = _mm256_dp_ps(aVal3, aVal3, 0xF4);
323
324 aVal4 = _mm256_loadu_ps(aPtr);
325 aPtr += 8;
326 cVal4 = _mm256_dp_ps(aVal4, aVal4, 0xF8);
327
328 cVal1 = _mm256_or_ps(cVal1, cVal2);
329 cVal3 = _mm256_or_ps(cVal3, cVal4);
330 cVal1 = _mm256_or_ps(cVal1, cVal3);
331
332 squareAccumulator =
333 _mm256_add_ps(squareAccumulator, cVal1); // squareAccumulator += x^2
334 }
335 _mm256_storeu_ps(
336 squareBuffer,
337 squareAccumulator); // Store the results back into the C container
338 stdDev = squareBuffer[0];
339 stdDev += squareBuffer[1];
340 stdDev += squareBuffer[2];
341 stdDev += squareBuffer[3];
342 stdDev += squareBuffer[4];
343 stdDev += squareBuffer[5];
344 stdDev += squareBuffer[6];
345 stdDev += squareBuffer[7];
346
347 number = thirtySecondthPoints * 32;
348 for (; number < num_points; number++) {
349 stdDev += (*aPtr) * (*aPtr);
350 aPtr++;
351 }
352 stdDev /= num_points;
353 stdDev -= (mean * mean);
354 stdDev = sqrtf(stdDev);
355 }
356 *stddev = stdDev;
357}
358#endif /* LV_HAVE_AVX */
359
360#endif /* INCLUDED_volk_32f_s32f_stddev_32f_u_H */
static void volk_32f_s32f_stddev_32f_a_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:190
static void volk_32f_s32f_stddev_32f_a_sse(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:144
static void volk_32f_s32f_stddev_32f_u_avx(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:295
static void volk_32f_s32f_stddev_32f_generic(float *stddev, const float *inputBuffer, const float mean, unsigned int num_points)
Definition: volk_32f_s32f_stddev_32f.h:257
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56