Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_stddev_and_mean_32f_x2.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014, 2021 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
73#ifndef INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
74#define INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H
75
76#include <inttypes.h>
77#include <math.h>
78#include <volk/volk_common.h>
79
80// Youngs and Cramer's Algorithm for calculating std and mean
81// Using the methods discussed here:
82// https://doi.org/10.1145/3221269.3223036
83#ifdef LV_HAVE_GENERIC
84
85static inline void volk_32f_stddev_and_mean_32f_x2_generic(float* stddev,
86 float* mean,
87 const float* inputBuffer,
88 unsigned int num_points)
89{
90 const float* in_ptr = inputBuffer;
91 if (num_points == 0) {
92 return;
93 } else if (num_points == 1) {
94 *stddev = 0.f;
95 *mean = (*in_ptr);
96 return;
97 }
98
99 float Sum[2];
100 float SquareSum[2] = { 0.f, 0.f };
101 Sum[0] = (*in_ptr++);
102 Sum[1] = (*in_ptr++);
103
104 uint32_t half_points = num_points / 2;
105
106 for (uint32_t number = 1; number < half_points; number++) {
107 float Val0 = (*in_ptr++);
108 float Val1 = (*in_ptr++);
109 float n = (float)number;
110 float n_plus_one = n + 1.f;
111 float r = 1.f / (n * n_plus_one);
112
113 Sum[0] += Val0;
114 Sum[1] += Val1;
115
116 SquareSum[0] += r * powf(n_plus_one * Val0 - Sum[0], 2);
117 SquareSum[1] += r * powf(n_plus_one * Val1 - Sum[1], 2);
118 }
119
120 SquareSum[0] += SquareSum[1] + .5f / half_points * pow(Sum[0] - Sum[1], 2);
121 Sum[0] += Sum[1];
122
123 uint32_t points_done = half_points * 2;
124
125 for (; points_done < num_points; points_done++) {
126 float Val = (*in_ptr++);
127 float n = (float)points_done;
128 float n_plus_one = n + 1.f;
129 float r = 1.f / (n * n_plus_one);
130 Sum[0] += Val;
131 SquareSum[0] += r * powf(n_plus_one * Val - Sum[0], 2);
132 }
133 *stddev = sqrtf(SquareSum[0] / num_points);
134 *mean = Sum[0] / num_points;
135}
136#endif /* LV_HAVE_GENERIC */
137
138static inline float update_square_sum_1_val(const float SquareSum,
139 const float Sum,
140 const uint32_t len,
141 const float val)
142{
143 // Updates a sum of squares calculated over len values with the value val
144 float n = (float)len;
145 float n_plus_one = n + 1.f;
146 return SquareSum +
147 1.f / (n * n_plus_one) * (n_plus_one * val - Sum) * (n_plus_one * val - Sum);
148}
149
150static inline float add_square_sums(const float SquareSum0,
151 const float Sum0,
152 const float SquareSum1,
153 const float Sum1,
154 const uint32_t len)
155{
156 // Add two sums of squares calculated over the same number of values, len
157 float n = (float)len;
158 return SquareSum0 + SquareSum1 + .5f / n * (Sum0 - Sum1) * (Sum0 - Sum1);
159}
160
161static inline void accrue_result(float* PartialSquareSums,
162 float* PartialSums,
163 const uint32_t NumberOfPartitions,
164 const uint32_t PartitionLen)
165{
166 // Add all partial sums and square sums into the first element of the arrays
167 uint32_t accumulators = NumberOfPartitions;
168 uint32_t stages = 0;
169 uint32_t offset = 1;
170 uint32_t partition_len = PartitionLen;
171
172 while (accumulators >>= 1) {
173 stages++;
174 } // Integer log2
175 accumulators = NumberOfPartitions;
176
177 for (uint32_t s = 0; s < stages; s++) {
178 accumulators /= 2;
179 uint32_t idx = 0;
180 for (uint32_t a = 0; a < accumulators; a++) {
181 PartialSquareSums[idx] = add_square_sums(PartialSquareSums[idx],
182 PartialSums[idx],
183 PartialSquareSums[idx + offset],
184 PartialSums[idx + offset],
185 partition_len);
186 PartialSums[idx] += PartialSums[idx + offset];
187 idx += 2 * offset;
188 }
189 offset *= 2;
190 partition_len *= 2;
191 }
192}
193
194#ifdef LV_HAVE_NEON
195#include <arm_neon.h>
197
198static inline void volk_32f_stddev_and_mean_32f_x2_neon(float* stddev,
199 float* mean,
200 const float* inputBuffer,
201 unsigned int num_points)
202{
203 if (num_points < 8) {
204 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
205 return;
206 }
207
208 const float* in_ptr = inputBuffer;
209
210 __VOLK_ATTR_ALIGNED(32) float SumLocal[8] = { 0.f };
211 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[8] = { 0.f };
212
213 const uint32_t eigth_points = num_points / 8;
214
215 float32x4_t Sum0, Sum1;
216
217 Sum0 = vld1q_f32((const float32_t*)in_ptr);
218 in_ptr += 4;
219 __VOLK_PREFETCH(in_ptr + 4);
220
221 Sum1 = vld1q_f32((const float32_t*)in_ptr);
222 in_ptr += 4;
223 __VOLK_PREFETCH(in_ptr + 4);
224
225 float32x4_t SquareSum0 = { 0.f };
226 float32x4_t SquareSum1 = { 0.f };
227
228 float32x4_t Values0, Values1;
229 float32x4_t Aux0, Aux1;
230 float32x4_t Reciprocal;
231
232 for (uint32_t number = 1; number < eigth_points; number++) {
233 Values0 = vld1q_f32(in_ptr);
234 in_ptr += 4;
235 __VOLK_PREFETCH(in_ptr + 4);
236
237 Values1 = vld1q_f32(in_ptr);
238 in_ptr += 4;
239 __VOLK_PREFETCH(in_ptr + 4);
240
241 float n = (float)number;
242 float n_plus_one = n + 1.f;
243 Reciprocal = vdupq_n_f32(1.f / (n * n_plus_one));
244
245 Sum0 = vaddq_f32(Sum0, Values0);
246 Aux0 = vdupq_n_f32(n_plus_one);
247 SquareSum0 =
248 _neon_accumulate_square_sum_f32(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
249
250 Sum1 = vaddq_f32(Sum1, Values1);
251 Aux1 = vdupq_n_f32(n_plus_one);
252 SquareSum1 =
253 _neon_accumulate_square_sum_f32(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
254 }
255
256 vst1q_f32(&SumLocal[0], Sum0);
257 vst1q_f32(&SumLocal[4], Sum1);
258 vst1q_f32(&SquareSumLocal[0], SquareSum0);
259 vst1q_f32(&SquareSumLocal[4], SquareSum1);
260
261 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
262
263 uint32_t points_done = eigth_points * 8;
264
265 for (; points_done < num_points; points_done++) {
266 float val = (*in_ptr++);
267 SumLocal[0] += val;
268 SquareSumLocal[0] =
269 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
270 }
271
272 *stddev = sqrtf(SquareSumLocal[0] / num_points);
273 *mean = SumLocal[0] / num_points;
274}
275#endif /* LV_HAVE_NEON */
276
277#ifdef LV_HAVE_SSE
279#include <xmmintrin.h>
280
281static inline void volk_32f_stddev_and_mean_32f_x2_u_sse(float* stddev,
282 float* mean,
283 const float* inputBuffer,
284 unsigned int num_points)
285{
286 if (num_points < 8) {
287 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
288 return;
289 }
290
291 const float* in_ptr = inputBuffer;
292
293 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
294 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
295
296
297 const uint32_t eigth_points = num_points / 8;
298
299 __m128 Sum0 = _mm_loadu_ps(in_ptr);
300 in_ptr += 4;
301 __m128 Sum1 = _mm_loadu_ps(in_ptr);
302 in_ptr += 4;
303 __m128 SquareSum0 = _mm_setzero_ps();
304 __m128 SquareSum1 = _mm_setzero_ps();
305 __m128 Values0, Values1;
306 __m128 Aux0, Aux1;
307 __m128 Reciprocal;
308
309 for (uint32_t number = 1; number < eigth_points; number++) {
310 Values0 = _mm_loadu_ps(in_ptr);
311 in_ptr += 4;
312 __VOLK_PREFETCH(in_ptr + 4);
313
314 Values1 = _mm_loadu_ps(in_ptr);
315 in_ptr += 4;
316 __VOLK_PREFETCH(in_ptr + 4);
317
318 float n = (float)number;
319 float n_plus_one = n + 1.f;
320 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
321
322 Sum0 = _mm_add_ps(Sum0, Values0);
323 Aux0 = _mm_set_ps1(n_plus_one);
324 SquareSum0 =
325 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
326
327 Sum1 = _mm_add_ps(Sum1, Values1);
328 Aux1 = _mm_set_ps1(n_plus_one);
329 SquareSum1 =
330 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
331 }
332
333 _mm_store_ps(&SumLocal[0], Sum0);
334 _mm_store_ps(&SumLocal[4], Sum1);
335 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
336 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
337
338 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
339
340 uint32_t points_done = eigth_points * 8;
341
342 for (; points_done < num_points; points_done++) {
343 float val = (*in_ptr++);
344 SumLocal[0] += val;
345 SquareSumLocal[0] =
346 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
347 }
348
349 *stddev = sqrtf(SquareSumLocal[0] / num_points);
350 *mean = SumLocal[0] / num_points;
351}
352#endif /* LV_HAVE_SSE */
353
354#ifdef LV_HAVE_AVX
355#include <immintrin.h>
357
358static inline void volk_32f_stddev_and_mean_32f_x2_u_avx(float* stddev,
359 float* mean,
360 const float* inputBuffer,
361 unsigned int num_points)
362{
363 if (num_points < 16) {
364 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
365 return;
366 }
367
368 const float* in_ptr = inputBuffer;
369
370 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
371 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
372
373 const unsigned int sixteenth_points = num_points / 16;
374
375 __m256 Sum0 = _mm256_loadu_ps(in_ptr);
376 in_ptr += 8;
377 __m256 Sum1 = _mm256_loadu_ps(in_ptr);
378 in_ptr += 8;
379
380 __m256 SquareSum0 = _mm256_setzero_ps();
381 __m256 SquareSum1 = _mm256_setzero_ps();
382 __m256 Values0, Values1;
383 __m256 Aux0, Aux1;
384 __m256 Reciprocal;
385
386 for (uint32_t number = 1; number < sixteenth_points; number++) {
387 Values0 = _mm256_loadu_ps(in_ptr);
388 in_ptr += 8;
389 __VOLK_PREFETCH(in_ptr + 8);
390
391 Values1 = _mm256_loadu_ps(in_ptr);
392 in_ptr += 8;
393 __VOLK_PREFETCH(in_ptr + 8);
394
395 float n = (float)number;
396 float n_plus_one = n + 1.f;
397
398 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
399
400 Sum0 = _mm256_add_ps(Sum0, Values0);
401 Aux0 = _mm256_set1_ps(n_plus_one);
402 SquareSum0 =
403 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
404
405 Sum1 = _mm256_add_ps(Sum1, Values1);
406 Aux1 = _mm256_set1_ps(n_plus_one);
407 SquareSum1 =
408 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
409 }
410
411 _mm256_store_ps(&SumLocal[0], Sum0);
412 _mm256_store_ps(&SumLocal[8], Sum1);
413 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
414 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
415
416 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
417
418 uint32_t points_done = sixteenth_points * 16;
419
420 for (; points_done < num_points; points_done++) {
421 float val = (*in_ptr++);
422 SumLocal[0] += val;
423 SquareSumLocal[0] =
424 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
425 }
426
427 *stddev = sqrtf(SquareSumLocal[0] / num_points);
428 *mean = SumLocal[0] / num_points;
429}
430#endif /* LV_HAVE_AVX */
431
432#ifdef LV_HAVE_SSE
433#include <xmmintrin.h>
434
435static inline void volk_32f_stddev_and_mean_32f_x2_a_sse(float* stddev,
436 float* mean,
437 const float* inputBuffer,
438 unsigned int num_points)
439{
440 if (num_points < 8) {
441 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
442 return;
443 }
444
445 const float* in_ptr = inputBuffer;
446
447 __VOLK_ATTR_ALIGNED(16) float SumLocal[8] = { 0.f };
448 __VOLK_ATTR_ALIGNED(16) float SquareSumLocal[8] = { 0.f };
449
450
451 const uint32_t eigth_points = num_points / 8;
452
453 __m128 Sum0 = _mm_load_ps(in_ptr);
454 in_ptr += 4;
455 __m128 Sum1 = _mm_load_ps(in_ptr);
456 in_ptr += 4;
457 __m128 SquareSum0 = _mm_setzero_ps();
458 __m128 SquareSum1 = _mm_setzero_ps();
459 __m128 Values0, Values1;
460 __m128 Aux0, Aux1;
461 __m128 Reciprocal;
462
463 for (uint32_t number = 1; number < eigth_points; number++) {
464 Values0 = _mm_load_ps(in_ptr);
465 in_ptr += 4;
466 __VOLK_PREFETCH(in_ptr + 4);
467
468 Values1 = _mm_load_ps(in_ptr);
469 in_ptr += 4;
470 __VOLK_PREFETCH(in_ptr + 4);
471
472 float n = (float)number;
473 float n_plus_one = n + 1.f;
474 Reciprocal = _mm_set_ps1(1.f / (n * n_plus_one));
475
476 Sum0 = _mm_add_ps(Sum0, Values0);
477 Aux0 = _mm_set_ps1(n_plus_one);
478 SquareSum0 =
479 _mm_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
480
481 Sum1 = _mm_add_ps(Sum1, Values1);
482 Aux1 = _mm_set_ps1(n_plus_one);
483 SquareSum1 =
484 _mm_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
485 }
486
487 _mm_store_ps(&SumLocal[0], Sum0);
488 _mm_store_ps(&SumLocal[4], Sum1);
489 _mm_store_ps(&SquareSumLocal[0], SquareSum0);
490 _mm_store_ps(&SquareSumLocal[4], SquareSum1);
491
492 accrue_result(SquareSumLocal, SumLocal, 8, eigth_points);
493
494 uint32_t points_done = eigth_points * 8;
495
496 for (; points_done < num_points; points_done++) {
497 float val = (*in_ptr++);
498 SumLocal[0] += val;
499 SquareSumLocal[0] =
500 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
501 }
502
503 *stddev = sqrtf(SquareSumLocal[0] / num_points);
504 *mean = SumLocal[0] / num_points;
505}
506#endif /* LV_HAVE_SSE */
507
508#ifdef LV_HAVE_AVX
509#include <immintrin.h>
510
511static inline void volk_32f_stddev_and_mean_32f_x2_a_avx(float* stddev,
512 float* mean,
513 const float* inputBuffer,
514 unsigned int num_points)
515{
516 if (num_points < 16) {
517 volk_32f_stddev_and_mean_32f_x2_generic(stddev, mean, inputBuffer, num_points);
518 return;
519 }
520
521 const float* in_ptr = inputBuffer;
522
523 __VOLK_ATTR_ALIGNED(32) float SumLocal[16] = { 0.f };
524 __VOLK_ATTR_ALIGNED(32) float SquareSumLocal[16] = { 0.f };
525
526 const unsigned int sixteenth_points = num_points / 16;
527
528 __m256 Sum0 = _mm256_load_ps(in_ptr);
529 in_ptr += 8;
530 __m256 Sum1 = _mm256_load_ps(in_ptr);
531 in_ptr += 8;
532
533 __m256 SquareSum0 = _mm256_setzero_ps();
534 __m256 SquareSum1 = _mm256_setzero_ps();
535 __m256 Values0, Values1;
536 __m256 Aux0, Aux1;
537 __m256 Reciprocal;
538
539 for (uint32_t number = 1; number < sixteenth_points; number++) {
540 Values0 = _mm256_load_ps(in_ptr);
541 in_ptr += 8;
542 __VOLK_PREFETCH(in_ptr + 8);
543
544 Values1 = _mm256_load_ps(in_ptr);
545 in_ptr += 8;
546 __VOLK_PREFETCH(in_ptr + 8);
547
548 float n = (float)number;
549 float n_plus_one = n + 1.f;
550
551 Reciprocal = _mm256_set1_ps(1.f / (n * n_plus_one));
552
553 Sum0 = _mm256_add_ps(Sum0, Values0);
554 Aux0 = _mm256_set1_ps(n_plus_one);
555 SquareSum0 =
556 _mm256_accumulate_square_sum_ps(SquareSum0, Sum0, Values0, Reciprocal, Aux0);
557
558 Sum1 = _mm256_add_ps(Sum1, Values1);
559 Aux1 = _mm256_set1_ps(n_plus_one);
560 SquareSum1 =
561 _mm256_accumulate_square_sum_ps(SquareSum1, Sum1, Values1, Reciprocal, Aux1);
562 }
563
564 _mm256_store_ps(&SumLocal[0], Sum0);
565 _mm256_store_ps(&SumLocal[8], Sum1);
566 _mm256_store_ps(&SquareSumLocal[0], SquareSum0);
567 _mm256_store_ps(&SquareSumLocal[8], SquareSum1);
568
569 accrue_result(SquareSumLocal, SumLocal, 16, sixteenth_points);
570
571 uint32_t points_done = sixteenth_points * 16;
572
573 for (; points_done < num_points; points_done++) {
574 float val = (*in_ptr++);
575 SumLocal[0] += val;
576 SquareSumLocal[0] =
577 update_square_sum_1_val(SquareSumLocal[0], SumLocal[0], points_done, val);
578 }
579
580 *stddev = sqrtf(SquareSumLocal[0] / num_points);
581 *mean = SumLocal[0] / num_points;
582}
583#endif /* LV_HAVE_AVX */
584
585#endif /* INCLUDED_volk_32f_stddev_and_mean_32f_x2_a_H */
val
Definition: volk_arch_defs.py:66
static void volk_32f_stddev_and_mean_32f_x2_u_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:281
static float add_square_sums(const float SquareSum0, const float Sum0, const float SquareSum1, const float Sum1, const uint32_t len)
Definition: volk_32f_stddev_and_mean_32f_x2.h:150
static void accrue_result(float *PartialSquareSums, float *PartialSums, const uint32_t NumberOfPartitions, const uint32_t PartitionLen)
Definition: volk_32f_stddev_and_mean_32f_x2.h:161
static void volk_32f_stddev_and_mean_32f_x2_u_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:358
static void volk_32f_stddev_and_mean_32f_x2_generic(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:85
static void volk_32f_stddev_and_mean_32f_x2_a_avx(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:511
static void volk_32f_stddev_and_mean_32f_x2_neon(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:198
static void volk_32f_stddev_and_mean_32f_x2_a_sse(float *stddev, float *mean, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_stddev_and_mean_32f_x2.h:435
static float update_square_sum_1_val(const float SquareSum, const float Sum, const uint32_t len, const float val)
Definition: volk_32f_stddev_and_mean_32f_x2.h:138
static __m256 _mm256_accumulate_square_sum_ps(__m256 sq_acc, __m256 acc, __m256 val, __m256 rec, __m256 aux)
Definition: volk_avx_intrinsics.h:198
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
static float32x4_t _neon_accumulate_square_sum_f32(float32x4_t sq_acc, float32x4_t acc, float32x4_t val, float32x4_t rec, float32x4_t aux)
Definition: volk_neon_intrinsics.h:280
static __m128 _mm_accumulate_square_sum_ps(__m128 sq_acc, __m128 acc, __m128 val, __m128 rec, __m128 aux)
Definition: volk_sse_intrinsics.h:62