Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_x2_pow_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

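/*
 * Each kernel below evaluates c[i] = pow(a[i], b[i]) as exp(b[i] * ln(a[i])).
 * ln(a) is recovered from the IEEE-754 bit pattern of a: the biased exponent
 * field supplies the integer part of log2(a), a minimax polynomial of degree
 * POW_POLY_DEGREE - 1 corrects for the mantissa, and a final multiply by
 * ln(2) converts the base-2 log to a natural log. exp() is then approximated
 * with a Cephes-style range reduction and a degree-5 polynomial. Raising
 * POW_POLY_DEGREE (3 through 6) trades speed for accuracy in the log stage.
 * The bit tricks assume normalized inputs a > 0; zero, negative, and
 * non-finite inputs are not handled. Callers normally invoke the generated
 * volk_32f_x2_pow_32f dispatcher rather than these suffixed variants.
 */
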
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

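/* Each POLYn macro expands to a Horner-scheme evaluation of a degree-n
 * polynomial, c0 + x*(c1 + x*(... + x*cn)), using one fused multiply-add
 * per degree. */
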
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
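        /* Extract the unbiased exponent: mask the eight exponent bits, shift
         * them below the 23 mantissa bits, and subtract the IEEE-754 bias of
         * 127. The result is floor(log2(aVal)) as an integer. */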
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

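        /* Rebuild the mantissa as a float in [1.0, 2.0): keep the 23 mantissa
         * bits of aVal and OR in the bit pattern of 1.0f (exponent = bias). */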
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

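        /* log2(aVal) = exponent + log2(frac); mantissa * (frac - 1)
         * approximates log2(frac) and vanishes exactly at frac == 1. The
         * multiply by ln(2) then converts the base-2 log to ln(aVal). */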
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

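        /* Range reduction for exp(x): choose n = round(x * log2(e)) so that
         * exp(x) = 2^n * exp(x - n * ln(2)). Adding 0.5 and truncating, then
         * subtracting 1 wherever truncation overshot (the mask), implements
         * the rounding. ln(2) is applied as exp_C1 + exp_C2, a split-constant
         * trick that keeps the reduced argument accurate. */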
        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

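        /* Construct 2^n in floating point: add the exponent bias (0x7f) to n
         * and shift it into the exponent field, then reinterpret as floats. */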
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */

#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3

#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
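
/*
 * A minimal scalar sketch of the same decomposition used by the SIMD kernels
 * in this file. This helper is hypothetical (not part of the VOLK API) and
 * leans on libm's expf() where the kernels inline their own exponential; the
 * coefficients shown are the POW_POLY_DEGREE == 3 set.
 */
static inline float volk_32f_x2_pow_32f_scalar_sketch(float a, float b)
{
    union {
        float f;
        uint32_t i;
    } u = { a };
    /* Biased exponent bits give the integer part of log2(a). */
    int32_t e = (int32_t)((u.i & 0x7f800000u) >> 23) - 127;
    /* Forcing the exponent field to the bias leaves the mantissa m in [1, 2). */
    u.i = (u.i & 0x007fffffu) | 0x3f800000u;
    float m = u.f;
    /* Degree-2 minimax correction; poly * (m - 1) approximates log2(m). */
    float poly = (0.204446009836232697516f * m - 1.04913055217340124191f) * m +
                 2.28330284476918490682f;
    float ln_a = ((float)e + poly * (m - 1.0f)) * 0.6931471805f;
    return expf(b * ln_a);
}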

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);

    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))

static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);

    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));

#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error "POW_POLY_DEGREE must be 3, 4, 5, or 6"
#endif

        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 for unaligned */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */