#ifndef INCLUDED_volk_32f_x2_pow_32f_a_H
#define INCLUDED_volk_32f_x2_pow_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
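/*
 * Each kernel below computes c[i] = a[i]^b[i] as exp(b[i] * ln(a[i])):
 *  - ln(a) is recovered from the IEEE-754 bit layout: the unbiased exponent
 *    field gives the integer part of log2(a), a polynomial of degree
 *    POW_POLY_DEGREE evaluated on the mantissa (normalized into [1, 2))
 *    supplies the fractional part, and the sum is scaled by ln(2).
 *  - exp(x) uses Cephes-style range reduction x = n*ln(2) + r: a degree-5
 *    polynomial approximates exp(r), and 2^n is applied by writing n + 127
 *    directly into the exponent bits of a float.
 */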
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
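/* The POLYn macros expand to a Horner evaluation of a degree-n polynomial,
 * one fused multiply-add per degree, e.g.
 *   POLY2_AVX2_FMA(x, c0, c1, c2) == (c2 * x + c1) * x + c0. */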
static inline void volk_32f_x2_pow_32f_a_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
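    /* exp_hi/exp_lo clamp the exp() argument to the finite range of a float;
     * exp_C1 + exp_C2 split ln(2) into a coarse term and a correction term so
     * the range reduction loses less precision; exp_p0..exp_p5 are the
     * Cephes-style expf() polynomial coefficients. */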
    for (; number < eighthPoints; number++) {
        // First compute the logarithm: extract the biased exponent field
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        // Normalize the mantissa into [1, 2) by OR-ing in an exponent of zero
        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif
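        /* ln(a) = (mantissa-polynomial * (frac - 1) + integer exponent) * ln(2).
         * Worked example: for a = 8.0f the exponent field yields 3 and
         * frac == 1.0f, so the polynomial term vanishes and the two lines
         * below leave logarithm = 3 * ln(2). */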
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a)); clamp to the representable range first
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        // fx = floor(bVal * log2(e) + 0.5): subtract 1 where truncation rounded up
        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        // Range-reduce: bVal -= fx * ln(2), using the split constant for accuracy
        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        // Build 2^fx by placing (fx + 127) into the float exponent bits
        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
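/* Non-FMA variant of the same Horner scheme, with separate multiply and add;
 * results may differ from the FMA path in the last ulp. */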
static inline void volk_32f_x2_pow_32f_a_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_load_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_load_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_store_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 */
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_x2_pow_32f_a_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);
    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_load_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_load_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_store_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 */

#endif /* INCLUDED_volk_32f_x2_pow_32f_a_H */
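/* The kernels below are the unaligned (_u) variants: the math is identical
 * to the aligned (_a) kernels above, but they use unaligned loads/stores
 * (loadu/storeu) so the buffers need not be aligned to the SIMD width. */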
#ifndef INCLUDED_volk_32f_x2_pow_32f_u_H
#define INCLUDED_volk_32f_x2_pow_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#define POW_POLY_DEGREE 3
#ifdef LV_HAVE_GENERIC

static inline void volk_32f_x2_pow_32f_generic(float* cVector,
                                               const float* bVector,
                                               const float* aVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
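/* A minimal usage sketch, assuming the usual VOLK dispatcher and allocation
 * helpers (volk_32f_x2_pow_32f, volk_get_alignment, volk_malloc, volk_free);
 * the buffer names and point count here are illustrative only:
 *
 *   unsigned int N = 1024;
 *   size_t alignment = volk_get_alignment();
 *   float* bases = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* exponents = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   // ... fill bases (aVector) and exponents (bVector) ...
 *   volk_32f_x2_pow_32f(out, exponents, bases, N); // out[i] = bases[i]^exponents[i]
 *   volk_free(bases);
 *   volk_free(exponents);
 *   volk_free(out);
 */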
#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

#define POLY0(x, c0) _mm_set1_ps(c0)
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
#define POLY3(x, c0, c1, c2, c3) \
    _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
#define POLY4(x, c0, c1, c2, c3, c4) \
    _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
#define POLY5(x, c0, c1, c2, c3, c4, c5) \
    _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
static inline void volk_32f_x2_pow_32f_u_sse4_1(float* cVector,
                                                const float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    __m128 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m128 tmp, fx, mask, pow2n, z, y;
    __m128 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m128i bias, exp, emm0, pi32_0x7f;

    one = _mm_set1_ps(1.0);
    exp_hi = _mm_set1_ps(88.3762626647949);
    exp_lo = _mm_set1_ps(-88.3762626647949);
    ln2 = _mm_set1_ps(0.6931471805);
    log2EF = _mm_set1_ps(1.44269504088896341);
    half = _mm_set1_ps(0.5);
    exp_C1 = _mm_set1_ps(0.693359375);
    exp_C2 = _mm_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm_set1_epi32(0x7f);

    exp_p0 = _mm_set1_ps(1.9875691500e-4);
    exp_p1 = _mm_set1_ps(1.3981999507e-3);
    exp_p2 = _mm_set1_ps(8.3334519073e-3);
    exp_p3 = _mm_set1_ps(4.1665795894e-2);
    exp_p4 = _mm_set1_ps(1.6666665459e-1);
    exp_p5 = _mm_set1_ps(5.0000001201e-1);
    for (; number < quarterPoints; number++) {
        // First compute the logarithm
        aVal = _mm_loadu_ps(aPtr);
        bias = _mm_set1_epi32(127);
        leadingOne = _mm_set1_ps(1.0f);
        exp = _mm_sub_epi32(
            _mm_srli_epi32(
                _mm_and_si128(_mm_castps_si128(aVal), _mm_set1_epi32(0x7f800000)), 23),
            bias);
        logarithm = _mm_cvtepi32_ps(exp);

        frac = _mm_or_ps(leadingOne,
                         _mm_and_ps(aVal, _mm_castsi128_ps(_mm_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5(frac,
                         3.1157899f,
                         -3.3241990f,
                         2.5988452f,
                         -1.2315303f,
                         3.1821337e-1f,
                         -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4(frac,
                         2.8882704548164776201f,
                         -2.52074962577807006663f,
                         1.48116647521213171641f,
                         -0.465725644288844778798f,
                         0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3(frac,
                         2.61761038894603480148f,
                         -1.75647175389045657003f,
                         0.688243882994381274313f,
                         -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2(frac,
                         2.28330284476918490682f,
                         -1.04913055217340124191f,
                         0.204446009836232697516f);
#else
#error
#endif
        logarithm =
            _mm_add_ps(logarithm, _mm_mul_ps(mantissa, _mm_sub_ps(frac, leadingOne)));
        logarithm = _mm_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm_loadu_ps(bPtr);
        bVal = _mm_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm_max_ps(_mm_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm_add_ps(_mm_mul_ps(bVal, log2EF), half);

        emm0 = _mm_cvttps_epi32(fx);
        tmp = _mm_cvtepi32_ps(emm0);

        mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
        fx = _mm_sub_ps(tmp, mask);

        tmp = _mm_mul_ps(fx, exp_C1);
        z = _mm_mul_ps(fx, exp_C2);
        bVal = _mm_sub_ps(_mm_sub_ps(bVal, tmp), z);
        z = _mm_mul_ps(bVal, bVal);

        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, bVal), exp_p1), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), bVal), exp_p3);
        y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, bVal), exp_p4), bVal);
        y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), bVal);
        y = _mm_add_ps(y, one);

        emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm_castsi128_ps(emm0);
        cVal = _mm_mul_ps(y, pow2n);

        _mm_storeu_ps(cPtr, cVal);

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 */
#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

#define POLY0_AVX2_FMA(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2_FMA(x, c0, c1) \
    _mm256_fmadd_ps(POLY0_AVX2_FMA(x, c1), x, _mm256_set1_ps(c0))
#define POLY2_AVX2_FMA(x, c0, c1, c2) \
    _mm256_fmadd_ps(POLY1_AVX2_FMA(x, c1, c2), x, _mm256_set1_ps(c0))
#define POLY3_AVX2_FMA(x, c0, c1, c2, c3) \
    _mm256_fmadd_ps(POLY2_AVX2_FMA(x, c1, c2, c3), x, _mm256_set1_ps(c0))
#define POLY4_AVX2_FMA(x, c0, c1, c2, c3, c4) \
    _mm256_fmadd_ps(POLY3_AVX2_FMA(x, c1, c2, c3, c4), x, _mm256_set1_ps(c0))
#define POLY5_AVX2_FMA(x, c0, c1, c2, c3, c4, c5) \
    _mm256_fmadd_ps(POLY4_AVX2_FMA(x, c1, c2, c3, c4, c5), x, _mm256_set1_ps(c0))
static inline void volk_32f_x2_pow_32f_u_avx2_fma(float* cVector,
                                                  const float* bVector,
                                                  const float* aVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2_FMA(frac,
                                  3.1157899f,
                                  -3.3241990f,
                                  2.5988452f,
                                  -1.2315303f,
                                  3.1821337e-1f,
                                  -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2_FMA(frac,
                                  2.8882704548164776201f,
                                  -2.52074962577807006663f,
                                  1.48116647521213171641f,
                                  -0.465725644288844778798f,
                                  0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2_FMA(frac,
                                  2.61761038894603480148f,
                                  -1.75647175389045657003f,
                                  0.688243882994381274313f,
                                  -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2_FMA(frac,
                                  2.28330284476918490682f,
                                  -1.04913055217340124191f,
                                  0.204446009836232697516f);
#else
#error
#endif
        logarithm = _mm256_fmadd_ps(mantissa, _mm256_sub_ps(frac, leadingOne), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_fmadd_ps(bVal, log2EF, half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_fnmadd_ps(fx, exp_C1, bVal);
        bVal = _mm256_fnmadd_ps(fx, exp_C2, tmp);
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_fmadd_ps(exp_p0, bVal, exp_p1);
        y = _mm256_fmadd_ps(y, bVal, exp_p2);
        y = _mm256_fmadd_ps(y, bVal, exp_p3);
        y = _mm256_fmadd_ps(y, bVal, exp_p4);
        y = _mm256_fmadd_ps(y, bVal, exp_p5);
        y = _mm256_fmadd_ps(y, z, bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA */
#ifdef LV_HAVE_AVX2
#include <immintrin.h>

#define POLY0_AVX2(x, c0) _mm256_set1_ps(c0)
#define POLY1_AVX2(x, c0, c1) \
    _mm256_add_ps(_mm256_mul_ps(POLY0_AVX2(x, c1), x), _mm256_set1_ps(c0))
#define POLY2_AVX2(x, c0, c1, c2) \
    _mm256_add_ps(_mm256_mul_ps(POLY1_AVX2(x, c1, c2), x), _mm256_set1_ps(c0))
#define POLY3_AVX2(x, c0, c1, c2, c3) \
    _mm256_add_ps(_mm256_mul_ps(POLY2_AVX2(x, c1, c2, c3), x), _mm256_set1_ps(c0))
#define POLY4_AVX2(x, c0, c1, c2, c3, c4) \
    _mm256_add_ps(_mm256_mul_ps(POLY3_AVX2(x, c1, c2, c3, c4), x), _mm256_set1_ps(c0))
#define POLY5_AVX2(x, c0, c1, c2, c3, c4, c5) \
    _mm256_add_ps(_mm256_mul_ps(POLY4_AVX2(x, c1, c2, c3, c4, c5), x), _mm256_set1_ps(c0))
static inline void volk_32f_x2_pow_32f_u_avx2(float* cVector,
                                              const float* bVector,
                                              const float* aVector,
                                              unsigned int num_points)
{
    float* cPtr = cVector;
    const float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    __m256 aVal, bVal, cVal, logarithm, mantissa, frac, leadingOne;
    __m256 tmp, fx, mask, pow2n, z, y;
    __m256 one, exp_hi, exp_lo, ln2, log2EF, half, exp_C1, exp_C2;
    __m256 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
    __m256i bias, exp, emm0, pi32_0x7f;

    one = _mm256_set1_ps(1.0);
    exp_hi = _mm256_set1_ps(88.3762626647949);
    exp_lo = _mm256_set1_ps(-88.3762626647949);
    ln2 = _mm256_set1_ps(0.6931471805);
    log2EF = _mm256_set1_ps(1.44269504088896341);
    half = _mm256_set1_ps(0.5);
    exp_C1 = _mm256_set1_ps(0.693359375);
    exp_C2 = _mm256_set1_ps(-2.12194440e-4);
    pi32_0x7f = _mm256_set1_epi32(0x7f);

    exp_p0 = _mm256_set1_ps(1.9875691500e-4);
    exp_p1 = _mm256_set1_ps(1.3981999507e-3);
    exp_p2 = _mm256_set1_ps(8.3334519073e-3);
    exp_p3 = _mm256_set1_ps(4.1665795894e-2);
    exp_p4 = _mm256_set1_ps(1.6666665459e-1);
    exp_p5 = _mm256_set1_ps(5.0000001201e-1);
    for (; number < eighthPoints; number++) {
        // First compute the logarithm
        aVal = _mm256_loadu_ps(aPtr);
        bias = _mm256_set1_epi32(127);
        leadingOne = _mm256_set1_ps(1.0f);
        exp = _mm256_sub_epi32(
            _mm256_srli_epi32(_mm256_and_si256(_mm256_castps_si256(aVal),
                                               _mm256_set1_epi32(0x7f800000)),
                              23),
            bias);
        logarithm = _mm256_cvtepi32_ps(exp);

        frac = _mm256_or_ps(
            leadingOne,
            _mm256_and_ps(aVal, _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffff))));
#if POW_POLY_DEGREE == 6
        mantissa = POLY5_AVX2(frac,
                              3.1157899f,
                              -3.3241990f,
                              2.5988452f,
                              -1.2315303f,
                              3.1821337e-1f,
                              -3.4436006e-2f);
#elif POW_POLY_DEGREE == 5
        mantissa = POLY4_AVX2(frac,
                              2.8882704548164776201f,
                              -2.52074962577807006663f,
                              1.48116647521213171641f,
                              -0.465725644288844778798f,
                              0.0596515482674574969533f);
#elif POW_POLY_DEGREE == 4
        mantissa = POLY3_AVX2(frac,
                              2.61761038894603480148f,
                              -1.75647175389045657003f,
                              0.688243882994381274313f,
                              -0.107254423828329604454f);
#elif POW_POLY_DEGREE == 3
        mantissa = POLY2_AVX2(frac,
                              2.28330284476918490682f,
                              -1.04913055217340124191f,
                              0.204446009836232697516f);
#else
#error
#endif
        logarithm = _mm256_add_ps(
            _mm256_mul_ps(mantissa, _mm256_sub_ps(frac, leadingOne)), logarithm);
        logarithm = _mm256_mul_ps(logarithm, ln2);

        // Now calculate b * ln(a)
        bVal = _mm256_loadu_ps(bPtr);
        bVal = _mm256_mul_ps(bVal, logarithm);

        // Now compute exp(b * ln(a))
        bVal = _mm256_max_ps(_mm256_min_ps(bVal, exp_hi), exp_lo);

        fx = _mm256_add_ps(_mm256_mul_ps(bVal, log2EF), half);

        emm0 = _mm256_cvttps_epi32(fx);
        tmp = _mm256_cvtepi32_ps(emm0);

        mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), one);
        fx = _mm256_sub_ps(tmp, mask);

        tmp = _mm256_sub_ps(bVal, _mm256_mul_ps(fx, exp_C1));
        bVal = _mm256_sub_ps(tmp, _mm256_mul_ps(fx, exp_C2));
        z = _mm256_mul_ps(bVal, bVal);

        y = _mm256_add_ps(_mm256_mul_ps(exp_p0, bVal), exp_p1);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p2);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p3);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p4);
        y = _mm256_add_ps(_mm256_mul_ps(y, bVal), exp_p5);
        y = _mm256_add_ps(_mm256_mul_ps(y, z), bVal);
        y = _mm256_add_ps(y, one);

        emm0 =
            _mm256_slli_epi32(_mm256_add_epi32(_mm256_cvttps_epi32(fx), pi32_0x7f), 23);

        pow2n = _mm256_castsi256_ps(emm0);
        cVal = _mm256_mul_ps(y, pow2n);

        _mm256_storeu_ps(cPtr, cVal);

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = powf(*aPtr++, *bPtr++);
    }
}

#endif /* LV_HAVE_AVX2 */

#endif /* INCLUDED_volk_32f_x2_pow_32f_u_H */