78#ifndef INCLUDED_volk_32f_acos_32f_a_H
79#define INCLUDED_volk_32f_acos_32f_a_H
81#if LV_HAVE_AVX2 && LV_HAVE_FMA
/*
 * Arccosine of each float read through aPtr, written through bPtr: 8 lanes
 * per iteration with AVX2/FMA intrinsics, then a scalar tail.
 *
 * bVector    output buffer; written with _mm256_store_ps, the aligned-address
 *            store.
 * num_points number of floats to process.
 * (aVector, the input buffer, is used below but its parameter line is absent
 * from this listing; it is read with _mm256_load_ps, the aligned-address load.)
 *
 * NOTE(review): this listing appears to have dropped lines. The braces, the
 * declarations of i and j, the assignments that seed d, z, y and arccosine,
 * and any pointer advance after the store are not visible. The comments below
 * describe only the statements that are shown.
 */
84static inline void volk_32f_acos_32f_a_avx2_fma(
float* bVector,
86 unsigned int num_points)
88 float* bPtr = bVector;
89 const float* aPtr = aVector;
91 unsigned int number = 0;
92 unsigned int eighthPoints = num_points / 8;
95 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
96 __m256 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
98 pi = _mm256_set1_ps(3.14159265358979323846);
99 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
100 fzeroes = _mm256_setzero_ps();
101 fones = _mm256_set1_ps(1.0);
102 ftwos = _mm256_set1_ps(2.0);
103 ffours = _mm256_set1_ps(4.0);
// Vector loop: one __m256 (8 floats) per iteration.
105 for (; number < eighthPoints; number++) {
106 aVal = _mm256_load_ps(aPtr);
// aVal <- sqrt((1 + a) * (1 - a)) / <divisor>. The divisor operand that closes
// this _mm256_div_ps is on a line missing from the listing.
108 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
109 _mm256_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
112 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
113 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
// Lanes with z < 1: z + (1/z - z) = 1/z; other lanes keep z. Only the tail of
// this statement is visible; its assignment target is on a missing line.
114 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
116 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
// Two rounds of x <- x + sqrt(x*x + 1), followed by x <- 1/x.
118 for (
i = 0;
i < 2;
i++)
119 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
120 x = _mm256_div_ps(fones, x);
// Tail of a fused multiply-add: y * x^2 + (-1)^j / (2j + 1). The call's first
// line and the loop over j that presumably wraps it are not visible here.
124 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
126 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y (fnmadd computes -(y*2) + pi/2).
127 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
129 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate.
131 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
132 arccosine = _mm256_sub_ps(
133 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
134 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
135 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
137 _mm256_store_ps(bPtr, arccosine);
// Scalar tail for the num_points % 8 elements the vector loop did not cover.
142 number = eighthPoints * 8;
143 for (; number < num_points; number++) {
// NOTE(review): acos is the double-precision libm routine; the SSE4.1 and
// generic kernels in this file call acosf here. Confirm which is intended.
144 *bPtr++ = acos(*aPtr++);
152#include <immintrin.h>
/*
 * Body of an AVX arccosine kernel: 8 floats per iteration using
 * _mm256_load_ps / _mm256_store_ps (the aligned-address forms), then a scalar
 * tail.
 *
 * NOTE(review): the signature is not present in this listing. The
 * cross-reference entries at the end of the listing place
 * volk_32f_acos_32f_a_avx(float *bVector, const float *aVector,
 * unsigned int num_points) at original line 155, which presumably is this
 * body; confirm. Braces, the declarations of i and j, the assignments that
 * seed d, z, y and arccosine, and any pointer advance after the store are also
 * not visible. The comments below describe only the statements that are shown.
 */
157 float* bPtr = bVector;
158 const float* aPtr = aVector;
160 unsigned int number = 0;
161 unsigned int eighthPoints = num_points / 8;
164 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
165 __m256 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
167 pi = _mm256_set1_ps(3.14159265358979323846);
168 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
169 fzeroes = _mm256_setzero_ps();
170 fones = _mm256_set1_ps(1.0);
171 ftwos = _mm256_set1_ps(2.0);
172 ffours = _mm256_set1_ps(4.0);
// Vector loop: one __m256 (8 floats) per iteration.
174 for (; number < eighthPoints; number++) {
175 aVal = _mm256_load_ps(aPtr);
// aVal <- sqrt((1 + a) * (1 - a)) / <divisor>. The divisor operand that closes
// this _mm256_div_ps is on a line missing from the listing.
177 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
178 _mm256_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
181 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
182 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
// Lanes with z < 1: z + (1/z - z) = 1/z; other lanes keep z. Only the tail of
// this statement is visible; its assignment target is on a missing line.
183 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
185 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
// Loop runs twice. Its body is the tail of a statement whose first line is not
// visible; the part shown evaluates sqrt(1 + x*x). Afterwards x <- 1/x.
187 for (
i = 0;
i < 2;
i++)
189 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
190 x = _mm256_div_ps(fones, x);
// y <- y * x^2 + (-1)^j / (2j + 1). The loop over j that presumably wraps this
// statement is not visible here.
193 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
194 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
196 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y. Only the tail of the statement
// is visible; its assignment target is on a missing line.
197 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
200 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate.
202 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
203 arccosine = _mm256_sub_ps(
204 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
205 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
206 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
208 _mm256_store_ps(bPtr, arccosine);
// Scalar tail for the num_points % 8 elements the vector loop did not cover.
213 number = eighthPoints * 8;
214 for (; number < num_points; number++) {
// NOTE(review): acos is the double-precision libm routine; the SSE4.1 and
// generic kernels in this file call acosf here. Confirm which is intended.
215 *bPtr++ = acos(*aPtr++);
222#include <smmintrin.h>
/*
 * Arccosine of each float in aVector, written to bVector: 4 lanes per
 * iteration with SSE intrinsics, then a scalar acosf tail.
 *
 * bVector    output buffer; written with _mm_store_ps, the aligned-address
 *            store.
 * aVector    input buffer; read with _mm_load_ps, the aligned-address load.
 * num_points number of floats to process.
 *
 * NOTE(review): this listing appears to have dropped lines. The return type
 * and storage class, the braces, the declarations of i and j, the assignments
 * that seed d, z, y and arccosine, and any pointer advance after the store are
 * not visible. The comments below describe only the statements that are shown.
 */
225volk_32f_acos_32f_a_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
227 float* bPtr = bVector;
228 const float* aPtr = aVector;
230 unsigned int number = 0;
231 unsigned int quarterPoints = num_points / 4;
234 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
235 __m128 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
237 pi = _mm_set1_ps(3.14159265358979323846);
238 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
239 fzeroes = _mm_setzero_ps();
240 fones = _mm_set1_ps(1.0);
241 ftwos = _mm_set1_ps(2.0);
242 ffours = _mm_set1_ps(4.0);
// Vector loop: one __m128 (4 floats) per iteration.
244 for (; number < quarterPoints; number++) {
245 aVal = _mm_load_ps(aPtr);
// Middle of a statement: sqrt((1 + a) * (1 - a)) passed as an argument. The
// enclosing call and its remaining operand are on lines missing from the
// listing.
248 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
251 condition = _mm_cmplt_ps(z, fzeroes);
252 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
// Lanes with z < 1: x <- z + (1/z - z) = 1/z; other lanes get x <- z.
253 condition = _mm_cmplt_ps(z, fones);
254 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
// Two rounds of x <- x + sqrt(1 + x*x), followed by x <- 1/x.
256 for (
i = 0;
i < 2;
i++)
257 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
258 x = _mm_div_ps(fones, x);
// y <- y * x^2 + (-1)^j / (2j + 1). The loop over j that presumably wraps this
// statement is not visible here.
261 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
262 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
264 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y.
265 condition = _mm_cmpgt_ps(z, fones);
267 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate. The assignment
// target of this expression is on a missing line.
269 condition = _mm_cmplt_ps(aVal, fzeroes);
271 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
272 condition = _mm_cmplt_ps(d, fzeroes);
273 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
275 _mm_store_ps(bPtr, arccosine);
// Scalar tail for the num_points % 4 elements the vector loop did not cover.
280 number = quarterPoints * 4;
281 for (; number < num_points; number++) {
282 *bPtr++ = acosf(*aPtr++);
291#ifndef INCLUDED_volk_32f_acos_32f_u_H
292#define INCLUDED_volk_32f_acos_32f_u_H
294#if LV_HAVE_AVX2 && LV_HAVE_FMA
295#include <immintrin.h>
/*
 * Arccosine of each float in aVector, written to bVector: 8 lanes per
 * iteration with AVX2/FMA intrinsics, then a scalar tail.
 *
 * bVector    output buffer; written with _mm256_storeu_ps, the
 *            unaligned-address store.
 * aVector    input buffer; read with _mm256_loadu_ps, the unaligned-address
 *            load.
 * num_points number of floats to process.
 *
 * NOTE(review): this listing appears to have dropped lines. The braces, the
 * declarations of i and j, the assignments that seed d, z, y and arccosine,
 * and any pointer advance after the store are not visible. The comments below
 * describe only the statements that are shown.
 */
297static inline void volk_32f_acos_32f_u_avx2_fma(
float* bVector,
298 const float* aVector,
299 unsigned int num_points)
301 float* bPtr = bVector;
302 const float* aPtr = aVector;
304 unsigned int number = 0;
305 unsigned int eighthPoints = num_points / 8;
308 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
309 __m256 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
311 pi = _mm256_set1_ps(3.14159265358979323846);
312 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
313 fzeroes = _mm256_setzero_ps();
314 fones = _mm256_set1_ps(1.0);
315 ftwos = _mm256_set1_ps(2.0);
316 ffours = _mm256_set1_ps(4.0);
// Vector loop: one __m256 (8 floats) per iteration.
318 for (; number < eighthPoints; number++) {
319 aVal = _mm256_loadu_ps(aPtr);
// aVal <- sqrt((1 + a) * (1 - a)) / <divisor>. The divisor operand that closes
// this _mm256_div_ps is on a line missing from the listing.
321 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
322 _mm256_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
325 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
326 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
// Lanes with z < 1: z + (1/z - z) = 1/z; other lanes keep z. Only the tail of
// this statement is visible; its assignment target is on a missing line.
327 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
329 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
// Two rounds of x <- x + sqrt(x*x + 1), followed by x <- 1/x.
331 for (
i = 0;
i < 2;
i++)
332 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
333 x = _mm256_div_ps(fones, x);
// Tail of a fused multiply-add: y * x^2 + (-1)^j / (2j + 1). The call's first
// line and the loop over j that presumably wraps it are not visible here.
337 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
339 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y (fnmadd computes -(y*2) + pi/2).
340 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
342 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate.
344 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
345 arccosine = _mm256_sub_ps(
346 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
347 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
348 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
350 _mm256_storeu_ps(bPtr, arccosine);
// Scalar tail for the num_points % 8 elements the vector loop did not cover.
355 number = eighthPoints * 8;
356 for (; number < num_points; number++) {
// NOTE(review): acos is the double-precision libm routine; the SSE4.1 and
// generic kernels in this file call acosf here. Confirm which is intended.
357 *bPtr++ = acos(*aPtr++);
365#include <immintrin.h>
/*
 * Body of an AVX arccosine kernel: 8 floats per iteration using
 * _mm256_loadu_ps / _mm256_storeu_ps (the unaligned-address forms), then a
 * scalar tail.
 *
 * NOTE(review): the signature is not present in this listing. The
 * cross-reference entries at the end of the listing place
 * volk_32f_acos_32f_u_avx(float *bVector, const float *aVector,
 * unsigned int num_points) at original line 368, which presumably is this
 * body; confirm. Braces, the declarations of i and j, the assignments that
 * seed d, z, y and arccosine, and any pointer advance after the store are also
 * not visible. The comments below describe only the statements that are shown.
 */
370 float* bPtr = bVector;
371 const float* aPtr = aVector;
373 unsigned int number = 0;
374 unsigned int eighthPoints = num_points / 8;
377 __m256 aVal, d, pi, pio2, x, y, z, arccosine;
378 __m256 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
380 pi = _mm256_set1_ps(3.14159265358979323846);
381 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
382 fzeroes = _mm256_setzero_ps();
383 fones = _mm256_set1_ps(1.0);
384 ftwos = _mm256_set1_ps(2.0);
385 ffours = _mm256_set1_ps(4.0);
// Vector loop: one __m256 (8 floats) per iteration.
387 for (; number < eighthPoints; number++) {
388 aVal = _mm256_loadu_ps(aPtr);
// aVal <- sqrt((1 + a) * (1 - a)) / <divisor>. The divisor operand that closes
// this _mm256_div_ps is on a line missing from the listing.
390 aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
391 _mm256_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
394 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
395 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
// Lanes with z < 1: z + (1/z - z) = 1/z; other lanes keep z. Only the tail of
// this statement is visible; its assignment target is on a missing line.
396 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
398 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
// Loop runs twice. Its body is the tail of a statement whose first line is not
// visible; the part shown evaluates sqrt(1 + x*x). Afterwards x <- 1/x.
400 for (
i = 0;
i < 2;
i++)
402 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
403 x = _mm256_div_ps(fones, x);
// y <- y * x^2 + (-1)^j / (2j + 1). The loop over j that presumably wraps this
// statement is not visible here.
406 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
407 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
409 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y. Only the tail of the statement
// is visible; its assignment target is on a missing line.
410 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
413 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate.
415 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
416 arccosine = _mm256_sub_ps(
417 arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
418 condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
419 arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));
421 _mm256_storeu_ps(bPtr, arccosine);
// Scalar tail for the num_points % 8 elements the vector loop did not cover.
426 number = eighthPoints * 8;
427 for (; number < num_points; number++) {
// NOTE(review): acos is the double-precision libm routine; the SSE4.1 and
// generic kernels in this file call acosf here. Confirm which is intended.
428 *bPtr++ = acos(*aPtr++);
435#include <smmintrin.h>
/*
 * Arccosine of each float in aVector, written to bVector: 4 lanes per
 * iteration with SSE intrinsics, then a scalar acosf tail.
 *
 * bVector    output buffer; written with _mm_storeu_ps, the unaligned-address
 *            store.
 * aVector    input buffer; read with _mm_loadu_ps, the unaligned-address load.
 * num_points number of floats to process.
 *
 * NOTE(review): this listing appears to have dropped lines. The return type
 * and storage class, the braces, the declarations of i and j, the assignments
 * that seed d, z, y and arccosine, and any pointer advance after the store are
 * not visible. The comments below describe only the statements that are shown.
 */
438volk_32f_acos_32f_u_sse4_1(
float* bVector,
const float* aVector,
unsigned int num_points)
440 float* bPtr = bVector;
441 const float* aPtr = aVector;
443 unsigned int number = 0;
444 unsigned int quarterPoints = num_points / 4;
447 __m128 aVal, d, pi, pio2, x, y, z, arccosine;
448 __m128 fzeroes, fones, ftwos, ffours, condition;
// Broadcast constants: pi, pi/2, 0, 1, 2 and 4 in every lane.
450 pi = _mm_set1_ps(3.14159265358979323846);
451 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
452 fzeroes = _mm_setzero_ps();
453 fones = _mm_set1_ps(1.0);
454 ftwos = _mm_set1_ps(2.0);
455 ffours = _mm_set1_ps(4.0);
// Vector loop: one __m128 (4 floats) per iteration.
457 for (; number < quarterPoints; number++) {
458 aVal = _mm_loadu_ps(aPtr);
// Middle of a statement: sqrt((1 + a) * (1 - a)) passed as an argument. The
// enclosing call and its remaining operand are on lines missing from the
// listing.
461 _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
// Lanes with z < 0: z - 2z, i.e. negate, so every lane ends up non-negative.
464 condition = _mm_cmplt_ps(z, fzeroes);
465 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
// Lanes with z < 1: x <- z + (1/z - z) = 1/z; other lanes get x <- z.
466 condition = _mm_cmplt_ps(z, fones);
467 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
// Two rounds of x <- x + sqrt(1 + x*x), followed by x <- 1/x.
469 for (
i = 0;
i < 2;
i++)
470 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
471 x = _mm_div_ps(fones, x);
// y <- y * x^2 + (-1)^j / (2j + 1). The loop over j that presumably wraps this
// statement is not visible here.
475 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
476 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
// y <- 4 * x * y.
478 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
// Lanes with z > 1: y + (pi/2 - 2y) = pi/2 - y.
479 condition = _mm_cmpgt_ps(z, fones);
481 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
// Lanes with aVal < 0: arccosine - 2*arccosine, i.e. negate. The assignment
// target of this expression is on a missing line.
483 condition = _mm_cmplt_ps(aVal, fzeroes);
485 _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
// Lanes with d < 0: add pi.
486 condition = _mm_cmplt_ps(d, fzeroes);
487 arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));
489 _mm_storeu_ps(bPtr, arccosine);
// Scalar tail for the num_points % 4 elements the vector loop did not cover.
494 number = quarterPoints * 4;
495 for (; number < num_points; number++) {
496 *bPtr++ = acosf(*aPtr++);
502#ifdef LV_HAVE_GENERIC
/*
 * Body of the portable scalar kernel: applies acosf to each of num_points
 * floats read through aPtr and writes the results through bPtr.
 *
 * NOTE(review): the signature and braces are not present in this listing. The
 * cross-reference entries at the end of the listing place
 * volk_32f_acos_32f_generic(float *bVector, const float *aVector,
 * unsigned int num_points) at original line 505, which presumably is this
 * body; confirm.
 */
507 float* bPtr = bVector;
508 const float* aPtr = aVector;
509 unsigned int number = 0;
// Both pointers advance by one float per iteration.
511 for (number = 0; number < num_points; number++) {
512 *bPtr++ = acosf(*aPtr++);
static void volk_32f_acos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:505
#define ACOS_TERMS
Definition: volk_32f_acos_32f.h:76
static void volk_32f_acos_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:368
static void volk_32f_acos_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_acos_32f.h:155
for i
Definition: volk_config_fixed.tmpl.h:25