volk_32f_acos_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

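/* This header implements the protokernels behind volk_32f_acos_32f, which computes the
 * arccosine of each entry of the input vector: bVector[i] = acos(aVector[i]), with
 * inputs expected in [-1, 1] and results in [0, pi].
 *
 * A minimal usage sketch through the VOLK dispatcher (assuming the usual runtime
 * helpers volk_malloc(), volk_get_alignment() and volk_free() from <volk/volk.h>):
 *
 *   unsigned int N = 10;
 *   size_t alignment = volk_get_alignment();
 *   float* in = (float*)volk_malloc(sizeof(float) * N, alignment);
 *   float* out = (float*)volk_malloc(sizeof(float) * N, alignment);
 *
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       in[ii] = 2.f * ((float)ii / (float)N) - 1.f;  // sample points in [-1, 1)
 *   }
 *   volk_32f_acos_32f(out, in, N);                    // picks the best kernel at runtime
 *   for (unsigned int ii = 0; ii < N; ++ii) {
 *       printf("acos(%+1.2f) = %+1.2f\n", in[ii], out[ii]);
 *   }
 *
 *   volk_free(in);
 *   volk_free(out);
 */
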
/* This is the number of terms of the Taylor series to evaluate; increase it for more
 * accuracy. */
#define ACOS_TERMS 2

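/* In outline, the SIMD kernels below share one approximation scheme: the input a is
 * mapped to z = sqrt((1 + a) * (1 - a)) / a, turning acos(a) into an arctangent (with
 * sign and +pi fix-ups for negative inputs); z is folded into [1, inf) via the
 * reciprocal identity, shrunk by two tan-half-angle style steps (x += sqrt(1 + x*x),
 * then x = 1/x), and atan is evaluated with the truncated Taylor series
 *     atan(x) ~= sum_{j=0}^{ACOS_TERMS-1} (-1)^j * x^(2j+1) / (2j+1),
 * after which the ffours and pio2 terms undo the reduction. Raising ACOS_TERMS adds
 * one multiply-add per extra term per vector in exchange for accuracy. */
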
#ifndef INCLUDED_volk_32f_acos_32f_a_H
#define INCLUDED_volk_32f_acos_32f_a_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

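/* Aligned AVX2+FMA kernel: consumes 8 floats per iteration from 32-byte aligned
 * buffers (via _mm256_load_ps/_mm256_store_ps), applies the vectorized approximation
 * sketched above, and handles the remaining num_points % 8 elements with scalar
 * acosf(). The other SIMD variants in this header differ only in the instruction set
 * used and in whether aligned or unaligned loads/stores are issued. */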
static inline void volk_32f_acos_32f_a_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_store_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for aligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine =
            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_store_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for aligned */

#endif /* INCLUDED_volk_32f_acos_32f_a_H */


#ifndef INCLUDED_volk_32f_acos_32f_u_H
#define INCLUDED_volk_32f_acos_32f_u_H

#if LV_HAVE_AVX2 && LV_HAVE_FMA
#include <immintrin.h>

static inline void volk_32f_acos_32f_u_avx2_fma(float* bVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_fmadd_ps(
                y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void
volk_32f_acos_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int eighthPoints = num_points / 8;
    int i, j;

    __m256 aVal, d, pi, pio2, x, y, z, arccosine;
    __m256 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm256_set1_ps(3.14159265358979323846);
    pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm256_setzero_ps();
    fones = _mm256_set1_ps(1.0);
    ftwos = _mm256_set1_ps(2.0);
    ffours = _mm256_set1_ps(4.0);

    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm256_div_ps(_mm256_sqrt_ps(_mm256_mul_ps(_mm256_add_ps(fones, aVal),
                                                          _mm256_sub_ps(fones, aVal))),
                             aVal);
        z = aVal;
        condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
        z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
        condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
        x = _mm256_add_ps(
            z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm256_add_ps(x,
                              _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
        x = _mm256_div_ps(fones, x);
        y = fzeroes;
        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
                              _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
        condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);

        y = _mm256_add_ps(
            y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_sub_ps(
            arccosine, _mm256_and_ps(_mm256_mul_ps(arccosine, ftwos), condition));
        condition = _mm256_cmp_ps(d, fzeroes, _CMP_LT_OS);
        arccosine = _mm256_add_ps(arccosine, _mm256_and_ps(pi, condition));

        _mm256_storeu_ps(bPtr, arccosine);
        aPtr += 8;
        bPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_AVX for unaligned */

#ifdef LV_HAVE_SSE4_1
#include <smmintrin.h>

static inline void
volk_32f_acos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;

    unsigned int number = 0;
    unsigned int quarterPoints = num_points / 4;
    int i, j;

    __m128 aVal, d, pi, pio2, x, y, z, arccosine;
    __m128 fzeroes, fones, ftwos, ffours, condition;

    pi = _mm_set1_ps(3.14159265358979323846);
    pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
    fzeroes = _mm_setzero_ps();
    fones = _mm_set1_ps(1.0);
    ftwos = _mm_set1_ps(2.0);
    ffours = _mm_set1_ps(4.0);

    for (; number < quarterPoints; number++) {
        aVal = _mm_loadu_ps(aPtr);
        d = aVal;
        aVal = _mm_div_ps(
            _mm_sqrt_ps(_mm_mul_ps(_mm_add_ps(fones, aVal), _mm_sub_ps(fones, aVal))),
            aVal);
        z = aVal;
        condition = _mm_cmplt_ps(z, fzeroes);
        z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
        condition = _mm_cmplt_ps(z, fones);
        x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));

        for (i = 0; i < 2; i++)
            x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
        x = _mm_div_ps(fones, x);
        y = fzeroes;

        for (j = ACOS_TERMS - 1; j >= 0; j--)
            y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
                           _mm_set1_ps(pow(-1, j) / (2 * j + 1)));

        y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
        condition = _mm_cmpgt_ps(z, fones);

        y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
        arccosine = y;
        condition = _mm_cmplt_ps(aVal, fzeroes);
        arccosine =
            _mm_sub_ps(arccosine, _mm_and_ps(_mm_mul_ps(arccosine, ftwos), condition));
        condition = _mm_cmplt_ps(d, fzeroes);
        arccosine = _mm_add_ps(arccosine, _mm_and_ps(pi, condition));

        _mm_storeu_ps(bPtr, arccosine);
        aPtr += 4;
        bPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}

#endif /* LV_HAVE_SSE4_1 for unaligned */

#ifdef LV_HAVE_GENERIC

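/* Scalar fallback: one libm acosf() call per element; this is the portable reference
 * implementation. */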
static inline void
volk_32f_acos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
{
    float* bPtr = bVector;
    const float* aPtr = aVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *bPtr++ = acosf(*aPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */

#endif /* INCLUDED_volk_32f_acos_32f_u_H */