Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_atan_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
70#include <inttypes.h>
71#include <math.h>
72#include <stdio.h>
73
74/* This is the number of terms of Taylor series to evaluate, increase this for more
75 * accuracy*/
76#define TERMS 2
77
78#ifndef INCLUDED_volk_32f_atan_32f_a_H
79#define INCLUDED_volk_32f_atan_32f_a_H
80
81#if LV_HAVE_AVX2 && LV_HAVE_FMA
82#include <immintrin.h>
83
84static inline void volk_32f_atan_32f_a_avx2_fma(float* bVector,
85 const float* aVector,
86 unsigned int num_points)
87{
88 float* bPtr = bVector;
89 const float* aPtr = aVector;
90
91 unsigned int number = 0;
92 unsigned int eighthPoints = num_points / 8;
93 int i, j;
94
95 __m256 aVal, pio2, x, y, z, arctangent;
96 __m256 fzeroes, fones, ftwos, ffours, condition;
97
98 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
99 fzeroes = _mm256_setzero_ps();
100 fones = _mm256_set1_ps(1.0);
101 ftwos = _mm256_set1_ps(2.0);
102 ffours = _mm256_set1_ps(4.0);
103
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
106 z = aVal;
107 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
108 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
109 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
110 x = _mm256_add_ps(
111 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
112
113 for (i = 0; i < 2; i++) {
114 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
115 }
116 x = _mm256_div_ps(fones, x);
117 y = fzeroes;
118 for (j = TERMS - 1; j >= 0; j--) {
119 y = _mm256_fmadd_ps(
120 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
121 }
122
123 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
124 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
125
126 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
127 arctangent = y;
128 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
129 arctangent = _mm256_sub_ps(
130 arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
131
132 _mm256_store_ps(bPtr, arctangent);
133 aPtr += 8;
134 bPtr += 8;
135 }
136
137 number = eighthPoints * 8;
138 for (; number < num_points; number++) {
139 *bPtr++ = atan(*aPtr++);
140 }
141}
142
143#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
144
145
146#ifdef LV_HAVE_AVX
147#include <immintrin.h>
148
149static inline void
150volk_32f_atan_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
151{
152 float* bPtr = bVector;
153 const float* aPtr = aVector;
154
155 unsigned int number = 0;
156 unsigned int eighthPoints = num_points / 8;
157 int i, j;
158
159 __m256 aVal, pio2, x, y, z, arctangent;
160 __m256 fzeroes, fones, ftwos, ffours, condition;
161
162 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
163 fzeroes = _mm256_setzero_ps();
164 fones = _mm256_set1_ps(1.0);
165 ftwos = _mm256_set1_ps(2.0);
166 ffours = _mm256_set1_ps(4.0);
167
168 for (; number < eighthPoints; number++) {
169 aVal = _mm256_load_ps(aPtr);
170 z = aVal;
171 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
172 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
173 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
174 x = _mm256_add_ps(
175 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
176
177 for (i = 0; i < 2; i++) {
178 x = _mm256_add_ps(x,
179 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
180 }
181 x = _mm256_div_ps(fones, x);
182 y = fzeroes;
183 for (j = TERMS - 1; j >= 0; j--) {
184 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
185 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
186 }
187
188 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
189 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
190
191 y = _mm256_add_ps(
192 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
193 arctangent = y;
194 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
195 arctangent = _mm256_sub_ps(
196 arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
197
198 _mm256_store_ps(bPtr, arctangent);
199 aPtr += 8;
200 bPtr += 8;
201 }
202
203 number = eighthPoints * 8;
204 for (; number < num_points; number++) {
205 *bPtr++ = atan(*aPtr++);
206 }
207}
208
209#endif /* LV_HAVE_AVX for aligned */
210
211#ifdef LV_HAVE_SSE4_1
212#include <smmintrin.h>
213
214static inline void
215volk_32f_atan_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
216{
217 float* bPtr = bVector;
218 const float* aPtr = aVector;
219
220 unsigned int number = 0;
221 unsigned int quarterPoints = num_points / 4;
222 int i, j;
223
224 __m128 aVal, pio2, x, y, z, arctangent;
225 __m128 fzeroes, fones, ftwos, ffours, condition;
226
227 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
228 fzeroes = _mm_setzero_ps();
229 fones = _mm_set1_ps(1.0);
230 ftwos = _mm_set1_ps(2.0);
231 ffours = _mm_set1_ps(4.0);
232
233 for (; number < quarterPoints; number++) {
234 aVal = _mm_load_ps(aPtr);
235 z = aVal;
236 condition = _mm_cmplt_ps(z, fzeroes);
237 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
238 condition = _mm_cmplt_ps(z, fones);
239 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
240
241 for (i = 0; i < 2; i++) {
242 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
243 }
244 x = _mm_div_ps(fones, x);
245 y = fzeroes;
246 for (j = TERMS - 1; j >= 0; j--) {
247 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
248 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
249 }
250
251 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
252 condition = _mm_cmpgt_ps(z, fones);
253
254 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
255 arctangent = y;
256 condition = _mm_cmplt_ps(aVal, fzeroes);
257 arctangent =
258 _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
259
260 _mm_store_ps(bPtr, arctangent);
261 aPtr += 4;
262 bPtr += 4;
263 }
264
265 number = quarterPoints * 4;
266 for (; number < num_points; number++) {
267 *bPtr++ = atanf(*aPtr++);
268 }
269}
270
271#endif /* LV_HAVE_SSE4_1 for aligned */
272
273#endif /* INCLUDED_volk_32f_atan_32f_a_H */
274
275#ifndef INCLUDED_volk_32f_atan_32f_u_H
276#define INCLUDED_volk_32f_atan_32f_u_H
277
278#if LV_HAVE_AVX2 && LV_HAVE_FMA
279#include <immintrin.h>
280
281static inline void volk_32f_atan_32f_u_avx2_fma(float* bVector,
282 const float* aVector,
283 unsigned int num_points)
284{
285 float* bPtr = bVector;
286 const float* aPtr = aVector;
287
288 unsigned int number = 0;
289 unsigned int eighthPoints = num_points / 8;
290 int i, j;
291
292 __m256 aVal, pio2, x, y, z, arctangent;
293 __m256 fzeroes, fones, ftwos, ffours, condition;
294
295 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
296 fzeroes = _mm256_setzero_ps();
297 fones = _mm256_set1_ps(1.0);
298 ftwos = _mm256_set1_ps(2.0);
299 ffours = _mm256_set1_ps(4.0);
300
301 for (; number < eighthPoints; number++) {
302 aVal = _mm256_loadu_ps(aPtr);
303 z = aVal;
304 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
305 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
306 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
307 x = _mm256_add_ps(
308 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
309
310 for (i = 0; i < 2; i++) {
311 x = _mm256_add_ps(x, _mm256_sqrt_ps(_mm256_fmadd_ps(x, x, fones)));
312 }
313 x = _mm256_div_ps(fones, x);
314 y = fzeroes;
315 for (j = TERMS - 1; j >= 0; j--) {
316 y = _mm256_fmadd_ps(
317 y, _mm256_mul_ps(x, x), _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
318 }
319
320 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
321 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
322
323 y = _mm256_add_ps(y, _mm256_and_ps(_mm256_fnmadd_ps(y, ftwos, pio2), condition));
324 arctangent = y;
325 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
326 arctangent = _mm256_sub_ps(
327 arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
328
329 _mm256_storeu_ps(bPtr, arctangent);
330 aPtr += 8;
331 bPtr += 8;
332 }
333
334 number = eighthPoints * 8;
335 for (; number < num_points; number++) {
336 *bPtr++ = atan(*aPtr++);
337 }
338}
339
340#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
341
342
343#ifdef LV_HAVE_AVX
344#include <immintrin.h>
345
346static inline void
347volk_32f_atan_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
348{
349 float* bPtr = bVector;
350 const float* aPtr = aVector;
351
352 unsigned int number = 0;
353 unsigned int eighthPoints = num_points / 8;
354 int i, j;
355
356 __m256 aVal, pio2, x, y, z, arctangent;
357 __m256 fzeroes, fones, ftwos, ffours, condition;
358
359 pio2 = _mm256_set1_ps(3.14159265358979323846 / 2);
360 fzeroes = _mm256_setzero_ps();
361 fones = _mm256_set1_ps(1.0);
362 ftwos = _mm256_set1_ps(2.0);
363 ffours = _mm256_set1_ps(4.0);
364
365 for (; number < eighthPoints; number++) {
366 aVal = _mm256_loadu_ps(aPtr);
367 z = aVal;
368 condition = _mm256_cmp_ps(z, fzeroes, _CMP_LT_OS);
369 z = _mm256_sub_ps(z, _mm256_and_ps(_mm256_mul_ps(z, ftwos), condition));
370 condition = _mm256_cmp_ps(z, fones, _CMP_LT_OS);
371 x = _mm256_add_ps(
372 z, _mm256_and_ps(_mm256_sub_ps(_mm256_div_ps(fones, z), z), condition));
373
374 for (i = 0; i < 2; i++) {
375 x = _mm256_add_ps(x,
376 _mm256_sqrt_ps(_mm256_add_ps(fones, _mm256_mul_ps(x, x))));
377 }
378 x = _mm256_div_ps(fones, x);
379 y = fzeroes;
380 for (j = TERMS - 1; j >= 0; j--) {
381 y = _mm256_add_ps(_mm256_mul_ps(y, _mm256_mul_ps(x, x)),
382 _mm256_set1_ps(pow(-1, j) / (2 * j + 1)));
383 }
384
385 y = _mm256_mul_ps(y, _mm256_mul_ps(x, ffours));
386 condition = _mm256_cmp_ps(z, fones, _CMP_GT_OS);
387
388 y = _mm256_add_ps(
389 y, _mm256_and_ps(_mm256_sub_ps(pio2, _mm256_mul_ps(y, ftwos)), condition));
390 arctangent = y;
391 condition = _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS);
392 arctangent = _mm256_sub_ps(
393 arctangent, _mm256_and_ps(_mm256_mul_ps(arctangent, ftwos), condition));
394
395 _mm256_storeu_ps(bPtr, arctangent);
396 aPtr += 8;
397 bPtr += 8;
398 }
399
400 number = eighthPoints * 8;
401 for (; number < num_points; number++) {
402 *bPtr++ = atan(*aPtr++);
403 }
404}
405
406#endif /* LV_HAVE_AVX for unaligned */
407
408#ifdef LV_HAVE_SSE4_1
409#include <smmintrin.h>
410
411static inline void
412volk_32f_atan_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
413{
414 float* bPtr = bVector;
415 const float* aPtr = aVector;
416
417 unsigned int number = 0;
418 unsigned int quarterPoints = num_points / 4;
419 int i, j;
420
421 __m128 aVal, pio2, x, y, z, arctangent;
422 __m128 fzeroes, fones, ftwos, ffours, condition;
423
424 pio2 = _mm_set1_ps(3.14159265358979323846 / 2);
425 fzeroes = _mm_setzero_ps();
426 fones = _mm_set1_ps(1.0);
427 ftwos = _mm_set1_ps(2.0);
428 ffours = _mm_set1_ps(4.0);
429
430 for (; number < quarterPoints; number++) {
431 aVal = _mm_loadu_ps(aPtr);
432 z = aVal;
433 condition = _mm_cmplt_ps(z, fzeroes);
434 z = _mm_sub_ps(z, _mm_and_ps(_mm_mul_ps(z, ftwos), condition));
435 condition = _mm_cmplt_ps(z, fones);
436 x = _mm_add_ps(z, _mm_and_ps(_mm_sub_ps(_mm_div_ps(fones, z), z), condition));
437
438 for (i = 0; i < 2; i++)
439 x = _mm_add_ps(x, _mm_sqrt_ps(_mm_add_ps(fones, _mm_mul_ps(x, x))));
440 x = _mm_div_ps(fones, x);
441 y = fzeroes;
442 for (j = TERMS - 1; j >= 0; j--)
443 y = _mm_add_ps(_mm_mul_ps(y, _mm_mul_ps(x, x)),
444 _mm_set1_ps(pow(-1, j) / (2 * j + 1)));
445
446 y = _mm_mul_ps(y, _mm_mul_ps(x, ffours));
447 condition = _mm_cmpgt_ps(z, fones);
448
449 y = _mm_add_ps(y, _mm_and_ps(_mm_sub_ps(pio2, _mm_mul_ps(y, ftwos)), condition));
450 arctangent = y;
451 condition = _mm_cmplt_ps(aVal, fzeroes);
452 arctangent =
453 _mm_sub_ps(arctangent, _mm_and_ps(_mm_mul_ps(arctangent, ftwos), condition));
454
455 _mm_storeu_ps(bPtr, arctangent);
456 aPtr += 4;
457 bPtr += 4;
458 }
459
460 number = quarterPoints * 4;
461 for (; number < num_points; number++) {
462 *bPtr++ = atanf(*aPtr++);
463 }
464}
465
466#endif /* LV_HAVE_SSE4_1 for unaligned */
467
468#ifdef LV_HAVE_GENERIC
469
470static inline void
471volk_32f_atan_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
472{
473 float* bPtr = bVector;
474 const float* aPtr = aVector;
475 unsigned int number = 0;
476
477 for (number = 0; number < num_points; number++) {
478 *bPtr++ = atanf(*aPtr++);
479 }
480}
481#endif /* LV_HAVE_GENERIC */
482
483#endif /* INCLUDED_volk_32f_atan_32f_u_H */
static void volk_32f_atan_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:347
static void volk_32f_atan_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:471
#define TERMS
Definition: volk_32f_atan_32f.h:76
static void volk_32f_atan_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_atan_32f.h:150
for i
Definition: volk_config_fixed.tmpl.h:25