Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_cos_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
72#include <inttypes.h>
73#include <math.h>
74#include <stdio.h>
75
76#ifndef INCLUDED_volk_32f_cos_32f_a_H
77#define INCLUDED_volk_32f_cos_32f_a_H
78
79#ifdef LV_HAVE_AVX512F
80
81#include <immintrin.h>
82static inline void volk_32f_cos_32f_a_avx512f(float* cosVector,
83 const float* inVector,
84 unsigned int num_points)
85{
86 float* cosPtr = cosVector;
87 const float* inPtr = inVector;
88
89 unsigned int number = 0;
90 unsigned int sixteenPoints = num_points / 16;
91 unsigned int i = 0;
92
93 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
94 fones, sine, cosine;
95 __m512i q, zeros, ones, twos, fours;
96
97 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101 ffours = _mm512_set1_ps(4.0);
102 ftwos = _mm512_set1_ps(2.0);
103 fones = _mm512_set1_ps(1.0);
104 zeros = _mm512_setzero_epi32();
105 ones = _mm512_set1_epi32(1);
106 twos = _mm512_set1_epi32(2);
107 fours = _mm512_set1_epi32(4);
108
109 cp1 = _mm512_set1_ps(1.0);
110 cp2 = _mm512_set1_ps(0.08333333333333333);
111 cp3 = _mm512_set1_ps(0.002777777777777778);
112 cp4 = _mm512_set1_ps(4.96031746031746e-05);
113 cp5 = _mm512_set1_ps(5.511463844797178e-07);
114 __mmask16 condition1, condition2;
115
116 for (; number < sixteenPoints; number++) {
117 aVal = _mm512_load_ps(inPtr);
118 // s = fabs(aVal)
119 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
120
121 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
122 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
123 // r = q + q&1, q indicates quadrant, r gives
124 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
125
126 s = _mm512_fnmadd_ps(r, pio4A, s);
127 s = _mm512_fnmadd_ps(r, pio4B, s);
128 s = _mm512_fnmadd_ps(r, pio4C, s);
129
130 s = _mm512_div_ps(
131 s,
132 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
133 s = _mm512_mul_ps(s, s);
134 // Evaluate Taylor series
135 s = _mm512_mul_ps(
136 _mm512_fmadd_ps(
137 _mm512_fmsub_ps(
138 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
139 s,
140 cp1),
141 s);
142
143 for (i = 0; i < 3; i++)
144 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145 s = _mm512_div_ps(s, ftwos);
146
147 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148 cosine = _mm512_sub_ps(fones, s);
149
150 // if(((q+1)&2) != 0) { cosine=sine;}
151 condition1 = _mm512_cmpneq_epi32_mask(
152 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
153
154 // if(((q+2)&4) != 0) { cosine = -cosine;}
155 condition2 = _mm512_cmpneq_epi32_mask(
156 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
157 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
158 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
159 _mm512_store_ps(cosPtr, cosine);
160 inPtr += 16;
161 cosPtr += 16;
162 }
163
164 number = sixteenPoints * 16;
165 for (; number < num_points; number++) {
166 *cosPtr++ = cosf(*inPtr++);
167 }
168}
169#endif
170
171#if LV_HAVE_AVX2 && LV_HAVE_FMA
172#include <immintrin.h>
173
174static inline void
175volk_32f_cos_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
176{
177 float* bPtr = bVector;
178 const float* aPtr = aVector;
179
180 unsigned int number = 0;
181 unsigned int eighthPoints = num_points / 8;
182 unsigned int i = 0;
183
184 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
185 fones, fzeroes;
186 __m256 sine, cosine;
187 __m256i q, ones, twos, fours;
188
189 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
190 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
191 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
192 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
193 ffours = _mm256_set1_ps(4.0);
194 ftwos = _mm256_set1_ps(2.0);
195 fones = _mm256_set1_ps(1.0);
196 fzeroes = _mm256_setzero_ps();
197 __m256i zeroes = _mm256_set1_epi32(0);
198 ones = _mm256_set1_epi32(1);
199 __m256i allones = _mm256_set1_epi32(0xffffffff);
200 twos = _mm256_set1_epi32(2);
201 fours = _mm256_set1_epi32(4);
202
203 cp1 = _mm256_set1_ps(1.0);
204 cp2 = _mm256_set1_ps(0.08333333333333333);
205 cp3 = _mm256_set1_ps(0.002777777777777778);
206 cp4 = _mm256_set1_ps(4.96031746031746e-05);
207 cp5 = _mm256_set1_ps(5.511463844797178e-07);
208 union bit256 condition1;
209 union bit256 condition3;
210
211 for (; number < eighthPoints; number++) {
212
213 aVal = _mm256_load_ps(aPtr);
214 // s = fabs(aVal)
215 s = _mm256_sub_ps(aVal,
216 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
217 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
218 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
219 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
220 // r = q + q&1, q indicates quadrant, r gives
221 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
222
223 s = _mm256_fnmadd_ps(r, pio4A, s);
224 s = _mm256_fnmadd_ps(r, pio4B, s);
225 s = _mm256_fnmadd_ps(r, pio4C, s);
226
227 s = _mm256_div_ps(
228 s,
229 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
230 s = _mm256_mul_ps(s, s);
231 // Evaluate Taylor series
232 s = _mm256_mul_ps(
233 _mm256_fmadd_ps(
234 _mm256_fmsub_ps(
235 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
236 s,
237 cp1),
238 s);
239
240 for (i = 0; i < 3; i++)
241 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
242 s = _mm256_div_ps(s, ftwos);
243
244 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
245 cosine = _mm256_sub_ps(fones, s);
246
247 // if(((q+1)&2) != 0) { cosine=sine;}
248 condition1.int_vec =
249 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
250 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
251
252 // if(((q+2)&4) != 0) { cosine = -cosine;}
253 condition3.int_vec = _mm256_cmpeq_epi32(
254 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
255 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
256
257 cosine = _mm256_add_ps(
258 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
259 cosine = _mm256_sub_ps(cosine,
260 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
261 condition3.float_vec));
262 _mm256_store_ps(bPtr, cosine);
263 aPtr += 8;
264 bPtr += 8;
265 }
266
267 number = eighthPoints * 8;
268 for (; number < num_points; number++) {
269 *bPtr++ = cos(*aPtr++);
270 }
271}
272
273#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
274
275#ifdef LV_HAVE_AVX2
276#include <immintrin.h>
277
278static inline void
279volk_32f_cos_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
280{
281 float* bPtr = bVector;
282 const float* aPtr = aVector;
283
284 unsigned int number = 0;
285 unsigned int eighthPoints = num_points / 8;
286 unsigned int i = 0;
287
288 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
289 fones, fzeroes;
290 __m256 sine, cosine;
291 __m256i q, ones, twos, fours;
292
293 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
294 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
295 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
296 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
297 ffours = _mm256_set1_ps(4.0);
298 ftwos = _mm256_set1_ps(2.0);
299 fones = _mm256_set1_ps(1.0);
300 fzeroes = _mm256_setzero_ps();
301 __m256i zeroes = _mm256_set1_epi32(0);
302 ones = _mm256_set1_epi32(1);
303 __m256i allones = _mm256_set1_epi32(0xffffffff);
304 twos = _mm256_set1_epi32(2);
305 fours = _mm256_set1_epi32(4);
306
307 cp1 = _mm256_set1_ps(1.0);
308 cp2 = _mm256_set1_ps(0.08333333333333333);
309 cp3 = _mm256_set1_ps(0.002777777777777778);
310 cp4 = _mm256_set1_ps(4.96031746031746e-05);
311 cp5 = _mm256_set1_ps(5.511463844797178e-07);
312 union bit256 condition1;
313 union bit256 condition3;
314
315 for (; number < eighthPoints; number++) {
316
317 aVal = _mm256_load_ps(aPtr);
318 // s = fabs(aVal)
319 s = _mm256_sub_ps(aVal,
320 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
321 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
322 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
323 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
324 // r = q + q&1, q indicates quadrant, r gives
325 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
326
327 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
328 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
329 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
330
331 s = _mm256_div_ps(
332 s,
333 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
334 s = _mm256_mul_ps(s, s);
335 // Evaluate Taylor series
336 s = _mm256_mul_ps(
337 _mm256_add_ps(
338 _mm256_mul_ps(
339 _mm256_sub_ps(
340 _mm256_mul_ps(
341 _mm256_add_ps(
342 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
343 s),
344 cp3),
345 s),
346 cp2),
347 s),
348 cp1),
349 s);
350
351 for (i = 0; i < 3; i++)
352 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
353 s = _mm256_div_ps(s, ftwos);
354
355 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
356 cosine = _mm256_sub_ps(fones, s);
357
358 // if(((q+1)&2) != 0) { cosine=sine;}
359 condition1.int_vec =
360 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
361 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
362
363 // if(((q+2)&4) != 0) { cosine = -cosine;}
364 condition3.int_vec = _mm256_cmpeq_epi32(
365 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
366 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
367
368 cosine = _mm256_add_ps(
369 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
370 cosine = _mm256_sub_ps(cosine,
371 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
372 condition3.float_vec));
373 _mm256_store_ps(bPtr, cosine);
374 aPtr += 8;
375 bPtr += 8;
376 }
377
378 number = eighthPoints * 8;
379 for (; number < num_points; number++) {
380 *bPtr++ = cos(*aPtr++);
381 }
382}
383
384#endif /* LV_HAVE_AVX2 for aligned */
385
386#ifdef LV_HAVE_SSE4_1
387#include <smmintrin.h>
388
389static inline void
390volk_32f_cos_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
391{
392 float* bPtr = bVector;
393 const float* aPtr = aVector;
394
395 unsigned int number = 0;
396 unsigned int quarterPoints = num_points / 4;
397 unsigned int i = 0;
398
399 __m128 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
400 fones, fzeroes;
401 __m128 sine, cosine;
402 __m128i q, ones, twos, fours;
403
404 m4pi = _mm_set1_ps(1.273239544735162542821171882678754627704620361328125);
405 pio4A = _mm_set1_ps(0.7853981554508209228515625);
406 pio4B = _mm_set1_ps(0.794662735614792836713604629039764404296875e-8);
407 pio4C = _mm_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
408 ffours = _mm_set1_ps(4.0);
409 ftwos = _mm_set1_ps(2.0);
410 fones = _mm_set1_ps(1.0);
411 fzeroes = _mm_setzero_ps();
412 __m128i zeroes = _mm_set1_epi32(0);
413 ones = _mm_set1_epi32(1);
414 __m128i allones = _mm_set1_epi32(0xffffffff);
415 twos = _mm_set1_epi32(2);
416 fours = _mm_set1_epi32(4);
417
418 cp1 = _mm_set1_ps(1.0);
419 cp2 = _mm_set1_ps(0.08333333333333333);
420 cp3 = _mm_set1_ps(0.002777777777777778);
421 cp4 = _mm_set1_ps(4.96031746031746e-05);
422 cp5 = _mm_set1_ps(5.511463844797178e-07);
423 union bit128 condition1;
424 union bit128 condition3;
425
426 for (; number < quarterPoints; number++) {
427
428 aVal = _mm_load_ps(aPtr);
429 // s = fabs(aVal)
430 s = _mm_sub_ps(aVal,
431 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
432 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
433 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
434 // r = q + q&1, q indicates quadrant, r gives
435 r = _mm_cvtepi32_ps(_mm_add_epi32(q, _mm_and_si128(q, ones)));
436
437 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4A));
438 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4B));
439 s = _mm_sub_ps(s, _mm_mul_ps(r, pio4C));
440
441 s = _mm_div_ps(
442 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
443 s = _mm_mul_ps(s, s);
444 // Evaluate Taylor series
445 s = _mm_mul_ps(
446 _mm_add_ps(
447 _mm_mul_ps(
448 _mm_sub_ps(
449 _mm_mul_ps(
450 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
451 cp3),
452 s),
453 cp2),
454 s),
455 cp1),
456 s);
457
458 for (i = 0; i < 3; i++)
459 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
460 s = _mm_div_ps(s, ftwos);
461
462 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
463 cosine = _mm_sub_ps(fones, s);
464
465 // if(((q+1)&2) != 0) { cosine=sine;}
466 condition1.int_vec =
467 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, ones), twos), zeroes);
468 condition1.int_vec = _mm_xor_si128(allones, condition1.int_vec);
469
470 // if(((q+2)&4) != 0) { cosine = -cosine;}
471 condition3.int_vec =
472 _mm_cmpeq_epi32(_mm_and_si128(_mm_add_epi32(q, twos), fours), zeroes);
473 condition3.int_vec = _mm_xor_si128(allones, condition3.int_vec);
474
475 cosine = _mm_add_ps(cosine,
476 _mm_and_ps(_mm_sub_ps(sine, cosine), condition1.float_vec));
477 cosine = _mm_sub_ps(
478 cosine,
479 _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3.float_vec));
480 _mm_store_ps(bPtr, cosine);
481 aPtr += 4;
482 bPtr += 4;
483 }
484
485 number = quarterPoints * 4;
486 for (; number < num_points; number++) {
487 *bPtr++ = cosf(*aPtr++);
488 }
489}
490
491#endif /* LV_HAVE_SSE4_1 for aligned */
492
493#endif /* INCLUDED_volk_32f_cos_32f_a_H */
494
495
496#ifndef INCLUDED_volk_32f_cos_32f_u_H
497#define INCLUDED_volk_32f_cos_32f_u_H
498
499#ifdef LV_HAVE_AVX512F
500
501#include <immintrin.h>
502static inline void volk_32f_cos_32f_u_avx512f(float* cosVector,
503 const float* inVector,
504 unsigned int num_points)
505{
506 float* cosPtr = cosVector;
507 const float* inPtr = inVector;
508
509 unsigned int number = 0;
510 unsigned int sixteenPoints = num_points / 16;
511 unsigned int i = 0;
512
513 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
514 fones, sine, cosine;
515 __m512i q, zeros, ones, twos, fours;
516
517 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
518 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
519 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
520 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
521 ffours = _mm512_set1_ps(4.0);
522 ftwos = _mm512_set1_ps(2.0);
523 fones = _mm512_set1_ps(1.0);
524 zeros = _mm512_setzero_epi32();
525 ones = _mm512_set1_epi32(1);
526 twos = _mm512_set1_epi32(2);
527 fours = _mm512_set1_epi32(4);
528
529 cp1 = _mm512_set1_ps(1.0);
530 cp2 = _mm512_set1_ps(0.08333333333333333);
531 cp3 = _mm512_set1_ps(0.002777777777777778);
532 cp4 = _mm512_set1_ps(4.96031746031746e-05);
533 cp5 = _mm512_set1_ps(5.511463844797178e-07);
534 __mmask16 condition1, condition2;
535 for (; number < sixteenPoints; number++) {
536 aVal = _mm512_loadu_ps(inPtr);
537 // s = fabs(aVal)
538 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
539
540 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
541 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
542 // r = q + q&1, q indicates quadrant, r gives
543 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
544
545 s = _mm512_fnmadd_ps(r, pio4A, s);
546 s = _mm512_fnmadd_ps(r, pio4B, s);
547 s = _mm512_fnmadd_ps(r, pio4C, s);
548
549 s = _mm512_div_ps(
550 s,
551 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
552 s = _mm512_mul_ps(s, s);
553 // Evaluate Taylor series
554 s = _mm512_mul_ps(
555 _mm512_fmadd_ps(
556 _mm512_fmsub_ps(
557 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
558 s,
559 cp1),
560 s);
561
562 for (i = 0; i < 3; i++)
563 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
564 s = _mm512_div_ps(s, ftwos);
565
566 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
567 cosine = _mm512_sub_ps(fones, s);
568
569 // if(((q+1)&2) != 0) { cosine=sine;}
570 condition1 = _mm512_cmpneq_epi32_mask(
571 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
572
573 // if(((q+2)&4) != 0) { cosine = -cosine;}
574 condition2 = _mm512_cmpneq_epi32_mask(
575 _mm512_and_si512(_mm512_add_epi32(q, twos), fours), zeros);
576
577 cosine = _mm512_mask_blend_ps(condition1, cosine, sine);
578 cosine = _mm512_mask_mul_ps(cosine, condition2, cosine, _mm512_set1_ps(-1.f));
579 _mm512_storeu_ps(cosPtr, cosine);
580 inPtr += 16;
581 cosPtr += 16;
582 }
583
584 number = sixteenPoints * 16;
585 for (; number < num_points; number++) {
586 *cosPtr++ = cosf(*inPtr++);
587 }
588}
589#endif
590
591#if LV_HAVE_AVX2 && LV_HAVE_FMA
592#include <immintrin.h>
593
594static inline void
595volk_32f_cos_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
596{
597 float* bPtr = bVector;
598 const float* aPtr = aVector;
599
600 unsigned int number = 0;
601 unsigned int eighthPoints = num_points / 8;
602 unsigned int i = 0;
603
604 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
605 fones, fzeroes;
606 __m256 sine, cosine;
607 __m256i q, ones, twos, fours;
608
609 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
610 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
611 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
612 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
613 ffours = _mm256_set1_ps(4.0);
614 ftwos = _mm256_set1_ps(2.0);
615 fones = _mm256_set1_ps(1.0);
616 fzeroes = _mm256_setzero_ps();
617 __m256i zeroes = _mm256_set1_epi32(0);
618 ones = _mm256_set1_epi32(1);
619 __m256i allones = _mm256_set1_epi32(0xffffffff);
620 twos = _mm256_set1_epi32(2);
621 fours = _mm256_set1_epi32(4);
622
623 cp1 = _mm256_set1_ps(1.0);
624 cp2 = _mm256_set1_ps(0.08333333333333333);
625 cp3 = _mm256_set1_ps(0.002777777777777778);
626 cp4 = _mm256_set1_ps(4.96031746031746e-05);
627 cp5 = _mm256_set1_ps(5.511463844797178e-07);
628 union bit256 condition1;
629 union bit256 condition3;
630
631 for (; number < eighthPoints; number++) {
632
633 aVal = _mm256_loadu_ps(aPtr);
634 // s = fabs(aVal)
635 s = _mm256_sub_ps(aVal,
636 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
637 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
638 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
639 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
640 // r = q + q&1, q indicates quadrant, r gives
641 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
642
643 s = _mm256_fnmadd_ps(r, pio4A, s);
644 s = _mm256_fnmadd_ps(r, pio4B, s);
645 s = _mm256_fnmadd_ps(r, pio4C, s);
646
647 s = _mm256_div_ps(
648 s,
649 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
650 s = _mm256_mul_ps(s, s);
651 // Evaluate Taylor series
652 s = _mm256_mul_ps(
653 _mm256_fmadd_ps(
654 _mm256_fmsub_ps(
655 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
656 s,
657 cp1),
658 s);
659
660 for (i = 0; i < 3; i++)
661 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
662 s = _mm256_div_ps(s, ftwos);
663
664 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
665 cosine = _mm256_sub_ps(fones, s);
666
667 // if(((q+1)&2) != 0) { cosine=sine;}
668 condition1.int_vec =
669 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
670 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
671
672 // if(((q+2)&4) != 0) { cosine = -cosine;}
673 condition3.int_vec = _mm256_cmpeq_epi32(
674 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
675 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
676
677 cosine = _mm256_add_ps(
678 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
679 cosine = _mm256_sub_ps(cosine,
680 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
681 condition3.float_vec));
682 _mm256_storeu_ps(bPtr, cosine);
683 aPtr += 8;
684 bPtr += 8;
685 }
686
687 number = eighthPoints * 8;
688 for (; number < num_points; number++) {
689 *bPtr++ = cos(*aPtr++);
690 }
691}
692
693#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
694
695#ifdef LV_HAVE_AVX2
696#include <immintrin.h>
697
698static inline void
699volk_32f_cos_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
700{
701 float* bPtr = bVector;
702 const float* aPtr = aVector;
703
704 unsigned int number = 0;
705 unsigned int eighthPoints = num_points / 8;
706 unsigned int i = 0;
707
708 __m256 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
709 fones, fzeroes;
710 __m256 sine, cosine;
711 __m256i q, ones, twos, fours;
712
713 m4pi = _mm256_set1_ps(1.273239544735162542821171882678754627704620361328125);
714 pio4A = _mm256_set1_ps(0.7853981554508209228515625);
715 pio4B = _mm256_set1_ps(0.794662735614792836713604629039764404296875e-8);
716 pio4C = _mm256_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
717 ffours = _mm256_set1_ps(4.0);
718 ftwos = _mm256_set1_ps(2.0);
719 fones = _mm256_set1_ps(1.0);
720 fzeroes = _mm256_setzero_ps();
721 __m256i zeroes = _mm256_set1_epi32(0);
722 ones = _mm256_set1_epi32(1);
723 __m256i allones = _mm256_set1_epi32(0xffffffff);
724 twos = _mm256_set1_epi32(2);
725 fours = _mm256_set1_epi32(4);
726
727 cp1 = _mm256_set1_ps(1.0);
728 cp2 = _mm256_set1_ps(0.08333333333333333);
729 cp3 = _mm256_set1_ps(0.002777777777777778);
730 cp4 = _mm256_set1_ps(4.96031746031746e-05);
731 cp5 = _mm256_set1_ps(5.511463844797178e-07);
732 union bit256 condition1;
733 union bit256 condition3;
734
735 for (; number < eighthPoints; number++) {
736
737 aVal = _mm256_loadu_ps(aPtr);
738 // s = fabs(aVal)
739 s = _mm256_sub_ps(aVal,
740 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
741 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
742 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
743 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
744 // r = q + q&1, q indicates quadrant, r gives
745 r = _mm256_cvtepi32_ps(_mm256_add_epi32(q, _mm256_and_si256(q, ones)));
746
747 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4A));
748 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4B));
749 s = _mm256_sub_ps(s, _mm256_mul_ps(r, pio4C));
750
751 s = _mm256_div_ps(
752 s,
753 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
754 s = _mm256_mul_ps(s, s);
755 // Evaluate Taylor series
756 s = _mm256_mul_ps(
757 _mm256_add_ps(
758 _mm256_mul_ps(
759 _mm256_sub_ps(
760 _mm256_mul_ps(
761 _mm256_add_ps(
762 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
763 s),
764 cp3),
765 s),
766 cp2),
767 s),
768 cp1),
769 s);
770
771 for (i = 0; i < 3; i++)
772 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
773 s = _mm256_div_ps(s, ftwos);
774
775 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
776 cosine = _mm256_sub_ps(fones, s);
777
778 // if(((q+1)&2) != 0) { cosine=sine;}
779 condition1.int_vec =
780 _mm256_cmpeq_epi32(_mm256_and_si256(_mm256_add_epi32(q, ones), twos), zeroes);
781 condition1.int_vec = _mm256_xor_si256(allones, condition1.int_vec);
782
783 // if(((q+2)&4) != 0) { cosine = -cosine;}
784 condition3.int_vec = _mm256_cmpeq_epi32(
785 _mm256_and_si256(_mm256_add_epi32(q, twos), fours), zeroes);
786 condition3.int_vec = _mm256_xor_si256(allones, condition3.int_vec);
787
788 cosine = _mm256_add_ps(
789 cosine, _mm256_and_ps(_mm256_sub_ps(sine, cosine), condition1.float_vec));
790 cosine = _mm256_sub_ps(cosine,
791 _mm256_and_ps(_mm256_mul_ps(cosine, _mm256_set1_ps(2.0f)),
792 condition3.float_vec));
793 _mm256_storeu_ps(bPtr, cosine);
794 aPtr += 8;
795 bPtr += 8;
796 }
797
798 number = eighthPoints * 8;
799 for (; number < num_points; number++) {
800 *bPtr++ = cos(*aPtr++);
801 }
802}
803
804#endif /* LV_HAVE_AVX2 for unaligned */
805
806#ifdef LV_HAVE_SSE4_1
807#include <smmintrin.h>
808
809static inline void
810volk_32f_cos_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
811{
812 float* bPtr = bVector;
813 const float* aPtr = aVector;
814
815 unsigned int number = 0;
816 unsigned int quarterPoints = num_points / 4;
817 unsigned int i = 0;
818
819 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
820 fzeroes;
821 __m128 sine, cosine, condition1, condition3;
822 __m128i q, r, ones, twos, fours;
823
824 m4pi = _mm_set1_ps(1.273239545);
825 pio4A = _mm_set1_ps(0.78515625);
826 pio4B = _mm_set1_ps(0.241876e-3);
827 ffours = _mm_set1_ps(4.0);
828 ftwos = _mm_set1_ps(2.0);
829 fones = _mm_set1_ps(1.0);
830 fzeroes = _mm_setzero_ps();
831 ones = _mm_set1_epi32(1);
832 twos = _mm_set1_epi32(2);
833 fours = _mm_set1_epi32(4);
834
835 cp1 = _mm_set1_ps(1.0);
836 cp2 = _mm_set1_ps(0.83333333e-1);
837 cp3 = _mm_set1_ps(0.2777778e-2);
838 cp4 = _mm_set1_ps(0.49603e-4);
839 cp5 = _mm_set1_ps(0.551e-6);
840
841 for (; number < quarterPoints; number++) {
842 aVal = _mm_loadu_ps(aPtr);
843 s = _mm_sub_ps(aVal,
844 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
845 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
846 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
847
848 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
849 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
850
851 s = _mm_div_ps(
852 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
853 s = _mm_mul_ps(s, s);
854 // Evaluate Taylor series
855 s = _mm_mul_ps(
856 _mm_add_ps(
857 _mm_mul_ps(
858 _mm_sub_ps(
859 _mm_mul_ps(
860 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
861 cp3),
862 s),
863 cp2),
864 s),
865 cp1),
866 s);
867
868 for (i = 0; i < 3; i++) {
869 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
870 }
871 s = _mm_div_ps(s, ftwos);
872
873 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
874 cosine = _mm_sub_ps(fones, s);
875
876 condition1 = _mm_cmpneq_ps(
877 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
878
879 condition3 = _mm_cmpneq_ps(
880 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, twos), fours)), fzeroes);
881
882 cosine = _mm_add_ps(cosine, _mm_and_ps(_mm_sub_ps(sine, cosine), condition1));
883 cosine = _mm_sub_ps(
884 cosine, _mm_and_ps(_mm_mul_ps(cosine, _mm_set1_ps(2.0f)), condition3));
885 _mm_storeu_ps(bPtr, cosine);
886 aPtr += 4;
887 bPtr += 4;
888 }
889
890 number = quarterPoints * 4;
891 for (; number < num_points; number++) {
892 *bPtr++ = cosf(*aPtr++);
893 }
894}
895
896#endif /* LV_HAVE_SSE4_1 for unaligned */
897
898
899#ifdef LV_HAVE_GENERIC
900
901/*
902 * For derivation see
903 * Shibata, Naoki, "Efficient evaluation methods of elementary functions
904 * suitable for SIMD computation," in Springer-Verlag 2010
905 */
906static inline void volk_32f_cos_32f_generic_fast(float* bVector,
907 const float* aVector,
908 unsigned int num_points)
909{
910 float* bPtr = bVector;
911 const float* aPtr = aVector;
912
913 float m4pi = 1.273239544735162542821171882678754627704620361328125;
914 float pio4A = 0.7853981554508209228515625;
915 float pio4B = 0.794662735614792836713604629039764404296875e-8;
916 float pio4C = 0.306161699786838294306516483068750264552437361480769e-16;
917 int N = 3; // order of argument reduction
918
919 unsigned int number;
920 for (number = 0; number < num_points; number++) {
921 float s = fabs(*aPtr);
922 int q = (int)(s * m4pi);
923 int r = q + (q & 1);
924 s -= r * pio4A;
925 s -= r * pio4B;
926 s -= r * pio4C;
927
928 s = s * 0.125; // 2^-N (<--3)
929 s = s * s;
930 s = ((((s / 1814400. - 1.0 / 20160.0) * s + 1.0 / 360.0) * s - 1.0 / 12.0) * s +
931 1.0) *
932 s;
933
934 int i;
935 for (i = 0; i < N; ++i) {
936 s = (4.0 - s) * s;
937 }
938 s = s / 2.0;
939
940 float sine = sqrt((2.0 - s) * s);
941 float cosine = 1 - s;
942
943 if (((q + 1) & 2) != 0) {
944 s = cosine;
945 cosine = sine;
946 sine = s;
947 }
948 if (((q + 2) & 4) != 0) {
949 cosine = -cosine;
950 }
951 *bPtr = cosine;
952 bPtr++;
953 aPtr++;
954 }
955}
956
957#endif /* LV_HAVE_GENERIC */
958
959
960#ifdef LV_HAVE_GENERIC
961
962static inline void
963volk_32f_cos_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
964{
965 float* bPtr = bVector;
966 const float* aPtr = aVector;
967 unsigned int number = 0;
968
969 for (; number < num_points; number++) {
970 *bPtr++ = cosf(*aPtr++);
971 }
972}
973
974#endif /* LV_HAVE_GENERIC */
975
976
977#ifdef LV_HAVE_NEON
978#include <arm_neon.h>
980
981static inline void
982volk_32f_cos_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
983{
984 unsigned int number = 0;
985 unsigned int quarter_points = num_points / 4;
986 float* bVectorPtr = bVector;
987 const float* aVectorPtr = aVector;
988
989 float32x4_t b_vec;
990 float32x4_t a_vec;
991
992 for (number = 0; number < quarter_points; number++) {
993 a_vec = vld1q_f32(aVectorPtr);
994 // Prefetch next one, speeds things up
995 __VOLK_PREFETCH(aVectorPtr + 4);
996 b_vec = _vcosq_f32(a_vec);
997 vst1q_f32(bVectorPtr, b_vec);
998 // move pointers ahead
999 bVectorPtr += 4;
1000 aVectorPtr += 4;
1001 }
1002
1003 // Deal with the rest
1004 for (number = quarter_points * 4; number < num_points; number++) {
1005 *bVectorPtr++ = cosf(*aVectorPtr++);
1006 }
1007}
1008
1009#endif /* LV_HAVE_NEON */
1010
1011
1012#endif /* INCLUDED_volk_32f_cos_32f_u_H */
Definition: volk_common.h:111
Definition: volk_common.h:128
static void volk_32f_cos_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:963
static void volk_32f_cos_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:982
static void volk_32f_cos_32f_generic_fast(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_cos_32f.h:906
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vcosq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:268