Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_sin_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
72#include <inttypes.h>
73#include <math.h>
74#include <stdio.h>
75
76#ifndef INCLUDED_volk_32f_sin_32f_a_H
77#define INCLUDED_volk_32f_sin_32f_a_H
78#ifdef LV_HAVE_AVX512F
79
80#include <immintrin.h>
81static inline void volk_32f_sin_32f_a_avx512f(float* sinVector,
82 const float* inVector,
83 unsigned int num_points)
84{
85 float* sinPtr = sinVector;
86 const float* inPtr = inVector;
87
88 unsigned int number = 0;
89 unsigned int sixteenPoints = num_points / 16;
90 unsigned int i = 0;
91
92 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
93 fones;
94 __m512 sine, cosine;
95 __m512i q, zeros, ones, twos, fours;
96
97 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
98 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
99 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
100 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
101 ffours = _mm512_set1_ps(4.0);
102 ftwos = _mm512_set1_ps(2.0);
103 fones = _mm512_set1_ps(1.0);
104 zeros = _mm512_setzero_epi32();
105 ones = _mm512_set1_epi32(1);
106 twos = _mm512_set1_epi32(2);
107 fours = _mm512_set1_epi32(4);
108
109 cp1 = _mm512_set1_ps(1.0);
110 cp2 = _mm512_set1_ps(0.08333333333333333);
111 cp3 = _mm512_set1_ps(0.002777777777777778);
112 cp4 = _mm512_set1_ps(4.96031746031746e-05);
113 cp5 = _mm512_set1_ps(5.511463844797178e-07);
114 __mmask16 condition1, condition2, ltZero;
115
116 for (; number < sixteenPoints; number++) {
117 aVal = _mm512_load_ps(inPtr);
118 // s = fabs(aVal)
119 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
120
121 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
122 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
123 // r = q + q&1, q indicates quadrant, r gives
124 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
125
126 s = _mm512_fnmadd_ps(r, pio4A, s);
127 s = _mm512_fnmadd_ps(r, pio4B, s);
128 s = _mm512_fnmadd_ps(r, pio4C, s);
129
130 s = _mm512_div_ps(
131 s,
132 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
133 s = _mm512_mul_ps(s, s);
134 // Evaluate Taylor series
135 s = _mm512_mul_ps(
136 _mm512_fmadd_ps(
137 _mm512_fmsub_ps(
138 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
139 s,
140 cp1),
141 s);
142
143 for (i = 0; i < 3; i++)
144 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
145 s = _mm512_div_ps(s, ftwos);
146
147 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
148 cosine = _mm512_sub_ps(fones, s);
149
150 condition1 = _mm512_cmpneq_epi32_mask(
151 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
152 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
153 condition2 = _mm512_kxor(
154 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
155
156 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
157 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
158 _mm512_store_ps(sinPtr, sine);
159 inPtr += 16;
160 sinPtr += 16;
161 }
162
163 number = sixteenPoints * 16;
164 for (; number < num_points; number++) {
165 *sinPtr++ = sinf(*inPtr++);
166 }
167}
168#endif
169#if LV_HAVE_AVX2 && LV_HAVE_FMA
170#include <immintrin.h>
171
172static inline void
173volk_32f_sin_32f_a_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
174{
175 float* bPtr = bVector;
176 const float* aPtr = aVector;
177
178 unsigned int number = 0;
179 unsigned int eighthPoints = num_points / 8;
180 unsigned int i = 0;
181
182 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
183 fzeroes;
184 __m256 sine, cosine, condition1, condition2;
185 __m256i q, r, ones, twos, fours;
186
187 m4pi = _mm256_set1_ps(1.273239545);
188 pio4A = _mm256_set1_ps(0.78515625);
189 pio4B = _mm256_set1_ps(0.241876e-3);
190 ffours = _mm256_set1_ps(4.0);
191 ftwos = _mm256_set1_ps(2.0);
192 fones = _mm256_set1_ps(1.0);
193 fzeroes = _mm256_setzero_ps();
194 ones = _mm256_set1_epi32(1);
195 twos = _mm256_set1_epi32(2);
196 fours = _mm256_set1_epi32(4);
197
198 cp1 = _mm256_set1_ps(1.0);
199 cp2 = _mm256_set1_ps(0.83333333e-1);
200 cp3 = _mm256_set1_ps(0.2777778e-2);
201 cp4 = _mm256_set1_ps(0.49603e-4);
202 cp5 = _mm256_set1_ps(0.551e-6);
203
204 for (; number < eighthPoints; number++) {
205 aVal = _mm256_load_ps(aPtr);
206 s = _mm256_sub_ps(aVal,
207 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
208 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
209 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
210 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
211
212 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
213 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
214
215 s = _mm256_div_ps(
216 s,
217 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
218 s = _mm256_mul_ps(s, s);
219 // Evaluate Taylor series
220 s = _mm256_mul_ps(
221 _mm256_fmadd_ps(
222 _mm256_fmsub_ps(
223 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
224 s,
225 cp1),
226 s);
227
228 for (i = 0; i < 3; i++) {
229 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
230 }
231 s = _mm256_div_ps(s, ftwos);
232
233 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
234 cosine = _mm256_sub_ps(fones, s);
235
236 condition1 = _mm256_cmp_ps(
237 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
238 fzeroes,
239 _CMP_NEQ_UQ);
240 condition2 = _mm256_cmp_ps(
241 _mm256_cmp_ps(
242 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
243 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
244 _CMP_NEQ_UQ);
245 // Need this condition only for cos
246 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
247 // twos), fours)), fzeroes);
248
249 sine =
250 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
251 sine = _mm256_sub_ps(
252 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
253 _mm256_store_ps(bPtr, sine);
254 aPtr += 8;
255 bPtr += 8;
256 }
257
258 number = eighthPoints * 8;
259 for (; number < num_points; number++) {
260 *bPtr++ = sin(*aPtr++);
261 }
262}
263
264#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for aligned */
265
266#ifdef LV_HAVE_AVX2
267#include <immintrin.h>
268
269static inline void
270volk_32f_sin_32f_a_avx2(float* bVector, const float* aVector, unsigned int num_points)
271{
272 float* bPtr = bVector;
273 const float* aPtr = aVector;
274
275 unsigned int number = 0;
276 unsigned int eighthPoints = num_points / 8;
277 unsigned int i = 0;
278
279 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
280 fzeroes;
281 __m256 sine, cosine, condition1, condition2;
282 __m256i q, r, ones, twos, fours;
283
284 m4pi = _mm256_set1_ps(1.273239545);
285 pio4A = _mm256_set1_ps(0.78515625);
286 pio4B = _mm256_set1_ps(0.241876e-3);
287 ffours = _mm256_set1_ps(4.0);
288 ftwos = _mm256_set1_ps(2.0);
289 fones = _mm256_set1_ps(1.0);
290 fzeroes = _mm256_setzero_ps();
291 ones = _mm256_set1_epi32(1);
292 twos = _mm256_set1_epi32(2);
293 fours = _mm256_set1_epi32(4);
294
295 cp1 = _mm256_set1_ps(1.0);
296 cp2 = _mm256_set1_ps(0.83333333e-1);
297 cp3 = _mm256_set1_ps(0.2777778e-2);
298 cp4 = _mm256_set1_ps(0.49603e-4);
299 cp5 = _mm256_set1_ps(0.551e-6);
300
301 for (; number < eighthPoints; number++) {
302 aVal = _mm256_load_ps(aPtr);
303 s = _mm256_sub_ps(aVal,
304 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
305 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
306 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
307 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
308
309 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
310 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
311
312 s = _mm256_div_ps(
313 s,
314 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
315 s = _mm256_mul_ps(s, s);
316 // Evaluate Taylor series
317 s = _mm256_mul_ps(
318 _mm256_add_ps(
319 _mm256_mul_ps(
320 _mm256_sub_ps(
321 _mm256_mul_ps(
322 _mm256_add_ps(
323 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
324 s),
325 cp3),
326 s),
327 cp2),
328 s),
329 cp1),
330 s);
331
332 for (i = 0; i < 3; i++) {
333 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
334 }
335 s = _mm256_div_ps(s, ftwos);
336
337 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
338 cosine = _mm256_sub_ps(fones, s);
339
340 condition1 = _mm256_cmp_ps(
341 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
342 fzeroes,
343 _CMP_NEQ_UQ);
344 condition2 = _mm256_cmp_ps(
345 _mm256_cmp_ps(
346 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
347 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
348 _CMP_NEQ_UQ);
349 // Need this condition only for cos
350 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
351 // twos), fours)), fzeroes);
352
353 sine =
354 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
355 sine = _mm256_sub_ps(
356 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
357 _mm256_store_ps(bPtr, sine);
358 aPtr += 8;
359 bPtr += 8;
360 }
361
362 number = eighthPoints * 8;
363 for (; number < num_points; number++) {
364 *bPtr++ = sin(*aPtr++);
365 }
366}
367
368#endif /* LV_HAVE_AVX2 for aligned */
369
370#ifdef LV_HAVE_SSE4_1
371#include <smmintrin.h>
372
373static inline void
374volk_32f_sin_32f_a_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
375{
376 float* bPtr = bVector;
377 const float* aPtr = aVector;
378
379 unsigned int number = 0;
380 unsigned int quarterPoints = num_points / 4;
381 unsigned int i = 0;
382
383 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
384 fzeroes;
385 __m128 sine, cosine, condition1, condition2;
386 __m128i q, r, ones, twos, fours;
387
388 m4pi = _mm_set1_ps(1.273239545);
389 pio4A = _mm_set1_ps(0.78515625);
390 pio4B = _mm_set1_ps(0.241876e-3);
391 ffours = _mm_set1_ps(4.0);
392 ftwos = _mm_set1_ps(2.0);
393 fones = _mm_set1_ps(1.0);
394 fzeroes = _mm_setzero_ps();
395 ones = _mm_set1_epi32(1);
396 twos = _mm_set1_epi32(2);
397 fours = _mm_set1_epi32(4);
398
399 cp1 = _mm_set1_ps(1.0);
400 cp2 = _mm_set1_ps(0.83333333e-1);
401 cp3 = _mm_set1_ps(0.2777778e-2);
402 cp4 = _mm_set1_ps(0.49603e-4);
403 cp5 = _mm_set1_ps(0.551e-6);
404
405 for (; number < quarterPoints; number++) {
406 aVal = _mm_load_ps(aPtr);
407 s = _mm_sub_ps(aVal,
408 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
409 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
410 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
411
412 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
413 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
414
415 s = _mm_div_ps(
416 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
417 s = _mm_mul_ps(s, s);
418 // Evaluate Taylor series
419 s = _mm_mul_ps(
420 _mm_add_ps(
421 _mm_mul_ps(
422 _mm_sub_ps(
423 _mm_mul_ps(
424 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
425 cp3),
426 s),
427 cp2),
428 s),
429 cp1),
430 s);
431
432 for (i = 0; i < 3; i++) {
433 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
434 }
435 s = _mm_div_ps(s, ftwos);
436
437 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
438 cosine = _mm_sub_ps(fones, s);
439
440 condition1 = _mm_cmpneq_ps(
441 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
442 condition2 = _mm_cmpneq_ps(
443 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
444 _mm_cmplt_ps(aVal, fzeroes));
445 // Need this condition only for cos
446 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
447 // twos), fours)), fzeroes);
448
449 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
450 sine =
451 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
452 _mm_store_ps(bPtr, sine);
453 aPtr += 4;
454 bPtr += 4;
455 }
456
457 number = quarterPoints * 4;
458 for (; number < num_points; number++) {
459 *bPtr++ = sinf(*aPtr++);
460 }
461}
462
463#endif /* LV_HAVE_SSE4_1 for aligned */
464
465
466#endif /* INCLUDED_volk_32f_sin_32f_a_H */
467
468#ifndef INCLUDED_volk_32f_sin_32f_u_H
469#define INCLUDED_volk_32f_sin_32f_u_H
470
471#ifdef LV_HAVE_AVX512F
472
473#include <immintrin.h>
474static inline void volk_32f_sin_32f_u_avx512f(float* sinVector,
475 const float* inVector,
476 unsigned int num_points)
477{
478 float* sinPtr = sinVector;
479 const float* inPtr = inVector;
480
481 unsigned int number = 0;
482 unsigned int sixteenPoints = num_points / 16;
483 unsigned int i = 0;
484
485 __m512 aVal, s, r, m4pi, pio4A, pio4B, pio4C, cp1, cp2, cp3, cp4, cp5, ffours, ftwos,
486 fones;
487 __m512 sine, cosine;
488 __m512i q, zeros, ones, twos, fours;
489
490 m4pi = _mm512_set1_ps(1.273239544735162542821171882678754627704620361328125);
491 pio4A = _mm512_set1_ps(0.7853981554508209228515625);
492 pio4B = _mm512_set1_ps(0.794662735614792836713604629039764404296875e-8);
493 pio4C = _mm512_set1_ps(0.306161699786838294306516483068750264552437361480769e-16);
494 ffours = _mm512_set1_ps(4.0);
495 ftwos = _mm512_set1_ps(2.0);
496 fones = _mm512_set1_ps(1.0);
497 zeros = _mm512_setzero_epi32();
498 ones = _mm512_set1_epi32(1);
499 twos = _mm512_set1_epi32(2);
500 fours = _mm512_set1_epi32(4);
501
502 cp1 = _mm512_set1_ps(1.0);
503 cp2 = _mm512_set1_ps(0.08333333333333333);
504 cp3 = _mm512_set1_ps(0.002777777777777778);
505 cp4 = _mm512_set1_ps(4.96031746031746e-05);
506 cp5 = _mm512_set1_ps(5.511463844797178e-07);
507 __mmask16 condition1, condition2, ltZero;
508
509 for (; number < sixteenPoints; number++) {
510 aVal = _mm512_loadu_ps(inPtr);
511 // s = fabs(aVal)
512 s = (__m512)(_mm512_and_si512((__m512i)(aVal), _mm512_set1_epi32(0x7fffffff)));
513
514 // q = (int) (s * (4/pi)), floor(aVal / (pi/4))
515 q = _mm512_cvtps_epi32(_mm512_floor_ps(_mm512_mul_ps(s, m4pi)));
516 // r = q + q&1, q indicates quadrant, r gives
517 r = _mm512_cvtepi32_ps(_mm512_add_epi32(q, _mm512_and_si512(q, ones)));
518
519 s = _mm512_fnmadd_ps(r, pio4A, s);
520 s = _mm512_fnmadd_ps(r, pio4B, s);
521 s = _mm512_fnmadd_ps(r, pio4C, s);
522
523 s = _mm512_div_ps(
524 s,
525 _mm512_set1_ps(8.0f)); // The constant is 2^N, for 3 times argument reduction
526 s = _mm512_mul_ps(s, s);
527 // Evaluate Taylor series
528 s = _mm512_mul_ps(
529 _mm512_fmadd_ps(
530 _mm512_fmsub_ps(
531 _mm512_fmadd_ps(_mm512_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
532 s,
533 cp1),
534 s);
535
536 for (i = 0; i < 3; i++)
537 s = _mm512_mul_ps(s, _mm512_sub_ps(ffours, s));
538 s = _mm512_div_ps(s, ftwos);
539
540 sine = _mm512_sqrt_ps(_mm512_mul_ps(_mm512_sub_ps(ftwos, s), s));
541 cosine = _mm512_sub_ps(fones, s);
542
543 condition1 = _mm512_cmpneq_epi32_mask(
544 _mm512_and_si512(_mm512_add_epi32(q, ones), twos), zeros);
545 ltZero = _mm512_cmp_ps_mask(aVal, _mm512_setzero_ps(), _CMP_LT_OS);
546 condition2 = _mm512_kxor(
547 _mm512_cmpneq_epi32_mask(_mm512_and_epi32(q, fours), zeros), ltZero);
548
549 sine = _mm512_mask_blend_ps(condition1, sine, cosine);
550 sine = _mm512_mask_mul_ps(sine, condition2, sine, _mm512_set1_ps(-1.f));
551 _mm512_storeu_ps(sinPtr, sine);
552 inPtr += 16;
553 sinPtr += 16;
554 }
555
556 number = sixteenPoints * 16;
557 for (; number < num_points; number++) {
558 *sinPtr++ = sinf(*inPtr++);
559 }
560}
561#endif
562
563#if LV_HAVE_AVX2 && LV_HAVE_FMA
564#include <immintrin.h>
565
566static inline void
567volk_32f_sin_32f_u_avx2_fma(float* bVector, const float* aVector, unsigned int num_points)
568{
569 float* bPtr = bVector;
570 const float* aPtr = aVector;
571
572 unsigned int number = 0;
573 unsigned int eighthPoints = num_points / 8;
574 unsigned int i = 0;
575
576 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
577 fzeroes;
578 __m256 sine, cosine, condition1, condition2;
579 __m256i q, r, ones, twos, fours;
580
581 m4pi = _mm256_set1_ps(1.273239545);
582 pio4A = _mm256_set1_ps(0.78515625);
583 pio4B = _mm256_set1_ps(0.241876e-3);
584 ffours = _mm256_set1_ps(4.0);
585 ftwos = _mm256_set1_ps(2.0);
586 fones = _mm256_set1_ps(1.0);
587 fzeroes = _mm256_setzero_ps();
588 ones = _mm256_set1_epi32(1);
589 twos = _mm256_set1_epi32(2);
590 fours = _mm256_set1_epi32(4);
591
592 cp1 = _mm256_set1_ps(1.0);
593 cp2 = _mm256_set1_ps(0.83333333e-1);
594 cp3 = _mm256_set1_ps(0.2777778e-2);
595 cp4 = _mm256_set1_ps(0.49603e-4);
596 cp5 = _mm256_set1_ps(0.551e-6);
597
598 for (; number < eighthPoints; number++) {
599 aVal = _mm256_loadu_ps(aPtr);
600 s = _mm256_sub_ps(aVal,
601 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
602 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
603 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
604 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
605
606 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4A, s);
607 s = _mm256_fnmadd_ps(_mm256_cvtepi32_ps(r), pio4B, s);
608
609 s = _mm256_div_ps(
610 s,
611 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
612 s = _mm256_mul_ps(s, s);
613 // Evaluate Taylor series
614 s = _mm256_mul_ps(
615 _mm256_fmadd_ps(
616 _mm256_fmsub_ps(
617 _mm256_fmadd_ps(_mm256_fmsub_ps(s, cp5, cp4), s, cp3), s, cp2),
618 s,
619 cp1),
620 s);
621
622 for (i = 0; i < 3; i++) {
623 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
624 }
625 s = _mm256_div_ps(s, ftwos);
626
627 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
628 cosine = _mm256_sub_ps(fones, s);
629
630 condition1 = _mm256_cmp_ps(
631 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
632 fzeroes,
633 _CMP_NEQ_UQ);
634 condition2 = _mm256_cmp_ps(
635 _mm256_cmp_ps(
636 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
637 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
638 _CMP_NEQ_UQ);
639 // Need this condition only for cos
640 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
641 // twos), fours)), fzeroes);
642
643 sine =
644 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
645 sine = _mm256_sub_ps(
646 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
647 _mm256_storeu_ps(bPtr, sine);
648 aPtr += 8;
649 bPtr += 8;
650 }
651
652 number = eighthPoints * 8;
653 for (; number < num_points; number++) {
654 *bPtr++ = sin(*aPtr++);
655 }
656}
657
658#endif /* LV_HAVE_AVX2 && LV_HAVE_FMA for unaligned */
659
660#ifdef LV_HAVE_AVX2
661#include <immintrin.h>
662
663static inline void
664volk_32f_sin_32f_u_avx2(float* bVector, const float* aVector, unsigned int num_points)
665{
666 float* bPtr = bVector;
667 const float* aPtr = aVector;
668
669 unsigned int number = 0;
670 unsigned int eighthPoints = num_points / 8;
671 unsigned int i = 0;
672
673 __m256 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
674 fzeroes;
675 __m256 sine, cosine, condition1, condition2;
676 __m256i q, r, ones, twos, fours;
677
678 m4pi = _mm256_set1_ps(1.273239545);
679 pio4A = _mm256_set1_ps(0.78515625);
680 pio4B = _mm256_set1_ps(0.241876e-3);
681 ffours = _mm256_set1_ps(4.0);
682 ftwos = _mm256_set1_ps(2.0);
683 fones = _mm256_set1_ps(1.0);
684 fzeroes = _mm256_setzero_ps();
685 ones = _mm256_set1_epi32(1);
686 twos = _mm256_set1_epi32(2);
687 fours = _mm256_set1_epi32(4);
688
689 cp1 = _mm256_set1_ps(1.0);
690 cp2 = _mm256_set1_ps(0.83333333e-1);
691 cp3 = _mm256_set1_ps(0.2777778e-2);
692 cp4 = _mm256_set1_ps(0.49603e-4);
693 cp5 = _mm256_set1_ps(0.551e-6);
694
695 for (; number < eighthPoints; number++) {
696 aVal = _mm256_loadu_ps(aPtr);
697 s = _mm256_sub_ps(aVal,
698 _mm256_and_ps(_mm256_mul_ps(aVal, ftwos),
699 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS)));
700 q = _mm256_cvtps_epi32(_mm256_floor_ps(_mm256_mul_ps(s, m4pi)));
701 r = _mm256_add_epi32(q, _mm256_and_si256(q, ones));
702
703 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4A));
704 s = _mm256_sub_ps(s, _mm256_mul_ps(_mm256_cvtepi32_ps(r), pio4B));
705
706 s = _mm256_div_ps(
707 s,
708 _mm256_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
709 s = _mm256_mul_ps(s, s);
710 // Evaluate Taylor series
711 s = _mm256_mul_ps(
712 _mm256_add_ps(
713 _mm256_mul_ps(
714 _mm256_sub_ps(
715 _mm256_mul_ps(
716 _mm256_add_ps(
717 _mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(s, cp5), cp4),
718 s),
719 cp3),
720 s),
721 cp2),
722 s),
723 cp1),
724 s);
725
726 for (i = 0; i < 3; i++) {
727 s = _mm256_mul_ps(s, _mm256_sub_ps(ffours, s));
728 }
729 s = _mm256_div_ps(s, ftwos);
730
731 sine = _mm256_sqrt_ps(_mm256_mul_ps(_mm256_sub_ps(ftwos, s), s));
732 cosine = _mm256_sub_ps(fones, s);
733
734 condition1 = _mm256_cmp_ps(
735 _mm256_cvtepi32_ps(_mm256_and_si256(_mm256_add_epi32(q, ones), twos)),
736 fzeroes,
737 _CMP_NEQ_UQ);
738 condition2 = _mm256_cmp_ps(
739 _mm256_cmp_ps(
740 _mm256_cvtepi32_ps(_mm256_and_si256(q, fours)), fzeroes, _CMP_NEQ_UQ),
741 _mm256_cmp_ps(aVal, fzeroes, _CMP_LT_OS),
742 _CMP_NEQ_UQ);
743 // Need this condition only for cos
744 // condition3 = _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q,
745 // twos), fours)), fzeroes);
746
747 sine =
748 _mm256_add_ps(sine, _mm256_and_ps(_mm256_sub_ps(cosine, sine), condition1));
749 sine = _mm256_sub_ps(
750 sine, _mm256_and_ps(_mm256_mul_ps(sine, _mm256_set1_ps(2.0f)), condition2));
751 _mm256_storeu_ps(bPtr, sine);
752 aPtr += 8;
753 bPtr += 8;
754 }
755
756 number = eighthPoints * 8;
757 for (; number < num_points; number++) {
758 *bPtr++ = sin(*aPtr++);
759 }
760}
761
762#endif /* LV_HAVE_AVX2 for unaligned */
763
764
765#ifdef LV_HAVE_SSE4_1
766#include <smmintrin.h>
767
768static inline void
769volk_32f_sin_32f_u_sse4_1(float* bVector, const float* aVector, unsigned int num_points)
770{
771 float* bPtr = bVector;
772 const float* aPtr = aVector;
773
774 unsigned int number = 0;
775 unsigned int quarterPoints = num_points / 4;
776 unsigned int i = 0;
777
778 __m128 aVal, s, m4pi, pio4A, pio4B, cp1, cp2, cp3, cp4, cp5, ffours, ftwos, fones,
779 fzeroes;
780 __m128 sine, cosine, condition1, condition2;
781 __m128i q, r, ones, twos, fours;
782
783 m4pi = _mm_set1_ps(1.273239545);
784 pio4A = _mm_set1_ps(0.78515625);
785 pio4B = _mm_set1_ps(0.241876e-3);
786 ffours = _mm_set1_ps(4.0);
787 ftwos = _mm_set1_ps(2.0);
788 fones = _mm_set1_ps(1.0);
789 fzeroes = _mm_setzero_ps();
790 ones = _mm_set1_epi32(1);
791 twos = _mm_set1_epi32(2);
792 fours = _mm_set1_epi32(4);
793
794 cp1 = _mm_set1_ps(1.0);
795 cp2 = _mm_set1_ps(0.83333333e-1);
796 cp3 = _mm_set1_ps(0.2777778e-2);
797 cp4 = _mm_set1_ps(0.49603e-4);
798 cp5 = _mm_set1_ps(0.551e-6);
799
800 for (; number < quarterPoints; number++) {
801 aVal = _mm_loadu_ps(aPtr);
802 s = _mm_sub_ps(aVal,
803 _mm_and_ps(_mm_mul_ps(aVal, ftwos), _mm_cmplt_ps(aVal, fzeroes)));
804 q = _mm_cvtps_epi32(_mm_floor_ps(_mm_mul_ps(s, m4pi)));
805 r = _mm_add_epi32(q, _mm_and_si128(q, ones));
806
807 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4A));
808 s = _mm_sub_ps(s, _mm_mul_ps(_mm_cvtepi32_ps(r), pio4B));
809
810 s = _mm_div_ps(
811 s, _mm_set1_ps(8.0)); // The constant is 2^N, for 3 times argument reduction
812 s = _mm_mul_ps(s, s);
813 // Evaluate Taylor series
814 s = _mm_mul_ps(
815 _mm_add_ps(
816 _mm_mul_ps(
817 _mm_sub_ps(
818 _mm_mul_ps(
819 _mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(s, cp5), cp4), s),
820 cp3),
821 s),
822 cp2),
823 s),
824 cp1),
825 s);
826
827 for (i = 0; i < 3; i++) {
828 s = _mm_mul_ps(s, _mm_sub_ps(ffours, s));
829 }
830 s = _mm_div_ps(s, ftwos);
831
832 sine = _mm_sqrt_ps(_mm_mul_ps(_mm_sub_ps(ftwos, s), s));
833 cosine = _mm_sub_ps(fones, s);
834
835 condition1 = _mm_cmpneq_ps(
836 _mm_cvtepi32_ps(_mm_and_si128(_mm_add_epi32(q, ones), twos)), fzeroes);
837 condition2 = _mm_cmpneq_ps(
838 _mm_cmpneq_ps(_mm_cvtepi32_ps(_mm_and_si128(q, fours)), fzeroes),
839 _mm_cmplt_ps(aVal, fzeroes));
840
841 sine = _mm_add_ps(sine, _mm_and_ps(_mm_sub_ps(cosine, sine), condition1));
842 sine =
843 _mm_sub_ps(sine, _mm_and_ps(_mm_mul_ps(sine, _mm_set1_ps(2.0f)), condition2));
844 _mm_storeu_ps(bPtr, sine);
845 aPtr += 4;
846 bPtr += 4;
847 }
848
849 number = quarterPoints * 4;
850 for (; number < num_points; number++) {
851 *bPtr++ = sinf(*aPtr++);
852 }
853}
854
855#endif /* LV_HAVE_SSE4_1 for unaligned */
856
857
858#ifdef LV_HAVE_GENERIC
859
860static inline void
861volk_32f_sin_32f_generic(float* bVector, const float* aVector, unsigned int num_points)
862{
863 float* bPtr = bVector;
864 const float* aPtr = aVector;
865 unsigned int number = 0;
866
867 for (number = 0; number < num_points; number++) {
868 *bPtr++ = sinf(*aPtr++);
869 }
870}
871
872#endif /* LV_HAVE_GENERIC */
873
874
875#ifdef LV_HAVE_NEON
876#include <arm_neon.h>
878
879static inline void
880volk_32f_sin_32f_neon(float* bVector, const float* aVector, unsigned int num_points)
881{
882 unsigned int number = 0;
883 unsigned int quarter_points = num_points / 4;
884 float* bVectorPtr = bVector;
885 const float* aVectorPtr = aVector;
886
887 float32x4_t b_vec;
888 float32x4_t a_vec;
889
890 for (number = 0; number < quarter_points; number++) {
891 a_vec = vld1q_f32(aVectorPtr);
892 // Prefetch next one, speeds things up
893 __VOLK_PREFETCH(aVectorPtr + 4);
894 b_vec = _vsinq_f32(a_vec);
895 vst1q_f32(bVectorPtr, b_vec);
896 // move pointers ahead
897 bVectorPtr += 4;
898 aVectorPtr += 4;
899 }
900
901 // Deal with the rest
902 for (number = quarter_points * 4; number < num_points; number++) {
903 *bVectorPtr++ = sinf(*aVectorPtr++);
904 }
905}
906
907#endif /* LV_HAVE_NEON */
908
909
910#endif /* INCLUDED_volk_32f_sin_32f_u_H */
static void volk_32f_sin_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:861
static void volk_32f_sin_32f_neon(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_sin_32f.h:880
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
for i
Definition: volk_config_fixed.tmpl.h:25
static float32x4_t _vsinq_f32(float32x4_t x)
Definition: volk_neon_intrinsics.h:262