Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_expfast_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
65#include <inttypes.h>
66#include <math.h>
67#include <stdio.h>
68
/*
 * Constants used by the SIMD kernels in this file, which evaluate
 * (A / Mln2) * x + (B - C), convert the result to a 32-bit integer and then
 * reinterpret that integer's bits as a float.
 */
#define Mln2 0.6931471805f /* natural logarithm of 2 */
#define A 8388608.0f       /* 2^23 */
#define B 1065353216.0f    /* 127 * 2^23 (0x3F800000, the bit pattern of 1.0f) */
/* Correction subtracted from B at every use site (B - C).
 * NOTE(review): presumably an empirically tuned error-reduction term in the
 * style of Schraudolph's fast exp -- confirm the origin of this value. */
#define C 60801.0f
73
74
75#ifndef INCLUDED_volk_32f_expfast_32f_a_H
76#define INCLUDED_volk_32f_expfast_32f_a_H
77
78#if LV_HAVE_AVX && LV_HAVE_FMA
79
80#include <immintrin.h>
81
82static inline void volk_32f_expfast_32f_a_avx_fma(float* bVector,
83 const float* aVector,
84 unsigned int num_points)
85{
86 float* bPtr = bVector;
87 const float* aPtr = aVector;
88
89 unsigned int number = 0;
90 const unsigned int eighthPoints = num_points / 8;
91
92 __m256 aVal, bVal, a, b;
93 __m256i exp;
94 a = _mm256_set1_ps(A / Mln2);
95 b = _mm256_set1_ps(B - C);
96
97 for (; number < eighthPoints; number++) {
98 aVal = _mm256_load_ps(aPtr);
99 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
100 bVal = _mm256_castsi256_ps(exp);
101
102 _mm256_store_ps(bPtr, bVal);
103 aPtr += 8;
104 bPtr += 8;
105 }
106
107 number = eighthPoints * 8;
108 for (; number < num_points; number++) {
109 *bPtr++ = expf(*aPtr++);
110 }
111}
112
113#endif /* LV_HAVE_AVX && LV_HAVE_FMA for aligned */
114
115#ifdef LV_HAVE_AVX
116
117#include <immintrin.h>
118
119static inline void
120volk_32f_expfast_32f_a_avx(float* bVector, const float* aVector, unsigned int num_points)
121{
122 float* bPtr = bVector;
123 const float* aPtr = aVector;
124
125 unsigned int number = 0;
126 const unsigned int eighthPoints = num_points / 8;
127
128 __m256 aVal, bVal, a, b;
129 __m256i exp;
130 a = _mm256_set1_ps(A / Mln2);
131 b = _mm256_set1_ps(B - C);
132
133 for (; number < eighthPoints; number++) {
134 aVal = _mm256_load_ps(aPtr);
135 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
136 bVal = _mm256_castsi256_ps(exp);
137
138 _mm256_store_ps(bPtr, bVal);
139 aPtr += 8;
140 bPtr += 8;
141 }
142
143 number = eighthPoints * 8;
144 for (; number < num_points; number++) {
145 *bPtr++ = expf(*aPtr++);
146 }
147}
148
149#endif /* LV_HAVE_AVX for aligned */
150
151#ifdef LV_HAVE_SSE4_1
152#include <smmintrin.h>
153
154static inline void volk_32f_expfast_32f_a_sse4_1(float* bVector,
155 const float* aVector,
156 unsigned int num_points)
157{
158 float* bPtr = bVector;
159 const float* aPtr = aVector;
160
161 unsigned int number = 0;
162 const unsigned int quarterPoints = num_points / 4;
163
164 __m128 aVal, bVal, a, b;
165 __m128i exp;
166 a = _mm_set1_ps(A / Mln2);
167 b = _mm_set1_ps(B - C);
168
169 for (; number < quarterPoints; number++) {
170 aVal = _mm_load_ps(aPtr);
171 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
172 bVal = _mm_castsi128_ps(exp);
173
174 _mm_store_ps(bPtr, bVal);
175 aPtr += 4;
176 bPtr += 4;
177 }
178
179 number = quarterPoints * 4;
180 for (; number < num_points; number++) {
181 *bPtr++ = expf(*aPtr++);
182 }
183}
184
185#endif /* LV_HAVE_SSE4_1 for aligned */
186
187#endif /* INCLUDED_volk_32f_expfast_32f_a_H */
188
189#ifndef INCLUDED_volk_32f_expfast_32f_u_H
190#define INCLUDED_volk_32f_expfast_32f_u_H
191
192#if LV_HAVE_AVX && LV_HAVE_FMA
193#include <immintrin.h>
194
195static inline void volk_32f_expfast_32f_u_avx_fma(float* bVector,
196 const float* aVector,
197 unsigned int num_points)
198{
199 float* bPtr = bVector;
200 const float* aPtr = aVector;
201
202 unsigned int number = 0;
203 const unsigned int eighthPoints = num_points / 8;
204
205 __m256 aVal, bVal, a, b;
206 __m256i exp;
207 a = _mm256_set1_ps(A / Mln2);
208 b = _mm256_set1_ps(B - C);
209
210 for (; number < eighthPoints; number++) {
211 aVal = _mm256_loadu_ps(aPtr);
212 exp = _mm256_cvtps_epi32(_mm256_fmadd_ps(a, aVal, b));
213 bVal = _mm256_castsi256_ps(exp);
214
215 _mm256_storeu_ps(bPtr, bVal);
216 aPtr += 8;
217 bPtr += 8;
218 }
219
220 number = eighthPoints * 8;
221 for (; number < num_points; number++) {
222 *bPtr++ = expf(*aPtr++);
223 }
224}
225
226#endif /* LV_HAVE_AVX && LV_HAVE_FMA for unaligned */
227
228#ifdef LV_HAVE_AVX
229#include <immintrin.h>
230
231static inline void
232volk_32f_expfast_32f_u_avx(float* bVector, const float* aVector, unsigned int num_points)
233{
234 float* bPtr = bVector;
235 const float* aPtr = aVector;
236
237 unsigned int number = 0;
238 const unsigned int eighthPoints = num_points / 8;
239
240 __m256 aVal, bVal, a, b;
241 __m256i exp;
242 a = _mm256_set1_ps(A / Mln2);
243 b = _mm256_set1_ps(B - C);
244
245 for (; number < eighthPoints; number++) {
246 aVal = _mm256_loadu_ps(aPtr);
247 exp = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(a, aVal), b));
248 bVal = _mm256_castsi256_ps(exp);
249
250 _mm256_storeu_ps(bPtr, bVal);
251 aPtr += 8;
252 bPtr += 8;
253 }
254
255 number = eighthPoints * 8;
256 for (; number < num_points; number++) {
257 *bPtr++ = expf(*aPtr++);
258 }
259}
260
261#endif /* LV_HAVE_AVX for unaligned */
262
263
264#ifdef LV_HAVE_SSE4_1
265#include <smmintrin.h>
266
267static inline void volk_32f_expfast_32f_u_sse4_1(float* bVector,
268 const float* aVector,
269 unsigned int num_points)
270{
271 float* bPtr = bVector;
272 const float* aPtr = aVector;
273
274 unsigned int number = 0;
275 const unsigned int quarterPoints = num_points / 4;
276
277 __m128 aVal, bVal, a, b;
278 __m128i exp;
279 a = _mm_set1_ps(A / Mln2);
280 b = _mm_set1_ps(B - C);
281
282 for (; number < quarterPoints; number++) {
283 aVal = _mm_loadu_ps(aPtr);
284 exp = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(a, aVal), b));
285 bVal = _mm_castsi128_ps(exp);
286
287 _mm_storeu_ps(bPtr, bVal);
288 aPtr += 4;
289 bPtr += 4;
290 }
291
292 number = quarterPoints * 4;
293 for (; number < num_points; number++) {
294 *bPtr++ = expf(*aPtr++);
295 }
296}
297
298#endif /* LV_HAVE_SSE4_1 for unaligned */
299
300
301#ifdef LV_HAVE_GENERIC
302
303static inline void volk_32f_expfast_32f_generic(float* bVector,
304 const float* aVector,
305 unsigned int num_points)
306{
307 float* bPtr = bVector;
308 const float* aPtr = aVector;
309 unsigned int number = 0;
310
311 for (number = 0; number < num_points; number++) {
312 *bPtr++ = expf(*aPtr++);
313 }
314}
315#endif /* LV_HAVE_GENERIC */
316
317#endif /* INCLUDED_volk_32f_expfast_32f_u_H */
#define Mln2
Definition: volk_32f_expfast_32f.h:69
#define B
Definition: volk_32f_expfast_32f.h:71
#define A
Definition: volk_32f_expfast_32f.h:70
static void volk_32f_expfast_32f_u_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:232
static void volk_32f_expfast_32f_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:303
#define C
Definition: volk_32f_expfast_32f.h:72
static void volk_32f_expfast_32f_a_avx(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_expfast_32f.h:120