Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_exp_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2015-2020 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
23/* SIMD (SSE4) implementation of exp
24 Inspired by Intel Approximate Math library, and based on the
25 corresponding algorithms of the cephes math library
26*/
27
28/* Copyright (C) 2007 Julien Pommier
29
30 This software is provided 'as-is', without any express or implied
31 warranty. In no event will the authors be held liable for any damages
32 arising from the use of this software.
33
34 Permission is granted to anyone to use this software for any purpose,
35 including commercial applications, and to alter it and redistribute it
36 freely, subject to the following restrictions:
37
38 1. The origin of this software must not be misrepresented; you must not
39 claim that you wrote the original software. If you use this software
40 in a product, an acknowledgment in the product documentation would be
41 appreciated but is not required.
42 2. Altered source versions must be plainly marked as such, and must not be
43 misrepresented as being the original software.
44 3. This notice may not be removed or altered from any source distribution.
45
46 (this is the zlib license)
47*/
48
95#include <inttypes.h>
96#include <math.h>
97#include <stdio.h>
98
99#ifndef INCLUDED_volk_32f_exp_32f_a_H
100#define INCLUDED_volk_32f_exp_32f_a_H
101
102#ifdef LV_HAVE_SSE2
103#include <emmintrin.h>
104
105static inline void
106volk_32f_exp_32f_a_sse2(float* bVector, const float* aVector, unsigned int num_points)
107{
108 float* bPtr = bVector;
109 const float* aPtr = aVector;
110
111 unsigned int number = 0;
112 unsigned int quarterPoints = num_points / 4;
113
114 // Declare variables and constants
115 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
116 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
117 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
118 __m128i emm0, pi32_0x7f;
119
120 one = _mm_set1_ps(1.0);
121 exp_hi = _mm_set1_ps(88.3762626647949);
122 exp_lo = _mm_set1_ps(-88.3762626647949);
123 log2EF = _mm_set1_ps(1.44269504088896341);
124 half = _mm_set1_ps(0.5);
125 exp_C1 = _mm_set1_ps(0.693359375);
126 exp_C2 = _mm_set1_ps(-2.12194440e-4);
127 pi32_0x7f = _mm_set1_epi32(0x7f);
128
129 exp_p0 = _mm_set1_ps(1.9875691500e-4);
130 exp_p1 = _mm_set1_ps(1.3981999507e-3);
131 exp_p2 = _mm_set1_ps(8.3334519073e-3);
132 exp_p3 = _mm_set1_ps(4.1665795894e-2);
133 exp_p4 = _mm_set1_ps(1.6666665459e-1);
134 exp_p5 = _mm_set1_ps(5.0000001201e-1);
135
136 for (; number < quarterPoints; number++) {
137 aVal = _mm_load_ps(aPtr);
138 tmp = _mm_setzero_ps();
139
140 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
141
142 /* express exp(x) as exp(g + n*log(2)) */
143 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
144
145 emm0 = _mm_cvttps_epi32(fx);
146 tmp = _mm_cvtepi32_ps(emm0);
147
148 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
149 fx = _mm_sub_ps(tmp, mask);
150
151 tmp = _mm_mul_ps(fx, exp_C1);
152 z = _mm_mul_ps(fx, exp_C2);
153 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
154 z = _mm_mul_ps(aVal, aVal);
155
156 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
157 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
158 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
159 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
160 y = _mm_add_ps(y, one);
161
162 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
163
164 pow2n = _mm_castsi128_ps(emm0);
165 bVal = _mm_mul_ps(y, pow2n);
166
167 _mm_store_ps(bPtr, bVal);
168 aPtr += 4;
169 bPtr += 4;
170 }
171
172 number = quarterPoints * 4;
173 for (; number < num_points; number++) {
174 *bPtr++ = expf(*aPtr++);
175 }
176}
177
178#endif /* LV_HAVE_SSE2 for aligned */
179
180
181#ifdef LV_HAVE_GENERIC
182
183static inline void
184volk_32f_exp_32f_a_generic(float* bVector, const float* aVector, unsigned int num_points)
185{
186 float* bPtr = bVector;
187 const float* aPtr = aVector;
188 unsigned int number = 0;
189
190 for (number = 0; number < num_points; number++) {
191 *bPtr++ = expf(*aPtr++);
192 }
193}
194
195#endif /* LV_HAVE_GENERIC */
196
197#endif /* INCLUDED_volk_32f_exp_32f_a_H */
198
199#ifndef INCLUDED_volk_32f_exp_32f_u_H
200#define INCLUDED_volk_32f_exp_32f_u_H
201
202#ifdef LV_HAVE_SSE2
203#include <emmintrin.h>
204
205static inline void
206volk_32f_exp_32f_u_sse2(float* bVector, const float* aVector, unsigned int num_points)
207{
208 float* bPtr = bVector;
209 const float* aPtr = aVector;
210
211 unsigned int number = 0;
212 unsigned int quarterPoints = num_points / 4;
213
214 // Declare variables and constants
215 __m128 aVal, bVal, tmp, fx, mask, pow2n, z, y;
216 __m128 one, exp_hi, exp_lo, log2EF, half, exp_C1, exp_C2;
217 __m128 exp_p0, exp_p1, exp_p2, exp_p3, exp_p4, exp_p5;
218 __m128i emm0, pi32_0x7f;
219
220 one = _mm_set1_ps(1.0);
221 exp_hi = _mm_set1_ps(88.3762626647949);
222 exp_lo = _mm_set1_ps(-88.3762626647949);
223 log2EF = _mm_set1_ps(1.44269504088896341);
224 half = _mm_set1_ps(0.5);
225 exp_C1 = _mm_set1_ps(0.693359375);
226 exp_C2 = _mm_set1_ps(-2.12194440e-4);
227 pi32_0x7f = _mm_set1_epi32(0x7f);
228
229 exp_p0 = _mm_set1_ps(1.9875691500e-4);
230 exp_p1 = _mm_set1_ps(1.3981999507e-3);
231 exp_p2 = _mm_set1_ps(8.3334519073e-3);
232 exp_p3 = _mm_set1_ps(4.1665795894e-2);
233 exp_p4 = _mm_set1_ps(1.6666665459e-1);
234 exp_p5 = _mm_set1_ps(5.0000001201e-1);
235
236
237 for (; number < quarterPoints; number++) {
238 aVal = _mm_loadu_ps(aPtr);
239 tmp = _mm_setzero_ps();
240
241 aVal = _mm_max_ps(_mm_min_ps(aVal, exp_hi), exp_lo);
242
243 /* express exp(x) as exp(g + n*log(2)) */
244 fx = _mm_add_ps(_mm_mul_ps(aVal, log2EF), half);
245
246 emm0 = _mm_cvttps_epi32(fx);
247 tmp = _mm_cvtepi32_ps(emm0);
248
249 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), one);
250 fx = _mm_sub_ps(tmp, mask);
251
252 tmp = _mm_mul_ps(fx, exp_C1);
253 z = _mm_mul_ps(fx, exp_C2);
254 aVal = _mm_sub_ps(_mm_sub_ps(aVal, tmp), z);
255 z = _mm_mul_ps(aVal, aVal);
256
257 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(exp_p0, aVal), exp_p1), aVal);
258 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p2), aVal), exp_p3);
259 y = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, aVal), exp_p4), aVal);
260 y = _mm_add_ps(_mm_mul_ps(_mm_add_ps(y, exp_p5), z), aVal);
261 y = _mm_add_ps(y, one);
262
263 emm0 = _mm_slli_epi32(_mm_add_epi32(_mm_cvttps_epi32(fx), pi32_0x7f), 23);
264
265 pow2n = _mm_castsi128_ps(emm0);
266 bVal = _mm_mul_ps(y, pow2n);
267
268 _mm_storeu_ps(bPtr, bVal);
269 aPtr += 4;
270 bPtr += 4;
271 }
272
273 number = quarterPoints * 4;
274 for (; number < num_points; number++) {
275 *bPtr++ = expf(*aPtr++);
276 }
277}
278
279#endif /* LV_HAVE_SSE2 for unaligned */
280
281
282#ifdef LV_HAVE_GENERIC
283
284static inline void
285volk_32f_exp_32f_u_generic(float* bVector, const float* aVector, unsigned int num_points)
286{
287 float* bPtr = bVector;
288 const float* aPtr = aVector;
289 unsigned int number = 0;
290
291 for (number = 0; number < num_points; number++) {
292 *bPtr++ = expf(*aPtr++);
293 }
294}
295
296#endif /* LV_HAVE_GENERIC */
297
298#endif /* INCLUDED_volk_32f_exp_32f_u_H */
static void volk_32f_exp_32f_a_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:106
static void volk_32f_exp_32f_u_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:285
static void volk_32f_exp_32f_u_sse2(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:206
static void volk_32f_exp_32f_a_generic(float *bVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_exp_32f.h:184