Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_invsqrt_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2013, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
65#ifndef INCLUDED_volk_32f_invsqrt_32f_a_H
66#define INCLUDED_volk_32f_invsqrt_32f_a_H
67
68#include <inttypes.h>
69#include <math.h>
70#include <stdio.h>
71#include <string.h>
72
/*!
 * \brief Fast approximate reciprocal square root, 1/sqrt(number).
 *
 * An initial guess is formed by subtracting half of the input's raw 32-bit
 * pattern from the magic constant 0x5f3759df; that guess is then refined with
 * one Newton-Raphson step, y = y * (1.5 - 0.5 * number * y * y).
 *
 * \param number Input value. The approximation is only meaningful for
 *               positive, finite inputs; other inputs yield a well-defined
 *               but mathematically meaningless value.
 * \return Approximation of 1/sqrt(number).
 */
static inline float Q_rsqrt(float number)
{
    const float threehalfs = 1.5F;
    const float x2 = number * 0.5F;
    uint32_t bits;
    float y;

    /* Reinterpret the float's bits as an integer. memcpy is well-defined in
     * both C and C++, unlike reading the inactive member of a union in C++. */
    memcpy(&bits, &number, sizeof bits);

    /* Work in unsigned arithmetic: right-shifting a negative signed value is
     * implementation-defined and the subtraction could overflow a signed int
     * (undefined behaviour) for negative inputs. OR-ing the sign bit back in
     * reproduces the arithmetic shift, and unsigned subtraction wraps modulo
     * 2^32, so the bit pattern matches the usual two's-complement result. */
    bits = 0x5f3759dfu - ((bits >> 1) | (bits & 0x80000000u));

    memcpy(&y, &bits, sizeof y);

    /* Single Newton-Raphson refinement. Repeating this line would improve
     * accuracy at the cost of extra multiplies. */
    y = y * (threehalfs - (x2 * y * y));

    return y;
}
91
92#ifdef LV_HAVE_AVX
93#include <immintrin.h>
94
95static inline void
96volk_32f_invsqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
97{
98 unsigned int number = 0;
99 const unsigned int eighthPoints = num_points / 8;
100
101 float* cPtr = cVector;
102 const float* aPtr = aVector;
103 __m256 aVal, cVal;
104 for (; number < eighthPoints; number++) {
105 aVal = _mm256_load_ps(aPtr);
106 cVal = _mm256_rsqrt_ps(aVal);
107 _mm256_store_ps(cPtr, cVal);
108 aPtr += 8;
109 cPtr += 8;
110 }
111
112 number = eighthPoints * 8;
113 for (; number < num_points; number++)
114 *cPtr++ = Q_rsqrt(*aPtr++);
115}
116#endif /* LV_HAVE_AVX */
117
118
119#ifdef LV_HAVE_SSE
120#include <xmmintrin.h>
121
122static inline void
123volk_32f_invsqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
124{
125 unsigned int number = 0;
126 const unsigned int quarterPoints = num_points / 4;
127
128 float* cPtr = cVector;
129 const float* aPtr = aVector;
130
131 __m128 aVal, cVal;
132 for (; number < quarterPoints; number++) {
133
134 aVal = _mm_load_ps(aPtr);
135
136 cVal = _mm_rsqrt_ps(aVal);
137
138 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
139
140 aPtr += 4;
141 cPtr += 4;
142 }
143
144 number = quarterPoints * 4;
145 for (; number < num_points; number++) {
146 *cPtr++ = Q_rsqrt(*aPtr++);
147 }
148}
149#endif /* LV_HAVE_SSE */
150
151
152#ifdef LV_HAVE_NEON
153#include <arm_neon.h>
154
/*!
 * \brief Approximate reciprocal square root of each element (NEON).
 *
 * Full groups of four floats are handled with the vrsqrteq_f32 estimate
 * instruction, with no additional refinement step applied. The remaining
 * (num_points % 4) elements are computed one at a time with Q_rsqrt().
 *
 * \param cVector    Output buffer, num_points floats.
 * \param aVector    Input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void
volk_32f_invsqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    /* Index of the first element that does not belong to a full 4-wide group. */
    const unsigned int simd_end = (num_points / 4) * 4;
    unsigned int idx;

    for (idx = 0; idx < simd_end; idx += 4) {
        vst1q_f32(cVector + idx, vrsqrteq_f32(vld1q_f32(aVector + idx)));
    }

    /* Scalar tail. */
    for (; idx < num_points; idx++) {
        cVector[idx] = Q_rsqrt(aVector[idx]);
    }
}
175#endif /* LV_HAVE_NEON */
176
177
178#ifdef LV_HAVE_GENERIC
179
/*!
 * \brief Approximate reciprocal square root of each element (portable C).
 *
 * Applies Q_rsqrt() to every input element in order and writes the result to
 * the matching position of the output buffer.
 *
 * \param cVector    Output buffer, num_points floats.
 * \param aVector    Input buffer, num_points floats.
 * \param num_points Number of elements to process.
 */
static inline void volk_32f_invsqrt_32f_generic(float* cVector,
                                                const float* aVector,
                                                unsigned int num_points)
{
    unsigned int idx;

    for (idx = 0; idx < num_points; idx++) {
        cVector[idx] = Q_rsqrt(aVector[idx]);
    }
}
191#endif /* LV_HAVE_GENERIC */
192
193#ifdef LV_HAVE_AVX
194#include <immintrin.h>
195
196static inline void
197volk_32f_invsqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
198{
199 unsigned int number = 0;
200 const unsigned int eighthPoints = num_points / 8;
201
202 float* cPtr = cVector;
203 const float* aPtr = aVector;
204 __m256 aVal, cVal;
205 for (; number < eighthPoints; number++) {
206 aVal = _mm256_loadu_ps(aPtr);
207 cVal = _mm256_rsqrt_ps(aVal);
208 _mm256_storeu_ps(cPtr, cVal);
209 aPtr += 8;
210 cPtr += 8;
211 }
212
213 number = eighthPoints * 8;
214 for (; number < num_points; number++)
215 *cPtr++ = Q_rsqrt(*aPtr++);
216}
217#endif /* LV_HAVE_AVX */
218
219#endif /* INCLUDED_volk_32f_invsqrt_32f_a_H */
static void volk_32f_invsqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:156
static void volk_32f_invsqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:96
static void volk_32f_invsqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:180
static void volk_32f_invsqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:123
static void volk_32f_invsqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
Definition: volk_32f_invsqrt_32f.h:197
static float Q_rsqrt(float number)
Definition: volk_32f_invsqrt_32f.h:73
for i
Definition: volk_config_fixed.tmpl.h:25