/*
 * Vector Optimized Library of Kernels 2.5.1
 * Architecture-tuned implementations of math kernels
 * volk_32f_sqrt_32f.h
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_sqrt_32f_a_H
#define INCLUDED_volk_32f_sqrt_32f_a_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

75static inline void
76volk_32f_sqrt_32f_a_sse(float* cVector, const float* aVector, unsigned int num_points)
77{
78 unsigned int number = 0;
79 const unsigned int quarterPoints = num_points / 4;
80
81 float* cPtr = cVector;
82 const float* aPtr = aVector;
83
84 __m128 aVal, cVal;
85 for (; number < quarterPoints; number++) {
86 aVal = _mm_load_ps(aPtr);
87
88 cVal = _mm_sqrt_ps(aVal);
89
90 _mm_store_ps(cPtr, cVal); // Store the results back into the C container
91
92 aPtr += 4;
93 cPtr += 4;
94 }
95
96 number = quarterPoints * 4;
97 for (; number < num_points; number++) {
98 *cPtr++ = sqrtf(*aPtr++);
99 }
100}

#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_AVX
#include <immintrin.h>

107static inline void
108volk_32f_sqrt_32f_a_avx(float* cVector, const float* aVector, unsigned int num_points)
109{
110 unsigned int number = 0;
111 const unsigned int eighthPoints = num_points / 8;
112
113 float* cPtr = cVector;
114 const float* aPtr = aVector;
115
116 __m256 aVal, cVal;
117 for (; number < eighthPoints; number++) {
118 aVal = _mm256_load_ps(aPtr);
119
120 cVal = _mm256_sqrt_ps(aVal);
121
122 _mm256_store_ps(cPtr, cVal); // Store the results back into the C container
123
124 aPtr += 8;
125 cPtr += 8;
126 }
127
128 number = eighthPoints * 8;
129 for (; number < num_points; number++) {
130 *cPtr++ = sqrtf(*aPtr++);
131 }
132}

#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Computes the square root of every input float using NEON.
 *
 * Complete groups of four floats are processed with NEON; the remaining
 * num_points % 4 elements are computed with sqrtf().
 *
 * On AArch64 the vector path uses vsqrtq_f32. On 32-bit ARM, which has no
 * vector square-root instruction, sqrt(x) is approximated as
 * 1 / (1 / sqrt(x)) with the reciprocal-estimate instructions, so the vector
 * results there are approximate while the scalar tail is computed by sqrtf().
 *
 * \param cVector    Output buffer receiving num_points results.
 * \param aVector    Input buffer holding num_points floats.
 * \param num_points Number of floats to process; 0 is a no-op.
 */
static inline void
volk_32f_sqrt_32f_neon(float* cVector, const float* aVector, unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;
    float32x4_t in_vec, out_vec;

    for (number = 0; number < quarter_points; number++) {
        in_vec = vld1q_f32(aPtr);
#if defined(__aarch64__) || defined(_M_ARM64)
        // AArch64 has a real vector square root; prefer it over the estimates.
        out_vec = vsqrtq_f32(in_vec);
#else
        // ARMv7 fallback: reciprocal of the reciprocal-square-root estimate.
        out_vec = vrecpeq_f32(vrsqrteq_f32(in_vec));
#endif
        vst1q_f32(cPtr, out_vec);
        aPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: the elements that did not fill a whole 4-float vector.
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = sqrtf(*aPtr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

168static inline void
169volk_32f_sqrt_32f_generic(float* cVector, const float* aVector, unsigned int num_points)
170{
171 float* cPtr = cVector;
172 const float* aPtr = aVector;
173 unsigned int number = 0;
174
175 for (number = 0; number < num_points; number++) {
176 *cPtr++ = sqrtf(*aPtr++);
177 }
178}

#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_ORC

/* Kernel implemented outside this header; only its prototype is visible here. */
extern void volk_32f_sqrt_32f_a_orc_impl(float* cVector,
                                         const float* aVector,
                                         unsigned int num_points);

/*!
 * \brief Square root of every input float, delegated to the ORC kernel.
 *
 * Forwards all three arguments unchanged to volk_32f_sqrt_32f_a_orc_impl().
 *
 * \param cVector    Output buffer receiving num_points results.
 * \param aVector    Input buffer holding num_points floats.
 * \param num_points Number of floats to process.
 */
static inline void
volk_32f_sqrt_32f_u_orc(float* cVector, const float* aVector, unsigned int num_points)
{
    volk_32f_sqrt_32f_a_orc_impl(cVector, aVector, num_points);
}

#endif /* LV_HAVE_ORC */

#endif /* INCLUDED_volk_32f_sqrt_32f_a_H */

#ifndef INCLUDED_volk_32f_sqrt_32f_u_H
#define INCLUDED_volk_32f_sqrt_32f_u_H

#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#ifdef LV_HAVE_AVX
#include <immintrin.h>

206static inline void
207volk_32f_sqrt_32f_u_avx(float* cVector, const float* aVector, unsigned int num_points)
208{
209 unsigned int number = 0;
210 const unsigned int eighthPoints = num_points / 8;
211
212 float* cPtr = cVector;
213 const float* aPtr = aVector;
214
215 __m256 aVal, cVal;
216 for (; number < eighthPoints; number++) {
217 aVal = _mm256_loadu_ps(aPtr);
218
219 cVal = _mm256_sqrt_ps(aVal);
220
221 _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container
222
223 aPtr += 8;
224 cPtr += 8;
225 }
226
227 number = eighthPoints * 8;
228 for (; number < num_points; number++) {
229 *cPtr++ = sqrtf(*aPtr++);
230 }
231}

#endif /* LV_HAVE_AVX */
#endif /* INCLUDED_volk_32f_sqrt_32f_u_H */
/*
 * Cross-reference index (from the generated documentation page):
 *
 *   static void volk_32f_sqrt_32f_neon(float *cVector, const float *aVector, unsigned int num_points)
 *     Definition: volk_32f_sqrt_32f.h:141
 *   static void volk_32f_sqrt_32f_a_avx(float *cVector, const float *aVector, unsigned int num_points)
 *     Definition: volk_32f_sqrt_32f.h:108
 *   static void volk_32f_sqrt_32f_a_sse(float *cVector, const float *aVector, unsigned int num_points)
 *     Definition: volk_32f_sqrt_32f.h:76
 *   static void volk_32f_sqrt_32f_u_avx(float *cVector, const float *aVector, unsigned int num_points)
 *     Definition: volk_32f_sqrt_32f.h:207
 *   static void volk_32f_sqrt_32f_generic(float *cVector, const float *aVector, unsigned int num_points)
 *     Definition: volk_32f_sqrt_32f.h:169
 */