Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_x2_divide_32f.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_32f_x2_divide_32f_a_H
#define INCLUDED_volk_32f_x2_divide_32f_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_a_avx512f(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

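    // Divide 16 floats per iteration using 64-byte-aligned AVX-512 loads and
    // stores; any remainder (num_points not a multiple of 16) falls through
    // to the scalar tail loop below.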
    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_a_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */


#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

static inline void volk_32f_x2_divide_32f_a_sse(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_div_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_32f_x2_divide_32f_neon(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    float32x4x4_t aVal, bVal, bInv, cVal;

    const unsigned int sixteenthPoints = num_points / 16;
    unsigned int number = 0;
    for (; number < sixteenthPoints; number++) {
        aVal = vld4q_f32(aPtr);
        aPtr += 16;
        bVal = vld4q_f32(bPtr);
        bPtr += 16;

        __VOLK_PREFETCH(aPtr + 16);
        __VOLK_PREFETCH(bPtr + 16);

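        // NEON has no packed single-precision divide, so approximate 1/b with
        // vrecpeq_f32 and refine the estimate with two Newton-Raphson steps:
        // x <- x * (2 - b * x), where vrecpsq_f32(x, b) computes (2 - b * x).
        // Two refinements take the roughly 8-bit initial estimate close to
        // full single precision before multiplying by the numerator.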
        bInv.val[0] = vrecpeq_f32(bVal.val[0]);
        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
        bInv.val[0] = vmulq_f32(bInv.val[0], vrecpsq_f32(bInv.val[0], bVal.val[0]));
        cVal.val[0] = vmulq_f32(aVal.val[0], bInv.val[0]);

        bInv.val[1] = vrecpeq_f32(bVal.val[1]);
        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
        bInv.val[1] = vmulq_f32(bInv.val[1], vrecpsq_f32(bInv.val[1], bVal.val[1]));
        cVal.val[1] = vmulq_f32(aVal.val[1], bInv.val[1]);

        bInv.val[2] = vrecpeq_f32(bVal.val[2]);
        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
        bInv.val[2] = vmulq_f32(bInv.val[2], vrecpsq_f32(bInv.val[2], bVal.val[2]));
        cVal.val[2] = vmulq_f32(aVal.val[2], bInv.val[2]);

        bInv.val[3] = vrecpeq_f32(bVal.val[3]);
        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
        bInv.val[3] = vmulq_f32(bInv.val[3], vrecpsq_f32(bInv.val[3], bVal.val[3]));
        cVal.val[3] = vmulq_f32(aVal.val[3], bInv.val[3]);

        vst4q_f32(cPtr, cVal);
        cPtr += 16;
    }

    for (number = sixteenthPoints * 16; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}

#endif /* LV_HAVE_NEON */


#ifdef LV_HAVE_GENERIC

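// Plain C fallback; it is always available and typically serves as the
// reference implementation for checking the SIMD kernels above.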
static inline void volk_32f_x2_divide_32f_generic(float* cVector,
                                                  const float* aVector,
                                                  const float* bVector,
                                                  unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */


#ifdef LV_HAVE_ORC

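// Declaration of the Orc-generated implementation (built from the kernel's
// .orc source at build time), plus a thin inline wrapper that forwards to it.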
extern void volk_32f_x2_divide_32f_a_orc_impl(float* cVector,
                                              const float* aVector,
                                              const float* bVector,
                                              unsigned int num_points);

static inline void volk_32f_x2_divide_32f_u_orc(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    volk_32f_x2_divide_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
#endif /* LV_HAVE_ORC */


#endif /* INCLUDED_volk_32f_x2_divide_32f_a_H */


#ifndef INCLUDED_volk_32f_x2_divide_32f_u_H
#define INCLUDED_volk_32f_x2_divide_32f_u_H

#include <inttypes.h>
#include <stdio.h>

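// The _u_ (unaligned) kernels below mirror the aligned versions in the first
// half of this header, but use unaligned loads and stores (loadu/storeu), so
// the input and output buffers need not be aligned to the SIMD register width.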
#ifdef LV_HAVE_AVX512F
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_u_avx512f(float* cVector,
                                                    const float* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_div_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX512F */


#ifdef LV_HAVE_AVX
#include <immintrin.h>

static inline void volk_32f_x2_divide_32f_u_avx(float* cVector,
                                                const float* aVector,
                                                const float* bVector,
                                                unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_div_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) / (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_32f_x2_divide_32f_u_H */
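
For context, here is a minimal sketch of how this kernel is typically invoked through the VOLK dispatcher rather than by calling one of the protokernels above directly. It assumes the standard VOLK public API (volk/volk.h, volk_32f_x2_divide_32f, volk_get_alignment, volk_malloc, volk_free), none of which is defined in this header, so treat it as an illustrative example rather than part of this file.

#include <stdio.h>
#include <volk/volk.h>

int main(void)
{
    const unsigned int N = 32;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns buffers aligned for the widest SIMD unit on this
     * machine, so the dispatcher can use the aligned (_a_) kernels. */
    float* a = (float*)volk_malloc(N * sizeof(float), alignment);
    float* b = (float*)volk_malloc(N * sizeof(float), alignment);
    float* c = (float*)volk_malloc(N * sizeof(float), alignment);

    for (unsigned int i = 0; i < N; i++) {
        a[i] = (float)i;       /* numerators 0, 1, 2, ... */
        b[i] = (float)(i + 1); /* denominators 1, 2, 3, ... (avoid divide by zero) */
    }

    /* c[i] = a[i] / b[i]; the dispatcher selects the fastest kernel
     * available for this CPU and buffer alignment. */
    volk_32f_x2_divide_32f(c, a, b, N);

    for (unsigned int i = 0; i < N; i++) {
        printf("c[%u] = %f\n", i, c[i]);
    }

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}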