Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_x2_divide_32fc.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2016 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
72#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
73#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
74
75#include <float.h>
76#include <inttypes.h>
77#include <volk/volk_complex.h>
78
79
80#ifdef LV_HAVE_GENERIC
81
82static inline void volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector,
83 const lv_32fc_t* aVector,
84 const lv_32fc_t* bVector,
85 unsigned int num_points)
86{
87 lv_32fc_t* cPtr = cVector;
88 const lv_32fc_t* aPtr = aVector;
89 const lv_32fc_t* bPtr = bVector;
90
91 for (unsigned int number = 0; number < num_points; number++) {
92 *cPtr++ = (*aPtr++) / (*bPtr++);
93 }
94}
95#endif /* LV_HAVE_GENERIC */
96
97
98#ifdef LV_HAVE_SSE3
99#include <pmmintrin.h>
101
102static inline void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t* cVector,
103 const lv_32fc_t* numeratorVector,
104 const lv_32fc_t* denumeratorVector,
105 unsigned int num_points)
106{
107 /*
108 * we'll do the "classical"
109 * a a b*
110 * --- = -------
111 * b |b|^2
112 * */
113 unsigned int number = 0;
114 const unsigned int quarterPoints = num_points / 4;
115
116 __m128 num01, num23, den01, den23, norm, result;
117 lv_32fc_t* c = cVector;
118 const lv_32fc_t* a = numeratorVector;
119 const lv_32fc_t* b = denumeratorVector;
120
121 for (; number < quarterPoints; number++) {
122 num01 = _mm_loadu_ps((float*)a); // first pair
123 den01 = _mm_loadu_ps((float*)b); // first pair
124 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
125 a += 2;
126 b += 2;
127
128 num23 = _mm_loadu_ps((float*)a); // second pair
129 den23 = _mm_loadu_ps((float*)b); // second pair
130 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
131 a += 2;
132 b += 2;
133
134 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
135 den01 = _mm_unpacklo_ps(norm, norm);
136 den23 = _mm_unpackhi_ps(norm, norm);
137
138 result = _mm_div_ps(num01, den01);
139 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
140 c += 2;
141 result = _mm_div_ps(num23, den23);
142 _mm_storeu_ps((float*)c, result); // Store the results back into the C container
143 c += 2;
144 }
145
146 number *= 4;
147 for (; number < num_points; number++) {
148 *c = (*a) / (*b);
149 a++;
150 b++;
151 c++;
152 }
153}
154#endif /* LV_HAVE_SSE3 */
155
156
157#ifdef LV_HAVE_AVX
158#include <immintrin.h>
160
161static inline void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector,
162 const lv_32fc_t* numeratorVector,
163 const lv_32fc_t* denumeratorVector,
164 unsigned int num_points)
165{
166 /*
167 * we'll do the "classical"
168 * a a b*
169 * --- = -------
170 * b |b|^2
171 * */
172 unsigned int number = 0;
173 const unsigned int quarterPoints = num_points / 4;
174
175 __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
176 lv_32fc_t* c = cVector;
177 const lv_32fc_t* a = numeratorVector;
178 const lv_32fc_t* b = denumeratorVector;
179
180 for (; number < quarterPoints; number++) {
181 num = _mm256_loadu_ps(
182 (float*)a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
183 denum = _mm256_loadu_ps(
184 (float*)b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
185 mul_conj = _mm256_complexconjugatemul_ps(num, denum);
186 sq = _mm256_mul_ps(denum, denum); // Square the values
187 mag_sq_un = _mm256_hadd_ps(
188 sq, sq); // obtain the actual squared magnitude, although out of order
189 mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // I order them
190 // best guide I found on using these functions:
191 // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
192 div = _mm256_div_ps(mul_conj, mag_sq);
193
194 _mm256_storeu_ps((float*)c, div); // Store the results back into the C container
195
196 a += 4;
197 b += 4;
198 c += 4;
199 }
200
201 number = quarterPoints * 4;
202
203 for (; number < num_points; number++) {
204 *c++ = (*a++) / (*b++);
205 }
206}
207#endif /* LV_HAVE_AVX */
208
209
210#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
211
212
213#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
214#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
215
216#include <float.h>
217#include <inttypes.h>
218#include <stdio.h>
219#include <volk/volk_complex.h>
220
221#ifdef LV_HAVE_SSE3
222#include <pmmintrin.h>
224
225static inline void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t* cVector,
226 const lv_32fc_t* numeratorVector,
227 const lv_32fc_t* denumeratorVector,
228 unsigned int num_points)
229{
230 /*
231 * we'll do the "classical"
232 * a a b*
233 * --- = -------
234 * b |b|^2
235 * */
236 unsigned int number = 0;
237 const unsigned int quarterPoints = num_points / 4;
238
239 __m128 num01, num23, den01, den23, norm, result;
240 lv_32fc_t* c = cVector;
241 const lv_32fc_t* a = numeratorVector;
242 const lv_32fc_t* b = denumeratorVector;
243
244 for (; number < quarterPoints; number++) {
245 num01 = _mm_load_ps((float*)a); // first pair
246 den01 = _mm_load_ps((float*)b); // first pair
247 num01 = _mm_complexconjugatemul_ps(num01, den01); // a conj(b)
248 a += 2;
249 b += 2;
250
251 num23 = _mm_load_ps((float*)a); // second pair
252 den23 = _mm_load_ps((float*)b); // second pair
253 num23 = _mm_complexconjugatemul_ps(num23, den23); // a conj(b)
254 a += 2;
255 b += 2;
256
257 norm = _mm_magnitudesquared_ps_sse3(den01, den23);
258
259 den01 = _mm_unpacklo_ps(norm, norm); // select the lower floats twice
260 den23 = _mm_unpackhi_ps(norm, norm); // select the upper floats twice
261
262 result = _mm_div_ps(num01, den01);
263 _mm_store_ps((float*)c, result); // Store the results back into the C container
264 c += 2;
265 result = _mm_div_ps(num23, den23);
266 _mm_store_ps((float*)c, result); // Store the results back into the C container
267 c += 2;
268 }
269
270 number *= 4;
271 for (; number < num_points; number++) {
272 *c = (*a) / (*b);
273 a++;
274 b++;
275 c++;
276 }
277}
278#endif /* LV_HAVE_SSE */
279
280#ifdef LV_HAVE_AVX
281#include <immintrin.h>
283
284static inline void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector,
285 const lv_32fc_t* numeratorVector,
286 const lv_32fc_t* denumeratorVector,
287 unsigned int num_points)
288{
289 /*
290 * Guide to AVX intrisics:
291 * https://software.intel.com/sites/landingpage/IntrinsicsGuide/#
292 *
293 * we'll do the "classical"
294 * a a b*
295 * --- = -------
296 * b |b|^2
297 *
298 */
299 lv_32fc_t* c = cVector;
300 const lv_32fc_t* a = numeratorVector;
301 const lv_32fc_t* b = denumeratorVector;
302
303 const unsigned int eigthPoints = num_points / 8;
304
305 __m256 num01, num23, denum01, denum23, complex_result, result0, result1;
306
307 for (unsigned int number = 0; number < eigthPoints; number++) {
308 // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
309 num01 = _mm256_load_ps((float*)a);
310 denum01 = _mm256_load_ps((float*)b);
311
312 num01 = _mm256_complexconjugatemul_ps(num01, denum01);
313 a += 4;
314 b += 4;
315
316 num23 = _mm256_load_ps((float*)a);
317 denum23 = _mm256_load_ps((float*)b);
318 num23 = _mm256_complexconjugatemul_ps(num23, denum23);
319 a += 4;
320 b += 4;
321
322 complex_result = _mm256_hadd_ps(_mm256_mul_ps(denum01, denum01),
323 _mm256_mul_ps(denum23, denum23));
324
325 denum01 = _mm256_shuffle_ps(complex_result, complex_result, 0x50);
326 denum23 = _mm256_shuffle_ps(complex_result, complex_result, 0xfa);
327
328 result0 = _mm256_div_ps(num01, denum01);
329 result1 = _mm256_div_ps(num23, denum23);
330
331 _mm256_store_ps((float*)c, result0);
332 c += 4;
333 _mm256_store_ps((float*)c, result1);
334 c += 4;
335 }
336
337 volk_32fc_x2_divide_32fc_generic(c, a, b, num_points - eigthPoints * 8);
338}
339#endif /* LV_HAVE_AVX */
340
341
342#ifdef LV_HAVE_NEON
343#include <arm_neon.h>
344
345static inline void volk_32fc_x2_divide_32fc_neon(lv_32fc_t* cVector,
346 const lv_32fc_t* aVector,
347 const lv_32fc_t* bVector,
348 unsigned int num_points)
349{
350 lv_32fc_t* cPtr = cVector;
351 const lv_32fc_t* aPtr = aVector;
352 const lv_32fc_t* bPtr = bVector;
353
354 float32x4x2_t aVal, bVal, cVal;
355 float32x4_t bAbs, bAbsInv;
356
357 const unsigned int quarterPoints = num_points / 4;
358 unsigned int number = 0;
359 for (; number < quarterPoints; number++) {
360 aVal = vld2q_f32((const float*)(aPtr));
361 bVal = vld2q_f32((const float*)(bPtr));
362 aPtr += 4;
363 bPtr += 4;
364 __VOLK_PREFETCH(aPtr + 4);
365 __VOLK_PREFETCH(bPtr + 4);
366
367 bAbs = vmulq_f32(bVal.val[0], bVal.val[0]);
368 bAbs = vmlaq_f32(bAbs, bVal.val[1], bVal.val[1]);
369
370 bAbsInv = vrecpeq_f32(bAbs);
371 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
372 bAbsInv = vmulq_f32(bAbsInv, vrecpsq_f32(bAbsInv, bAbs));
373
374 cVal.val[0] = vmulq_f32(aVal.val[0], bVal.val[0]);
375 cVal.val[0] = vmlaq_f32(cVal.val[0], aVal.val[1], bVal.val[1]);
376 cVal.val[0] = vmulq_f32(cVal.val[0], bAbsInv);
377
378 cVal.val[1] = vmulq_f32(aVal.val[1], bVal.val[0]);
379 cVal.val[1] = vmlsq_f32(cVal.val[1], aVal.val[0], bVal.val[1]);
380 cVal.val[1] = vmulq_f32(cVal.val[1], bAbsInv);
381
382 vst2q_f32((float*)(cPtr), cVal);
383 cPtr += 4;
384 }
385
386 for (number = quarterPoints * 4; number < num_points; number++) {
387 *cPtr++ = (*aPtr++) / (*bPtr++);
388 }
389}
390#endif /* LV_HAVE_NEON */
391
392
393#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
static void volk_32fc_x2_divide_32fc_a_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:225
static void volk_32fc_x2_divide_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:82
static void volk_32fc_x2_divide_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const lv_32fc_t *bVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:345
static void volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:161
static void volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:284
static void volk_32fc_x2_divide_32fc_u_sse3(lv_32fc_t *cVector, const lv_32fc_t *numeratorVector, const lv_32fc_t *denumeratorVector, unsigned int num_points)
Definition: volk_32fc_x2_divide_32fc.h:102
static __m256 _mm256_complexconjugatemul_ps(const __m256 x, const __m256 y)
Definition: volk_avx_intrinsics.h:51
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62
float complex lv_32fc_t
Definition: volk_complex.h:65
static __m128 _mm_magnitudesquared_ps_sse3(__m128 cplxValue1, __m128 cplxValue2)
Definition: volk_sse3_intrinsics.h:51
static __m128 _mm_complexconjugatemul_ps(__m128 x, __m128 y)
Definition: volk_sse3_intrinsics.h:44