/*
 * Vector Optimized Library of Kernels 2.5.1
 * Architecture-tuned implementations of math kernels
 * volk_32f_x2_max_32f.h
 * Go to the documentation of this file.
 */
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
71#ifndef INCLUDED_volk_32f_x2_max_32f_a_H
72#define INCLUDED_volk_32f_x2_max_32f_a_H
73
74#include <inttypes.h>
75#include <stdio.h>
76
77#ifdef LV_HAVE_AVX512F
78#include <immintrin.h>
79
/*!
 * \brief Element-wise maximum of two float vectors (AVX-512F, aligned buffers).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 16 floats are processed with
 * _mm512_max_ps; the remaining num_points % 16 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 *
 * The vector loop uses _mm512_load_ps / _mm512_store_ps, so all three
 * buffers must be 64-byte aligned.
 */
static inline void volk_32f_x2_max_32f_a_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_load_ps(aPtr);
        bVal = _mm512_load_ps(bPtr);

        cVal = _mm512_max_ps(aVal, bVal);

        _mm512_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    // Scalar tail: elements that did not fill a whole 16-float vector.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
113#endif /* LV_HAVE_AVX512F */
114
115#ifdef LV_HAVE_SSE
116#include <xmmintrin.h>
117
/*!
 * \brief Element-wise maximum of two float vectors (SSE, aligned buffers).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 4 floats are processed with
 * _mm_max_ps; the remaining num_points % 4 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 *
 * The vector loop uses _mm_load_ps / _mm_store_ps, so all three buffers
 * must be 16-byte aligned.
 */
static inline void volk_32f_x2_max_32f_a_sse(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal, bVal, cVal;
    for (; number < quarterPoints; number++) {
        aVal = _mm_load_ps(aPtr);
        bVal = _mm_load_ps(bPtr);

        cVal = _mm_max_ps(aVal, bVal);

        _mm_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: elements that did not fill a whole 4-float vector.
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
151#endif /* LV_HAVE_SSE */
152
153#ifdef LV_HAVE_AVX
154#include <immintrin.h>
155
/*!
 * \brief Element-wise maximum of two float vectors (AVX, aligned buffers).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 8 floats are processed with
 * _mm256_max_ps; the remaining num_points % 8 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 *
 * The vector loop uses _mm256_load_ps / _mm256_store_ps, so all three
 * buffers must be 32-byte aligned.
 */
static inline void volk_32f_x2_max_32f_a_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_load_ps(aPtr);
        bVal = _mm256_load_ps(bPtr);

        cVal = _mm256_max_ps(aVal, bVal);

        _mm256_store_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail: elements that did not fill a whole 8-float vector.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
189#endif /* LV_HAVE_AVX */
190
191#ifdef LV_HAVE_NEON
192#include <arm_neon.h>
193
/*!
 * \brief Element-wise maximum of two float vectors (ARM NEON).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 4 floats are processed with
 * vmaxq_f32; the remaining num_points % 4 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_max_32f_neon(float* cVector,
                                            const float* aVector,
                                            const float* bVector,
                                            unsigned int num_points)
{
    unsigned int quarter_points = num_points / 4;
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    float32x4_t a_vec, b_vec, c_vec;
    for (number = 0; number < quarter_points; number++) {
        a_vec = vld1q_f32(aPtr);
        b_vec = vld1q_f32(bPtr);
        c_vec = vmaxq_f32(a_vec, b_vec);
        vst1q_f32(cPtr, c_vec);
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    // Scalar tail: elements that did not fill a whole 4-float vector.
    for (number = quarter_points * 4; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
222#endif /* LV_HAVE_NEON */
223
224
225#ifdef LV_HAVE_GENERIC
226
/*!
 * \brief Element-wise maximum of two float vectors (portable scalar version).
 *
 * For every i in [0, num_points), writes aVector[i] to cVector[i] when
 * aVector[i] > bVector[i], and bVector[i] otherwise. Nothing is read or
 * written when num_points is 0.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_max_32f_generic(float* cVector,
                                               const float* aVector,
                                               const float* bVector,
                                               unsigned int num_points)
{
    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
243#endif /* LV_HAVE_GENERIC */
244
245#ifdef LV_HAVE_ORC
/* Declared here only; the definition lives outside this header. */
extern void volk_32f_x2_max_32f_a_orc_impl(float* cVector,
                                           const float* aVector,
                                           const float* bVector,
                                           unsigned int num_points);

/*!
 * \brief ORC entry point for the element-wise float maximum kernel.
 *
 * Thin wrapper that forwards all four arguments unchanged to
 * volk_32f_x2_max_32f_a_orc_impl.
 *
 * \param cVector    output buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void volk_32f_x2_max_32f_u_orc(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    volk_32f_x2_max_32f_a_orc_impl(cVector, aVector, bVector, num_points);
}
258#endif /* LV_HAVE_ORC */
259
260
261#endif /* INCLUDED_volk_32f_x2_max_32f_a_H */
262
263
264#ifndef INCLUDED_volk_32f_x2_max_32f_u_H
265#define INCLUDED_volk_32f_x2_max_32f_u_H
266
267#include <inttypes.h>
268#include <stdio.h>
269
270#ifdef LV_HAVE_AVX512F
271#include <immintrin.h>
272
/*!
 * \brief Element-wise maximum of two float vectors (AVX-512F, unaligned buffers).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 16 floats are processed with
 * _mm512_max_ps; the remaining num_points % 16 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 *
 * Memory is accessed with _mm512_loadu_ps / _mm512_storeu_ps, so the
 * buffers do not need any particular alignment.
 */
static inline void volk_32f_x2_max_32f_u_avx512f(float* cVector,
                                                 const float* aVector,
                                                 const float* bVector,
                                                 unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int sixteenthPoints = num_points / 16;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m512 aVal, bVal, cVal;
    for (; number < sixteenthPoints; number++) {
        aVal = _mm512_loadu_ps(aPtr);
        bVal = _mm512_loadu_ps(bPtr);

        cVal = _mm512_max_ps(aVal, bVal);

        _mm512_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 16;
        bPtr += 16;
        cPtr += 16;
    }

    // Scalar tail: elements that did not fill a whole 16-float vector.
    number = sixteenthPoints * 16;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
306#endif /* LV_HAVE_AVX512F */
307
308#ifdef LV_HAVE_AVX
309#include <immintrin.h>
310
/*!
 * \brief Element-wise maximum of two float vectors (AVX, unaligned buffers).
 *
 * Stores the larger of aVector[i] and bVector[i] in cVector[i] for every
 * i in [0, num_points). Complete groups of 8 floats are processed with
 * _mm256_max_ps; the remaining num_points % 8 elements are handled by a
 * scalar loop.
 *
 * \param cVector    output buffer, at least num_points floats
 * \param aVector    first input buffer, at least num_points floats
 * \param bVector    second input buffer, at least num_points floats
 * \param num_points number of elements to process
 *
 * Memory is accessed with _mm256_loadu_ps / _mm256_storeu_ps, so the
 * buffers do not need any particular alignment.
 */
static inline void volk_32f_x2_max_32f_u_avx(float* cVector,
                                             const float* aVector,
                                             const float* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal, bVal, cVal;
    for (; number < eighthPoints; number++) {
        aVal = _mm256_loadu_ps(aPtr);
        bVal = _mm256_loadu_ps(bPtr);

        cVal = _mm256_max_ps(aVal, bVal);

        _mm256_storeu_ps(cPtr, cVal); // Store the results back into the C container

        aPtr += 8;
        bPtr += 8;
        cPtr += 8;
    }

    // Scalar tail: elements that did not fill a whole 8-float vector.
    number = eighthPoints * 8;
    for (; number < num_points; number++) {
        const float a = *aPtr++;
        const float b = *bPtr++;
        *cPtr++ = (a > b ? a : b);
    }
}
344#endif /* LV_HAVE_AVX */
345
346#endif /* INCLUDED_volk_32f_x2_max_32f_u_H */
/*
 * Symbol index from the generated documentation page:
 *
 * static void volk_32f_x2_max_32f_u_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_max_32f.h:311
 * static void volk_32f_x2_max_32f_a_avx(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_max_32f.h:156
 * static void volk_32f_x2_max_32f_neon(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_max_32f.h:194
 * static void volk_32f_x2_max_32f_generic(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_max_32f.h:227
 * static void volk_32f_x2_max_32f_a_sse(float *cVector, const float *aVector, const float *bVector, unsigned int num_points)
 *     Definition: volk_32f_x2_max_32f.h:118
 */