Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32i_x2_or_32i.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
81#ifndef INCLUDED_volk_32i_x2_or_32i_a_H
82#define INCLUDED_volk_32i_x2_or_32i_a_H
83
84#include <inttypes.h>
85#include <stdio.h>
86
87#ifdef LV_HAVE_AVX512F
88#include <immintrin.h>
89
90static inline void volk_32i_x2_or_32i_a_avx512f(int32_t* cVector,
91 const int32_t* aVector,
92 const int32_t* bVector,
93 unsigned int num_points)
94{
95 unsigned int number = 0;
96 const unsigned int sixteenthPoints = num_points / 16;
97
98 int32_t* cPtr = (int32_t*)cVector;
99 const int32_t* aPtr = (int32_t*)aVector;
100 const int32_t* bPtr = (int32_t*)bVector;
101
102 __m512i aVal, bVal, cVal;
103 for (; number < sixteenthPoints; number++) {
104
105 aVal = _mm512_load_si512(aPtr);
106 bVal = _mm512_load_si512(bPtr);
107
108 cVal = _mm512_or_si512(aVal, bVal);
109
110 _mm512_store_si512(cPtr, cVal); // Store the results back into the C container
111
112 aPtr += 16;
113 bPtr += 16;
114 cPtr += 16;
115 }
116
117 number = sixteenthPoints * 16;
118 for (; number < num_points; number++) {
119 cVector[number] = aVector[number] | bVector[number];
120 }
121}
122#endif /* LV_HAVE_AVX512F */
123
124#ifdef LV_HAVE_AVX2
125#include <immintrin.h>
126
127static inline void volk_32i_x2_or_32i_a_avx2(int32_t* cVector,
128 const int32_t* aVector,
129 const int32_t* bVector,
130 unsigned int num_points)
131{
132 unsigned int number = 0;
133 const unsigned int oneEightPoints = num_points / 8;
134
135 int32_t* cPtr = cVector;
136 const int32_t* aPtr = aVector;
137 const int32_t* bPtr = bVector;
138
139 __m256i aVal, bVal, cVal;
140 for (; number < oneEightPoints; number++) {
141
142 aVal = _mm256_load_si256((__m256i*)aPtr);
143 bVal = _mm256_load_si256((__m256i*)bPtr);
144
145 cVal = _mm256_or_si256(aVal, bVal);
146
147 _mm256_store_si256((__m256i*)cPtr,
148 cVal); // Store the results back into the C container
149
150 aPtr += 8;
151 bPtr += 8;
152 cPtr += 8;
153 }
154
155 number = oneEightPoints * 8;
156 for (; number < num_points; number++) {
157 cVector[number] = aVector[number] | bVector[number];
158 }
159}
160#endif /* LV_HAVE_AVX2 */
161
162
163#ifdef LV_HAVE_SSE
164#include <xmmintrin.h>
165
/*!
 * \brief Element-wise bitwise OR of two int32 vectors (SSE, aligned access).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * The integer data is moved through the packed-float intrinsics
 * (_mm_load_ps / _mm_or_ps / _mm_store_ps); the OR is purely bitwise, so the
 * integer bit patterns are combined unchanged. The aligned load/store
 * intrinsics expect 16-byte aligned addresses. The remaining
 * (num_points % 4) elements are processed one at a time.
 *
 * \param cVector    destination buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void volk_32i_x2_or_32i_a_sse(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    /* First index that no longer belongs to a complete 4-element group. */
    const unsigned int simd_end = num_points - (num_points % 4);
    unsigned int i;

    for (i = 0; i < simd_end; i += 4) {
        const __m128 lhs = _mm_load_ps((const float*)(aVector + i));
        const __m128 rhs = _mm_load_ps((const float*)(bVector + i));
        _mm_store_ps((float*)(cVector + i), _mm_or_ps(lhs, rhs));
    }

    /* Scalar tail: i == simd_end here. */
    for (; i < num_points; ++i) {
        cVector[i] = aVector[i] | bVector[i];
    }
}
197#endif /* LV_HAVE_SSE */
198
199
200#ifdef LV_HAVE_NEON
201#include <arm_neon.h>
202
/*!
 * \brief Element-wise bitwise OR of two int32 vectors (NEON).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Whole groups of 4 elements are processed with vld1q_s32 / vorrq_s32 /
 * vst1q_s32; the remaining (num_points % 4) elements are processed one at a
 * time.
 *
 * \param cVector    destination buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void volk_32i_x2_or_32i_neon(int32_t* cVector,
                                           const int32_t* aVector,
                                           const int32_t* bVector,
                                           unsigned int num_points)
{
    /* First index that no longer belongs to a complete 4-element group. */
    const unsigned int simd_end = num_points - (num_points % 4);
    unsigned int i;

    for (i = 0; i < simd_end; i += 4) {
        const int32x4_t lhs = vld1q_s32(aVector + i);
        const int32x4_t rhs = vld1q_s32(bVector + i);
        vst1q_s32(cVector + i, vorrq_s32(lhs, rhs));
    }

    /* Scalar tail: i == simd_end here. */
    for (; i < num_points; ++i) {
        cVector[i] = aVector[i] | bVector[i];
    }
}
230#endif /* LV_HAVE_NEON */
231
232
233#ifdef LV_HAVE_GENERIC
234
/*!
 * \brief Element-wise bitwise OR of two int32 vectors (portable C version).
 *
 * Writes cVector[i] = aVector[i] | bVector[i] for every i in [0, num_points).
 * Nothing is read or written when num_points is 0.
 *
 * \param cVector    destination buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void volk_32i_x2_or_32i_generic(int32_t* cVector,
                                              const int32_t* aVector,
                                              const int32_t* bVector,
                                              unsigned int num_points)
{
    unsigned int i;

    for (i = 0; i < num_points; ++i) {
        cVector[i] = aVector[i] | bVector[i];
    }
}
249#endif /* LV_HAVE_GENERIC */
250
251
252#ifdef LV_HAVE_ORC
/* Defined outside this header; only its prototype is visible here. */
extern void volk_32i_x2_or_32i_a_orc_impl(int32_t* cVector,
                                          const int32_t* aVector,
                                          const int32_t* bVector,
                                          unsigned int num_points);

/*!
 * \brief ORC entry point for the int32 bitwise-OR kernel.
 *
 * Forwards all four arguments, unchanged and in the same order, to
 * volk_32i_x2_or_32i_a_orc_impl.
 *
 * \param cVector    destination buffer
 * \param aVector    first input buffer
 * \param bVector    second input buffer
 * \param num_points number of elements to process
 */
static inline void volk_32i_x2_or_32i_u_orc(int32_t* cVector,
                                            const int32_t* aVector,
                                            const int32_t* bVector,
                                            unsigned int num_points)
{
    volk_32i_x2_or_32i_a_orc_impl(cVector, aVector, bVector, num_points);
}
265#endif /* LV_HAVE_ORC */
266
267
268#endif /* INCLUDED_volk_32i_x2_or_32i_a_H */
269
270
271#ifndef INCLUDED_volk_32i_x2_or_32i_u_H
272#define INCLUDED_volk_32i_x2_or_32i_u_H
273
274#include <inttypes.h>
275#include <stdio.h>
276
277#ifdef LV_HAVE_AVX512F
278#include <immintrin.h>
279
280static inline void volk_32i_x2_or_32i_u_avx512f(int32_t* cVector,
281 const int32_t* aVector,
282 const int32_t* bVector,
283 unsigned int num_points)
284{
285 unsigned int number = 0;
286 const unsigned int sixteenthPoints = num_points / 16;
287
288 int32_t* cPtr = (int32_t*)cVector;
289 const int32_t* aPtr = (int32_t*)aVector;
290 const int32_t* bPtr = (int32_t*)bVector;
291
292 __m512i aVal, bVal, cVal;
293 for (; number < sixteenthPoints; number++) {
294
295 aVal = _mm512_loadu_si512(aPtr);
296 bVal = _mm512_loadu_si512(bPtr);
297
298 cVal = _mm512_or_si512(aVal, bVal);
299
300 _mm512_storeu_si512(cPtr, cVal); // Store the results back into the C container
301
302 aPtr += 16;
303 bPtr += 16;
304 cPtr += 16;
305 }
306
307 number = sixteenthPoints * 16;
308 for (; number < num_points; number++) {
309 cVector[number] = aVector[number] | bVector[number];
310 }
311}
312#endif /* LV_HAVE_AVX512F */
313
314#ifdef LV_HAVE_AVX2
315#include <immintrin.h>
316
317static inline void volk_32i_x2_or_32i_u_avx2(int32_t* cVector,
318 const int32_t* aVector,
319 const int32_t* bVector,
320 unsigned int num_points)
321{
322 unsigned int number = 0;
323 const unsigned int oneEightPoints = num_points / 8;
324
325 int32_t* cPtr = cVector;
326 const int32_t* aPtr = aVector;
327 const int32_t* bPtr = bVector;
328
329 __m256i aVal, bVal, cVal;
330 for (; number < oneEightPoints; number++) {
331
332 aVal = _mm256_loadu_si256((__m256i*)aPtr);
333 bVal = _mm256_loadu_si256((__m256i*)bPtr);
334
335 cVal = _mm256_or_si256(aVal, bVal);
336
337 _mm256_storeu_si256((__m256i*)cPtr,
338 cVal); // Store the results back into the C container
339
340 aPtr += 8;
341 bPtr += 8;
342 cPtr += 8;
343 }
344
345 number = oneEightPoints * 8;
346 for (; number < num_points; number++) {
347 cVector[number] = aVector[number] | bVector[number];
348 }
349}
350#endif /* LV_HAVE_AVX2 */
351
352
353#endif /* INCLUDED_volk_32i_x2_or_32i_u_H */
static void volk_32i_x2_or_32i_neon(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:203
static void volk_32i_x2_or_32i_generic(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:235
static void volk_32i_x2_or_32i_a_sse(int32_t *cVector, const int32_t *aVector, const int32_t *bVector, unsigned int num_points)
Definition: volk_32i_x2_or_32i.h:166