Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_s32f_add_32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2020 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
68#include <inttypes.h>
69#include <stdio.h>
70
71#ifdef LV_HAVE_GENERIC
72
/*
 * Portable C kernel: adds `scalar` to every element of a float vector.
 *
 * Computes cVector[i] = aVector[i] + scalar for each i in [0, num_points).
 * Each element is read before the same index is written, so cVector may be
 * the same buffer as aVector.
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process; 0 leaves cVector untouched
 */
static inline void volk_32f_s32f_add_32f_generic(float* cVector,
                                                 const float* aVector,
                                                 const float scalar,
                                                 unsigned int num_points)
{
    const float* inputPtr = aVector;
    float* outputPtr = cVector;
    unsigned int number;

    for (number = 0; number < num_points; number++) {
        *outputPtr++ = *inputPtr++ + scalar;
    }
}
87
88#endif /* LV_HAVE_GENERIC */
89#ifndef INCLUDED_volk_32f_s32f_add_32f_u_H
90#define INCLUDED_volk_32f_s32f_add_32f_u_H
91
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE kernel for unaligned buffers: cVector[i] = aVector[i] + scalar.
 *
 * Processes four floats per iteration using unaligned loads and stores, so
 * neither buffer needs any particular alignment. The num_points % 4 elements
 * left after the last full group are handed to
 * volk_32f_s32f_add_32f_generic().
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process
 */
static inline void volk_32f_s32f_add_32f_u_sse(float* cVector,
                                               const float* aVector,
                                               const float scalar,
                                               unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all four lanes once, outside the loop. */
    const __m128 bVal = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);

        _mm_storeu_ps(cPtr, _mm_add_ps(aVal, bVal));

        aPtr += 4;
        cPtr += 4;
    }

    /* aPtr/cPtr now point just past the last full group of four. */
    volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points % 4);
}
#endif /* LV_HAVE_SSE */
123
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel for unaligned buffers: cVector[i] = aVector[i] + scalar.
 *
 * Processes eight floats per iteration using unaligned loads and stores, so
 * neither buffer needs any particular alignment. The num_points % 8 elements
 * left after the last full group are handed to
 * volk_32f_s32f_add_32f_generic().
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process
 */
static inline void volk_32f_s32f_add_32f_u_avx(float* cVector,
                                               const float* aVector,
                                               const float scalar,
                                               unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all eight lanes once, outside the loop. */
    const __m256 bVal = _mm256_set1_ps(scalar);

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);

        _mm256_storeu_ps(cPtr, _mm256_add_ps(aVal, bVal));

        aPtr += 8;
        cPtr += 8;
    }

    /* aPtr/cPtr now point just past the last full group of eight. */
    volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points % 8);
}
#endif /* LV_HAVE_AVX */
156
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*
 * NEON kernel: cVector[i] = aVector[i] + scalar.
 *
 * Processes four floats per iteration with vld1q_f32 / vaddq_f32 / vst1q_f32.
 * The num_points % 4 elements left after the last full group are handed to
 * volk_32f_s32f_add_32f_generic().
 *
 * cVector     output buffer; num_points floats are written
 * aVector     input buffer; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process
 */
static inline void volk_32f_s32f_add_32f_u_neon(float* cVector,
                                                const float* aVector,
                                                const float scalar,
                                                unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    /* Broadcast the scalar into all four lanes once, outside the loop. */
    const float32x4_t scalarvec = vdupq_n_f32(scalar);

    for (number = 0; number < quarterPoints; number++) {
        const float32x4_t aVal = vld1q_f32(inputPtr); /* load four inputs */

        vst1q_f32(outputPtr, vaddq_f32(aVal, scalarvec)); /* add and store */

        inputPtr += 4;
        outputPtr += 4;
    }

    /* inputPtr/outputPtr now point just past the last full group of four. */
    volk_32f_s32f_add_32f_generic(outputPtr, inputPtr, scalar, num_points % 4);
}
#endif /* LV_HAVE_NEON */
186
187
188#endif /* INCLUDED_volk_32f_s32f_add_32f_u_H */
189
190
191#ifndef INCLUDED_volk_32f_s32f_add_32f_a_H
192#define INCLUDED_volk_32f_s32f_add_32f_a_H
193
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*
 * SSE kernel for aligned buffers: cVector[i] = aVector[i] + scalar.
 *
 * Processes four floats per iteration using _mm_load_ps / _mm_store_ps, which
 * require 16-byte aligned addresses; both cVector and aVector must therefore
 * be 16-byte aligned on entry (the pointers advance 16 bytes per iteration,
 * so alignment is kept). The num_points % 4 elements left after the last full
 * group are handed to volk_32f_s32f_add_32f_generic().
 *
 * cVector     output buffer, 16-byte aligned; num_points floats are written
 * aVector     input buffer, 16-byte aligned; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process
 */
static inline void volk_32f_s32f_add_32f_a_sse(float* cVector,
                                               const float* aVector,
                                               const float scalar,
                                               unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all four lanes once, outside the loop. */
    const __m128 bVal = _mm_set_ps1(scalar);

    for (number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);

        _mm_store_ps(cPtr, _mm_add_ps(aVal, bVal));

        aPtr += 4;
        cPtr += 4;
    }

    /* aPtr/cPtr now point just past the last full group of four. */
    volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points % 4);
}
#endif /* LV_HAVE_SSE */
225
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*
 * AVX kernel for aligned buffers: cVector[i] = aVector[i] + scalar.
 *
 * Processes eight floats per iteration using _mm256_load_ps /
 * _mm256_store_ps, which require 32-byte aligned addresses; both cVector and
 * aVector must therefore be 32-byte aligned on entry (the pointers advance
 * 32 bytes per iteration, so alignment is kept). The num_points % 8 elements
 * left after the last full group are handed to
 * volk_32f_s32f_add_32f_generic().
 *
 * cVector     output buffer, 32-byte aligned; num_points floats are written
 * aVector     input buffer, 32-byte aligned; num_points floats are read
 * scalar      value added to each input element
 * num_points  number of elements to process
 */
static inline void volk_32f_s32f_add_32f_a_avx(float* cVector,
                                               const float* aVector,
                                               const float scalar,
                                               unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    unsigned int number;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all eight lanes once, outside the loop. */
    const __m256 bVal = _mm256_set1_ps(scalar);

    for (number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);

        _mm256_store_ps(cPtr, _mm256_add_ps(aVal, bVal));

        aPtr += 8;
        cPtr += 8;
    }

    /* aPtr/cPtr now point just past the last full group of eight. */
    volk_32f_s32f_add_32f_generic(cPtr, aPtr, scalar, num_points % 8);
}
#endif /* LV_HAVE_AVX */
257
#ifdef LV_HAVE_ORC

/*
 * Implemented outside this header (presumably the ORC-generated kernel;
 * confirm against the build).
 */
extern void volk_32f_s32f_add_32f_a_orc_impl(float* dst,
                                             const float* src,
                                             const float scalar,
                                             unsigned int num_points);

/*
 * Thin wrapper: forwards all four arguments unchanged to
 * volk_32f_s32f_add_32f_a_orc_impl().
 *
 * NOTE(review): this wrapper is named "_u_" but sits in the aligned section
 * of the header and calls the "_a_" implementation. Confirm whether the ORC
 * kernel tolerates unaligned buffers.
 *
 * cVector     output buffer, passed as dst
 * aVector     input buffer, passed as src
 * scalar      passed through unchanged
 * num_points  passed through unchanged
 */
static inline void volk_32f_s32f_add_32f_u_orc(float* cVector,
                                               const float* aVector,
                                               const float scalar,
                                               unsigned int num_points)
{
    volk_32f_s32f_add_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
#endif /* LV_HAVE_ORC */
273
274#endif /* INCLUDED_volk_32f_s32f_add_32f_a_H */
static void volk_32f_s32f_add_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:160
static void volk_32f_s32f_add_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:73
static void volk_32f_s32f_add_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:95
static void volk_32f_s32f_add_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:197
static void volk_32f_s32f_add_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:127
static void volk_32f_s32f_add_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_add_32f.h:229