Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_s32f_multiply_32f.h
Go to the documentation of this file.
1/* -*- c++ -*- */
2/*
3 * Copyright 2012, 2014 Free Software Foundation, Inc.
4 *
5 * This file is part of GNU Radio
6 *
7 * GNU Radio is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 3, or (at your option)
10 * any later version.
11 *
12 * GNU Radio is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with GNU Radio; see the file COPYING. If not, write to
19 * the Free Software Foundation, Inc., 51 Franklin Street,
20 * Boston, MA 02110-1301, USA.
21 */
22
69#ifndef INCLUDED_volk_32f_s32f_multiply_32f_u_H
70#define INCLUDED_volk_32f_s32f_multiply_32f_u_H
71
72#include <inttypes.h>
73#include <stdio.h>
74
75#ifdef LV_HAVE_SSE
76#include <xmmintrin.h>
77
/*!
 * \brief Multiplies every element of a float buffer by a scalar (SSE, unaligned).
 *
 * Processes four floats per iteration with unaligned SSE loads/stores
 * (_mm_loadu_ps / _mm_storeu_ps); the remaining num_points % 4 elements are
 * handled by a scalar loop.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all four lanes once, outside the loop. */
    const __m128 scalarVec = _mm_set_ps1(scalar);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, scalarVec);

        _mm_storeu_ps(cPtr, cVal); /* write four products to the output */

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
107#endif /* LV_HAVE_SSE */
108
109#ifdef LV_HAVE_AVX
110#include <immintrin.h>
111
/*!
 * \brief Multiplies every element of a float buffer by a scalar (AVX, unaligned).
 *
 * Processes eight floats per iteration with unaligned AVX loads/stores
 * (_mm256_loadu_ps / _mm256_storeu_ps); the remaining num_points % 8 elements
 * are handled by a scalar loop.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all eight lanes once, outside the loop. */
    const __m256 scalarVec = _mm256_set1_ps(scalar);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_loadu_ps(aPtr);
        const __m256 cVal = _mm256_mul_ps(aVal, scalarVec);

        _mm256_storeu_ps(cPtr, cVal); /* write eight products to the output */

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: elements that did not fill a whole 8-wide vector. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
142#endif /* LV_HAVE_AVX */
143
144#ifdef LV_HAVE_GENERIC
145
/*!
 * \brief Multiplies every element of a float buffer by a scalar (portable C).
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process; 0 leaves cVector untouched.
 */
static inline void volk_32f_s32f_multiply_32f_generic(float* cVector,
                                                      const float* aVector,
                                                      const float scalar,
                                                      unsigned int num_points)
{
    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
160#endif /* LV_HAVE_GENERIC */
161
162#endif /* INCLUDED_volk_32f_s32f_multiply_32f_u_H */
163
164
165#ifndef INCLUDED_volk_32f_s32f_multiply_32f_a_H
166#define INCLUDED_volk_32f_s32f_multiply_32f_a_H
167
168#include <inttypes.h>
169#include <stdio.h>
170
171#ifdef LV_HAVE_SSE
172#include <xmmintrin.h>
173
/*!
 * \brief Multiplies every element of a float buffer by a scalar (SSE, aligned).
 *
 * Processes four floats per iteration with the aligned SSE load/store
 * intrinsics (_mm_load_ps / _mm_store_ps), which require 16-byte aligned
 * addresses; the remaining num_points % 4 elements are handled by a scalar
 * loop.
 *
 * \param cVector    Output buffer (16-byte aligned); receives num_points products.
 * \param aVector    Input buffer (16-byte aligned); num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_sse(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all four lanes once, outside the loop. */
    const __m128 scalarVec = _mm_set_ps1(scalar);

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const __m128 aVal = _mm_load_ps(aPtr);
        const __m128 cVal = _mm_mul_ps(aVal, scalarVec);

        _mm_store_ps(cPtr, cVal); /* write four products to the output */

        aPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
203#endif /* LV_HAVE_SSE */
204
205#ifdef LV_HAVE_AVX
206#include <immintrin.h>
207
/*!
 * \brief Multiplies every element of a float buffer by a scalar (AVX, aligned).
 *
 * Processes eight floats per iteration with the aligned AVX load/store
 * intrinsics (_mm256_load_ps / _mm256_store_ps), which require 32-byte
 * aligned addresses; the remaining num_points % 8 elements are handled by a
 * scalar loop.
 *
 * \param cVector    Output buffer (32-byte aligned); receives num_points products.
 * \param aVector    Input buffer (32-byte aligned); num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_a_avx(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;

    float* cPtr = cVector;
    const float* aPtr = aVector;

    /* Broadcast the scalar into all eight lanes once, outside the loop. */
    const __m256 scalarVec = _mm256_set1_ps(scalar);

    for (unsigned int number = 0; number < eighthPoints; number++) {
        const __m256 aVal = _mm256_load_ps(aPtr);
        const __m256 cVal = _mm256_mul_ps(aVal, scalarVec);

        _mm256_store_ps(cPtr, cVal); /* write eight products to the output */

        aPtr += 8;
        cPtr += 8;
    }

    /* Scalar tail: elements that did not fill a whole 8-wide vector. */
    for (unsigned int number = eighthPoints * 8; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * scalar;
    }
}
237#endif /* LV_HAVE_AVX */
238
239#ifdef LV_HAVE_NEON
240#include <arm_neon.h>
241
/*!
 * \brief Multiplies every element of a float buffer by a scalar (ARM NEON).
 *
 * Processes four floats per iteration using vld1q_f32 / vmulq_n_f32 /
 * vst1q_f32; the remaining num_points % 4 elements are handled by a scalar
 * loop.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_neon(float* cVector,
                                                     const float* aVector,
                                                     const float scalar,
                                                     unsigned int num_points)
{
    const unsigned int quarterPoints = num_points / 4;

    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (unsigned int number = 0; number < quarterPoints; number++) {
        const float32x4_t aVal = vld1q_f32(inputPtr);        /* load 4 floats */
        const float32x4_t cVal = vmulq_n_f32(aVal, scalar);  /* lane-wise multiply */
        vst1q_f32(outputPtr, cVal);                          /* store 4 products */
        inputPtr += 4;
        outputPtr += 4;
    }

    /* Scalar tail: elements that did not fill a whole 4-wide vector. */
    for (unsigned int number = quarterPoints * 4; number < num_points; number++) {
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
265#endif /* LV_HAVE_NEON */
266
267
268#ifdef LV_HAVE_GENERIC
269
/*!
 * \brief Multiplies every element of a float buffer by a scalar (portable C).
 *
 * Plain scalar loop; the code itself places no alignment requirement on
 * either buffer.
 *
 * \param cVector    Output buffer; receives num_points products.
 * \param aVector    Input buffer; num_points floats are read.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process; 0 leaves cVector untouched.
 */
static inline void volk_32f_s32f_multiply_32f_a_generic(float* cVector,
                                                        const float* aVector,
                                                        const float scalar,
                                                        unsigned int num_points)
{
    const float* inputPtr = aVector;
    float* outputPtr = cVector;

    for (unsigned int number = 0; number < num_points; number++) {
        *outputPtr++ = (*inputPtr++) * scalar;
    }
}
284#endif /* LV_HAVE_GENERIC */
285
286
287#ifdef LV_HAVE_ORC
288
/*
 * Implementation declared here and defined outside this header
 * (presumably emitted by the ORC compiler -- confirm against the build).
 */
extern void volk_32f_s32f_multiply_32f_a_orc_impl(float* dst,
                                                  const float* src,
                                                  const float scalar,
                                                  unsigned int num_points);

/*!
 * \brief Multiplies every element of a float buffer by a scalar (ORC).
 *
 * Thin wrapper that forwards all arguments unchanged to
 * volk_32f_s32f_multiply_32f_a_orc_impl().
 *
 * \param cVector    Output buffer, passed as the implementation's dst.
 * \param aVector    Input buffer, passed as the implementation's src.
 * \param scalar     Factor applied to each input element.
 * \param num_points Number of floats to process.
 */
static inline void volk_32f_s32f_multiply_32f_u_orc(float* cVector,
                                                    const float* aVector,
                                                    const float scalar,
                                                    unsigned int num_points)
{
    volk_32f_s32f_multiply_32f_a_orc_impl(cVector, aVector, scalar, num_points);
}
301
#endif /* LV_HAVE_ORC */
303
304#endif /* INCLUDED_volk_32f_s32f_multiply_32f_a_H */
static void volk_32f_s32f_multiply_32f_a_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:208
static void volk_32f_s32f_multiply_32f_a_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:270
static void volk_32f_s32f_multiply_32f_u_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:78
static void volk_32f_s32f_multiply_32f_u_avx(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:112
static void volk_32f_s32f_multiply_32f_a_sse(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:174
static void volk_32f_s32f_multiply_32f_generic(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:146
static void volk_32f_s32f_multiply_32f_u_neon(float *cVector, const float *aVector, const float scalar, unsigned int num_points)
Definition: volk_32f_s32f_multiply_32f.h:242