Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_accumulator_s32fc.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2019 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
64#ifndef INCLUDED_volk_32fc_accumulator_s32fc_a_H
65#define INCLUDED_volk_32fc_accumulator_s32fc_a_H
66
67#include <inttypes.h>
68#include <volk/volk_common.h>
69
70#ifdef LV_HAVE_GENERIC
72 const lv_32fc_t* inputBuffer,
73 unsigned int num_points)
74{
75 const lv_32fc_t* aPtr = inputBuffer;
76 unsigned int number = 0;
77 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
78
79 for (; number < num_points; number++) {
80 returnValue += (*aPtr++);
81 }
82 *result = returnValue;
83}
84#endif /* LV_HAVE_GENERIC */
85
86#ifdef LV_HAVE_AVX
87#include <immintrin.h>
88
90 const lv_32fc_t* inputBuffer,
91 unsigned int num_points)
92{
93 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
94 unsigned int number = 0;
95 const unsigned int quarterPoints = num_points / 4;
96
97 const lv_32fc_t* aPtr = inputBuffer;
98 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
99
100 __m256 accumulator = _mm256_setzero_ps();
101 __m256 aVal = _mm256_setzero_ps();
102
103 for (; number < quarterPoints; number++) {
104 aVal = _mm256_loadu_ps((float*)aPtr);
105 accumulator = _mm256_add_ps(accumulator, aVal);
106 aPtr += 4;
107 }
108
109 _mm256_store_ps(tempBuffer, accumulator);
110
111 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
112 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
113 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
114 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
115
116 number = quarterPoints * 4;
117 for (; number < num_points; number++) {
118 returnValue += (*aPtr++);
119 }
120 *result = returnValue;
121}
122#endif /* LV_HAVE_AVX */
123
124#ifdef LV_HAVE_SSE
125#include <xmmintrin.h>
126
128 const lv_32fc_t* inputBuffer,
129 unsigned int num_points)
130{
131 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
132 unsigned int number = 0;
133 const unsigned int halfPoints = num_points / 2;
134
135 const lv_32fc_t* aPtr = inputBuffer;
136 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
137
138 __m128 accumulator = _mm_setzero_ps();
139 __m128 aVal = _mm_setzero_ps();
140
141 for (; number < halfPoints; number++) {
142 aVal = _mm_loadu_ps((float*)aPtr);
143 accumulator = _mm_add_ps(accumulator, aVal);
144 aPtr += 2;
145 }
146
147 _mm_store_ps(tempBuffer, accumulator);
148
149 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
150 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
151
152 number = halfPoints * 2;
153 for (; number < num_points; number++) {
154 returnValue += (*aPtr++);
155 }
156 *result = returnValue;
157}
158#endif /* LV_HAVE_SSE */
159
160#ifdef LV_HAVE_AVX
161#include <immintrin.h>
162
164 const lv_32fc_t* inputBuffer,
165 unsigned int num_points)
166{
167 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
168 unsigned int number = 0;
169 const unsigned int quarterPoints = num_points / 4;
170
171 const lv_32fc_t* aPtr = inputBuffer;
172 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
173
174 __m256 accumulator = _mm256_setzero_ps();
175 __m256 aVal = _mm256_setzero_ps();
176
177 for (; number < quarterPoints; number++) {
178 aVal = _mm256_load_ps((float*)aPtr);
179 accumulator = _mm256_add_ps(accumulator, aVal);
180 aPtr += 4;
181 }
182
183 _mm256_store_ps(tempBuffer, accumulator);
184
185 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
186 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
187 returnValue += lv_cmake(tempBuffer[4], tempBuffer[5]);
188 returnValue += lv_cmake(tempBuffer[6], tempBuffer[7]);
189
190 number = quarterPoints * 4;
191 for (; number < num_points; number++) {
192 returnValue += (*aPtr++);
193 }
194 *result = returnValue;
195}
196#endif /* LV_HAVE_AVX */
197
198#ifdef LV_HAVE_SSE
199#include <xmmintrin.h>
200
202 const lv_32fc_t* inputBuffer,
203 unsigned int num_points)
204{
205 lv_32fc_t returnValue = lv_cmake(0.f, 0.f);
206 unsigned int number = 0;
207 const unsigned int halfPoints = num_points / 2;
208
209 const lv_32fc_t* aPtr = inputBuffer;
210 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
211
212 __m128 accumulator = _mm_setzero_ps();
213 __m128 aVal = _mm_setzero_ps();
214
215 for (; number < halfPoints; number++) {
216 aVal = _mm_load_ps((float*)aPtr);
217 accumulator = _mm_add_ps(accumulator, aVal);
218 aPtr += 2;
219 }
220
221 _mm_store_ps(tempBuffer, accumulator);
222
223 returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
224 returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);
225
226 number = halfPoints * 2;
227 for (; number < num_points; number++) {
228 returnValue += (*aPtr++);
229 }
230 *result = returnValue;
231}
232#endif /* LV_HAVE_SSE */
233
234#ifdef LV_HAVE_NEON
235#include <arm_neon.h>
/*!
 * \brief Sums the complex samples of a buffer using NEON.
 *
 * Eight complex samples are consumed per iteration: four 128-bit loads of two
 * samples each, every load feeding its own accumulator register. The eight
 * complex partial sums are then combined (accumulator 0 first, accumulator 3
 * last), and the remaining num_points % 8 samples are added one at a time.
 * Additions are grouped differently than in a strictly sequential sum, so the
 * result may differ from one by floating-point rounding.
 *
 * \param result      Output; receives the sum of the first \p num_points
 *                    samples (0 + 0i when \p num_points is 0).
 * \param inputBuffer Samples to accumulate; \p num_points entries are read.
 * \param num_points  Number of complex samples to add.
 */
static inline void volk_32fc_accumulator_s32fc_neon(lv_32fc_t* result,
                                                    const lv_32fc_t* inputBuffer,
                                                    unsigned int num_points)
{
    const unsigned int eighthPoints = num_points / 8;
    const lv_32fc_t* aPtr = inputBuffer;
    unsigned int number;
    lv_32fc_t returnValue;
    /* vdupq_n_f32 instead of brace initialisers, which rely on the GCC/Clang
     * vector extension. */
    float32x4_t out_vec0 = vdupq_n_f32(0.f);
    float32x4_t out_vec1 = vdupq_n_f32(0.f);
    float32x4_t out_vec2 = vdupq_n_f32(0.f);
    float32x4_t out_vec3 = vdupq_n_f32(0.f);
    __VOLK_ATTR_ALIGNED(32) float tempBuffer[4];

    for (number = 0; number < eighthPoints; number++) {
        /* Each load covers two complex samples (four floats). */
        out_vec0 = vaddq_f32(vld1q_f32((const float*)aPtr), out_vec0);
        out_vec1 = vaddq_f32(vld1q_f32((const float*)(aPtr + 2)), out_vec1);
        out_vec2 = vaddq_f32(vld1q_f32((const float*)(aPtr + 4)), out_vec2);
        out_vec3 = vaddq_f32(vld1q_f32((const float*)(aPtr + 6)), out_vec3);
        aPtr += 8;
    }

    /* Spill each accumulator in turn and add its two (re, im) lane pairs. */
    vst1q_f32(tempBuffer, out_vec0);
    returnValue = lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec1);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec2);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    vst1q_f32(tempBuffer, out_vec3);
    returnValue += lv_cmake(tempBuffer[0], tempBuffer[1]);
    returnValue += lv_cmake(tempBuffer[2], tempBuffer[3]);

    /* Tail: samples that did not fill a whole group of eight. */
    for (number = eighthPoints * 8; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
290#endif /* LV_HAVE_NEON */
291
292#endif /* INCLUDED_volk_32fc_accumulator_s32fc_a_H */
static void volk_32fc_accumulator_s32fc_generic(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:71
static void volk_32fc_accumulator_s32fc_a_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:201
static void volk_32fc_accumulator_s32fc_u_sse(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:127
static void volk_32fc_accumulator_s32fc_a_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:163
static void volk_32fc_accumulator_s32fc_neon(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:236
static void volk_32fc_accumulator_s32fc_u_avx(lv_32fc_t *result, const lv_32fc_t *inputBuffer, unsigned int num_points)
Definition: volk_32fc_accumulator_s32fc.h:89
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56
#define lv_cmake(r, i)
Definition: volk_complex.h:68
float complex lv_32fc_t
Definition: volk_complex.h:65