Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32f_accumulator_s32f.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
63#ifndef INCLUDED_volk_32f_accumulator_s32f_a_H
64#define INCLUDED_volk_32f_accumulator_s32f_a_H
65
66#include <inttypes.h>
67#include <volk/volk_common.h>
68
69#ifdef LV_HAVE_AVX
70#include <immintrin.h>
71
72static inline void volk_32f_accumulator_s32f_a_avx(float* result,
73 const float* inputBuffer,
74 unsigned int num_points)
75{
76 float returnValue = 0;
77 unsigned int number = 0;
78 const unsigned int eighthPoints = num_points / 8;
79
80 const float* aPtr = inputBuffer;
81 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
82
83 __m256 accumulator = _mm256_setzero_ps();
84 __m256 aVal = _mm256_setzero_ps();
85
86 for (; number < eighthPoints; number++) {
87 aVal = _mm256_load_ps(aPtr);
88 accumulator = _mm256_add_ps(accumulator, aVal);
89 aPtr += 8;
90 }
91
92 _mm256_store_ps(tempBuffer, accumulator);
93
94 returnValue = tempBuffer[0];
95 returnValue += tempBuffer[1];
96 returnValue += tempBuffer[2];
97 returnValue += tempBuffer[3];
98 returnValue += tempBuffer[4];
99 returnValue += tempBuffer[5];
100 returnValue += tempBuffer[6];
101 returnValue += tempBuffer[7];
102
103 number = eighthPoints * 8;
104 for (; number < num_points; number++) {
105 returnValue += (*aPtr++);
106 }
107 *result = returnValue;
108}
109#endif /* LV_HAVE_AVX */
110
111
112#ifdef LV_HAVE_AVX
113#include <immintrin.h>
114
115static inline void volk_32f_accumulator_s32f_u_avx(float* result,
116 const float* inputBuffer,
117 unsigned int num_points)
118{
119 float returnValue = 0;
120 unsigned int number = 0;
121 const unsigned int eighthPoints = num_points / 8;
122
123 const float* aPtr = inputBuffer;
124 __VOLK_ATTR_ALIGNED(32) float tempBuffer[8];
125
126 __m256 accumulator = _mm256_setzero_ps();
127 __m256 aVal = _mm256_setzero_ps();
128
129 for (; number < eighthPoints; number++) {
130 aVal = _mm256_loadu_ps(aPtr);
131 accumulator = _mm256_add_ps(accumulator, aVal);
132 aPtr += 8;
133 }
134
135 _mm256_store_ps(tempBuffer, accumulator);
136
137 returnValue = tempBuffer[0];
138 returnValue += tempBuffer[1];
139 returnValue += tempBuffer[2];
140 returnValue += tempBuffer[3];
141 returnValue += tempBuffer[4];
142 returnValue += tempBuffer[5];
143 returnValue += tempBuffer[6];
144 returnValue += tempBuffer[7];
145
146 number = eighthPoints * 8;
147 for (; number < num_points; number++) {
148 returnValue += (*aPtr++);
149 }
150 *result = returnValue;
151}
152#endif /* LV_HAVE_AVX */
153
154
155#ifdef LV_HAVE_SSE
156#include <xmmintrin.h>
157
158static inline void volk_32f_accumulator_s32f_a_sse(float* result,
159 const float* inputBuffer,
160 unsigned int num_points)
161{
162 float returnValue = 0;
163 unsigned int number = 0;
164 const unsigned int quarterPoints = num_points / 4;
165
166 const float* aPtr = inputBuffer;
167 __VOLK_ATTR_ALIGNED(16) float tempBuffer[4];
168
169 __m128 accumulator = _mm_setzero_ps();
170 __m128 aVal = _mm_setzero_ps();
171
172 for (; number < quarterPoints; number++) {
173 aVal = _mm_load_ps(aPtr);
174 accumulator = _mm_add_ps(accumulator, aVal);
175 aPtr += 4;
176 }
177
178 _mm_store_ps(tempBuffer, accumulator);
179
180 returnValue = tempBuffer[0];
181 returnValue += tempBuffer[1];
182 returnValue += tempBuffer[2];
183 returnValue += tempBuffer[3];
184
185 number = quarterPoints * 4;
186 for (; number < num_points; number++) {
187 returnValue += (*aPtr++);
188 }
189 *result = returnValue;
190}
191#endif /* LV_HAVE_SSE */
192
193
194#ifdef LV_HAVE_SSE
195#include <xmmintrin.h>
196
/*
 * Add up num_points floats from inputBuffer and write the total to *result
 * (SSE kernel, unaligned variant).
 *
 * result:      receives the sum; written once, after all additions.
 * inputBuffer: values to add. No alignment is required: the data is read
 *              with _mm_loadu_ps. (The aligned _mm_load_ps must not be used
 *              in this variant; it requires a 16-byte aligned address.)
 * num_points:  number of floats to add; 0 yields a result of 0.
 */
static inline void volk_32f_accumulator_s32f_u_sse(float* result,
                                                   const float* inputBuffer,
                                                   unsigned int num_points)
{
    float returnValue = 0;
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    const float* aPtr = inputBuffer;
    float tempBuffer[4];

    __m128 accumulator = _mm_setzero_ps();

    /* Four lane-wise running sums over the full groups of four floats. */
    for (; number < quarterPoints; number++) {
        const __m128 aVal = _mm_loadu_ps(aPtr); /* unaligned-safe load */
        accumulator = _mm_add_ps(accumulator, aVal);
        aPtr += 4;
    }

    /* Unaligned store: the scratch array then needs no alignment attribute. */
    _mm_storeu_ps(tempBuffer, accumulator);

    /* Reduce the four lanes in ascending order. */
    returnValue = tempBuffer[0];
    returnValue += tempBuffer[1];
    returnValue += tempBuffer[2];
    returnValue += tempBuffer[3];

    /* Add the up-to-three values left over after the vector loop. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        returnValue += (*aPtr++);
    }
    *result = returnValue;
}
230#endif /* LV_HAVE_SSE */
231
232#ifdef LV_HAVE_GENERIC
/*
 * Add up num_points floats from inputBuffer and write the total to *result
 * (portable scalar kernel).
 *
 * result:      receives the sum; written once, after all additions.
 * inputBuffer: values to add, consumed front to back.
 * num_points:  number of floats to add; 0 yields a result of 0.
 */
static inline void volk_32f_accumulator_s32f_generic(float* result,
                                                     const float* inputBuffer,
                                                     unsigned int num_points)
{
    float total = 0;
    unsigned int i;

    for (i = 0; i < num_points; i++) {
        total += inputBuffer[i];
    }

    *result = total;
}
246#endif /* LV_HAVE_GENERIC */
247
248#endif /* INCLUDED_volk_32f_accumulator_s32f_a_H */
static void volk_32f_accumulator_s32f_a_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:72
static void volk_32f_accumulator_s32f_u_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:197
static void volk_32f_accumulator_s32f_generic(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:233
static void volk_32f_accumulator_s32f_u_avx(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:115
static void volk_32f_accumulator_s32f_a_sse(float *result, const float *inputBuffer, unsigned int num_points)
Definition: volk_32f_accumulator_s32f.h:158
#define __VOLK_ATTR_ALIGNED(x)
Definition: volk_common.h:56