Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_64f_x2_add_64f.h
/* -*- c++ -*- */
/*
 * Copyright 2018 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_64f_x2_add_64f_H
#define INCLUDED_volk_64f_x2_add_64f_H

#include <inttypes.h>


#ifdef LV_HAVE_GENERIC

/* Scalar reference implementation: cVector[i] = aVector[i] + bVector[i]. */
static inline void volk_64f_x2_add_64f_generic(double* cVector,
                                               const double* aVector,
                                               const double* bVector,
                                               unsigned int num_points)
{
    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_GENERIC */
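Application code normally calls the volk_64f_x2_add_64f dispatcher rather than a specific protokernel such as the generic one above; the dispatcher selects the fastest implementation available on the running machine. A minimal usage sketch (assuming VOLK is installed and the program links against libvolk; the buffer size and fill values are illustrative):

#include <volk/volk.h>
#include <stdio.h>

int main(void)
{
    const unsigned int num_points = 1000;
    const size_t alignment = volk_get_alignment();

    /* volk_malloc returns memory aligned for the widest SIMD ISA enabled */
    double* a = (double*)volk_malloc(sizeof(double) * num_points, alignment);
    double* b = (double*)volk_malloc(sizeof(double) * num_points, alignment);
    double* c = (double*)volk_malloc(sizeof(double) * num_points, alignment);

    for (unsigned int i = 0; i < num_points; i++) {
        a[i] = (double)i;
        b[i] = 2.0 * (double)i;
    }

    /* The dispatcher picks an aligned or unaligned protokernel at runtime */
    volk_64f_x2_add_64f(c, a, b, num_points);

    printf("c[10] = %g\n", c[10]); /* 10 + 20 = 30 */

    volk_free(a);
    volk_free(b);
    volk_free(c);
    return 0;
}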
95
/*
 * Unaligned versions
 */

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

/* SSE2, unaligned: processes two doubles per iteration, with a scalar tail. */
static inline void volk_64f_x2_add_64f_u_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_loadu_pd(aPtr);
        bVal = _mm_loadu_pd(bPtr);

        cVal = _mm_add_pd(aVal, bVal);

        _mm_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */
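/*
 * All SIMD protokernels in this file follow the pattern above: the vector
 * loop consumes as many full vectors as fit (half_points = num_points / 2
 * iterations of two doubles each for SSE2), and the scalar tail loop
 * finishes the remainder. For num_points = 7, the SSE2 vector loop covers
 * elements 0..5 in three iterations and the tail adds element 6.
 */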

#ifdef LV_HAVE_AVX

#include <immintrin.h>

/* AVX, unaligned: processes four doubles per iteration, with a scalar tail. */
static inline void volk_64f_x2_add_64f_u_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_loadu_pd(aPtr);
        bVal = _mm256_loadu_pd(bPtr);

        cVal = _mm256_add_pd(aVal, bVal);

        _mm256_storeu_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

/*
 * Aligned versions
 */
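/*
 * The aligned variants below differ from the unaligned ones only in using
 * _mm_load_pd/_mm_store_pd (and their _mm256 equivalents), which require
 * 16-byte (SSE2) or 32-byte (AVX) aligned pointers; calling them on
 * unaligned buffers will fault. Buffers from volk_malloc() with
 * volk_get_alignment() satisfy this requirement.
 */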

#ifdef LV_HAVE_SSE2

#include <emmintrin.h>

/* SSE2, aligned loads/stores: processes two doubles per iteration, with a scalar tail. */
static inline void volk_64f_x2_add_64f_a_sse2(double* cVector,
                                              const double* aVector,
                                              const double* bVector,
                                              unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int half_points = num_points / 2;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m128d aVal, bVal, cVal;
    for (; number < half_points; number++) {
        aVal = _mm_load_pd(aPtr);
        bVal = _mm_load_pd(bPtr);

        cVal = _mm_add_pd(aVal, bVal);

        _mm_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 2;
        bPtr += 2;
        cPtr += 2;
    }

    number = half_points * 2;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_SSE2 */

#ifdef LV_HAVE_AVX

#include <immintrin.h>

/* AVX, aligned loads/stores: processes four doubles per iteration, with a scalar tail. */
static inline void volk_64f_x2_add_64f_a_avx(double* cVector,
                                             const double* aVector,
                                             const double* bVector,
                                             unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarter_points = num_points / 4;

    double* cPtr = cVector;
    const double* aPtr = aVector;
    const double* bPtr = bVector;

    __m256d aVal, bVal, cVal;
    for (; number < quarter_points; number++) {
        aVal = _mm256_load_pd(aPtr);
        bVal = _mm256_load_pd(bPtr);

        cVal = _mm256_add_pd(aVal, bVal);

        _mm256_store_pd(cPtr, cVal); // Store the results back into the C container

        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    number = quarter_points * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) + (*bPtr++);
    }
}

#endif /* LV_HAVE_AVX */

#endif /* INCLUDED_volk_64f_x2_add_64f_H */
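The stock volk_64f_x2_add_64f dispatcher already chooses between the aligned and unaligned protokernels at runtime, but the choice can also be made by hand. The sketch below assumes volk_is_aligned() from <volk/volk.h> (which tests a pointer against volk_get_alignment()) and that the installed kernel header is includable as shown; the helper name add_64f_dispatch is hypothetical:

#include <volk/volk.h>
#include <volk/volk_64f_x2_add_64f.h>

/* Hypothetical hand-rolled dispatch: use the aligned SSE2 path only when
 * all three buffers meet the machine alignment, else the unaligned path,
 * falling back to the generic protokernel when SSE2 is unavailable. */
static void add_64f_dispatch(double* c, const double* a, const double* b,
                             unsigned int n)
{
#ifdef LV_HAVE_SSE2
    if (volk_is_aligned(a) && volk_is_aligned(b) && volk_is_aligned(c)) {
        volk_64f_x2_add_64f_a_sse2(c, a, b, n);
    } else {
        volk_64f_x2_add_64f_u_sse2(c, a, b, n);
    }
#else
    volk_64f_x2_add_64f_generic(c, a, b, n);
#endif
}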