Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32fc_32f_multiply_32fc.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */
22
55#ifndef INCLUDED_volk_32fc_32f_multiply_32fc_a_H
56#define INCLUDED_volk_32fc_32f_multiply_32fc_a_H
57
58#include <inttypes.h>
59#include <stdio.h>
60
#ifdef LV_HAVE_AVX
#include <immintrin.h>

/*!
 * \brief Multiplies each complex sample of aVector by the real value at the
 * same index of bVector and stores the product in cVector (AVX version).
 *
 * Eight points are handled per loop iteration; the remaining
 * (num_points % 8) points are finished by a scalar loop.
 *
 * The kernel uses the aligned load/store intrinsics (_mm256_load_ps /
 * _mm256_store_ps), so all three buffers must be 32-byte aligned.
 *
 * \param cVector    Output buffer receiving num_points complex products.
 * \param aVector    Input buffer of num_points complex samples.
 * \param bVector    Input buffer of num_points real scale factors.
 * \param num_points Number of points to process.
 */
static inline void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int eighthPoints = num_points / 8;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m256 aVal1, aVal2, bVal, bVal1, bVal2, cVal1, cVal2;

    /* _mm256_set_epi32 takes its arguments from the highest element down, so
     * the per-lane selectors read 0,0,1,1 in the low half and 2,2,3,3 in the
     * high half: every real factor is duplicated to cover (re, im). */
    const __m256i permute_mask = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0);

    for (; number < eighthPoints; number++) {
        /* Two registers of four complex samples each (re|im interleaved). */
        aVal1 = _mm256_load_ps((const float*)aPtr);
        aPtr += 4;

        aVal2 = _mm256_load_ps((const float*)aPtr);
        aPtr += 4;

        bVal = _mm256_load_ps(bPtr); // b0|b1|b2|b3|b4|b5|b6|b7
        bPtr += 8;

        /* Broadcast each 128-bit half so the in-lane permute can reach it. */
        bVal1 = _mm256_permute2f128_ps(bVal, bVal, 0x00); // b0|b1|b2|b3|b0|b1|b2|b3
        bVal2 = _mm256_permute2f128_ps(bVal, bVal, 0x11); // b4|b5|b6|b7|b4|b5|b6|b7

        bVal1 = _mm256_permutevar_ps(bVal1, permute_mask); // b0|b0|b1|b1|b2|b2|b3|b3
        bVal2 = _mm256_permutevar_ps(bVal2, permute_mask); // b4|b4|b5|b5|b6|b6|b7|b7

        cVal1 = _mm256_mul_ps(aVal1, bVal1);
        cVal2 = _mm256_mul_ps(aVal2, bVal2);

        /* Store the results back into the C container. */
        _mm256_store_ps((float*)cPtr, cVal1);
        cPtr += 4;

        _mm256_store_ps((float*)cPtr, cVal2);
        cPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole group of eight. */
    number = eighthPoints * 8;
    for (; number < num_points; ++number) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_AVX */
115
116
#ifdef LV_HAVE_SSE
#include <xmmintrin.h>

/*!
 * \brief Multiplies each complex sample of aVector by the real value at the
 * same index of bVector and stores the product in cVector (SSE version).
 *
 * Four points are handled per loop iteration; the remaining
 * (num_points % 4) points are finished by a scalar loop.
 *
 * The kernel uses the aligned load/store intrinsics (_mm_load_ps /
 * _mm_store_ps), so all three buffers must be 16-byte aligned.
 *
 * \param cVector    Output buffer receiving num_points complex products.
 * \param aVector    Input buffer of num_points complex samples.
 * \param bVector    Input buffer of num_points real scale factors.
 * \param num_points Number of points to process.
 */
static inline void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int quarterPoints = num_points / 4;

    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;

    __m128 aVal1, aVal2, bVal, bVal1, bVal2, cVal;
    for (; number < quarterPoints; number++) {
        /* Two registers of two complex samples each (re|im interleaved). */
        aVal1 = _mm_load_ps((const float*)aPtr);
        aPtr += 2;

        aVal2 = _mm_load_ps((const float*)aPtr);
        aPtr += 2;

        bVal = _mm_load_ps(bPtr); // b0|b1|b2|b3
        bPtr += 4;

        /* Duplicate each real factor so it scales both re and im. */
        bVal1 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(1, 1, 0, 0)); // b0|b0|b1|b1
        bVal2 = _mm_shuffle_ps(bVal, bVal, _MM_SHUFFLE(3, 3, 2, 2)); // b2|b2|b3|b3

        /* Store the results back into the C container. */
        cVal = _mm_mul_ps(aVal1, bVal1);
        _mm_store_ps((float*)cPtr, cVal);
        cPtr += 2;

        cVal = _mm_mul_ps(aVal2, bVal2);
        _mm_store_ps((float*)cPtr, cVal);
        cPtr += 2;
    }

    /* Scalar tail for the points that do not fill a whole group of four. */
    number = quarterPoints * 4;
    for (; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_SSE */
166
167
#ifdef LV_HAVE_GENERIC

/*!
 * \brief Multiplies each complex sample of aVector by the real value at the
 * same index of bVector and stores the product in cVector (portable C
 * version, one point per iteration).
 *
 * \param cVector    Output buffer receiving num_points complex products.
 * \param aVector    Input buffer of num_points complex samples.
 * \param bVector    Input buffer of num_points real scale factors.
 * \param num_points Number of points to process.
 */
static inline void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t* cVector,
                                                       const lv_32fc_t* aVector,
                                                       const float* bVector,
                                                       unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;

    for (number = 0; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_GENERIC */
185
186
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

/*!
 * \brief Multiplies each complex sample of aVector by the real value at the
 * same index of bVector and stores the product in cVector (NEON version).
 *
 * Four points are handled per loop iteration; the remaining
 * (num_points % 4) points are finished by a scalar loop.
 *
 * \param cVector    Output buffer receiving num_points complex products.
 * \param aVector    Input buffer of num_points complex samples.
 * \param bVector    Input buffer of num_points real scale factors.
 * \param num_points Number of points to process.
 */
static inline void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t* cVector,
                                                    const lv_32fc_t* aVector,
                                                    const float* bVector,
                                                    unsigned int num_points)
{
    lv_32fc_t* cPtr = cVector;
    const lv_32fc_t* aPtr = aVector;
    const float* bPtr = bVector;
    unsigned int number = 0;
    unsigned int quarter_points = num_points / 4;

    float32x4x2_t inputVector, outputVector;
    float32x4_t tapsVector;
    for (number = 0; number < quarter_points; number++) {
        /* vld2q_f32 de-interleaves: val[0] gets the four real parts and
         * val[1] the four imaginary parts, so one vector of real factors can
         * scale both without any shuffling. */
        inputVector = vld2q_f32((const float*)aPtr);
        tapsVector = vld1q_f32(bPtr);

        outputVector.val[0] = vmulq_f32(inputVector.val[0], tapsVector);
        outputVector.val[1] = vmulq_f32(inputVector.val[1], tapsVector);

        /* vst2q_f32 re-interleaves the two halves back into re|im order. */
        vst2q_f32((float*)cPtr, outputVector);
        aPtr += 4;
        bPtr += 4;
        cPtr += 4;
    }

    /* Scalar tail for the points that do not fill a whole group of four. */
    for (number = quarter_points * 4; number < num_points; number++) {
        *cPtr++ = (*aPtr++) * (*bPtr++);
    }
}
#endif /* LV_HAVE_NEON */
221
222
#ifdef LV_HAVE_ORC

/* Implementation defined outside this header; only declared here. */
extern void volk_32fc_32f_multiply_32fc_a_orc_impl(lv_32fc_t* cVector,
                                                   const lv_32fc_t* aVector,
                                                   const float* bVector,
                                                   unsigned int num_points);

/*!
 * \brief ORC entry point: forwards all arguments unchanged to
 * volk_32fc_32f_multiply_32fc_a_orc_impl().
 *
 * \param cVector    Output buffer passed through to the implementation.
 * \param aVector    Complex input buffer passed through to the implementation.
 * \param bVector    Real input buffer passed through to the implementation.
 * \param num_points Number of points passed through to the implementation.
 */
static inline void volk_32fc_32f_multiply_32fc_u_orc(lv_32fc_t* cVector,
                                                     const lv_32fc_t* aVector,
                                                     const float* bVector,
                                                     unsigned int num_points)
{
    volk_32fc_32f_multiply_32fc_a_orc_impl(cVector, aVector, bVector, num_points);
}

#endif /* LV_HAVE_ORC */
239
240
241#endif /* INCLUDED_volk_32fc_32f_multiply_32fc_a_H */
static void volk_32fc_32f_multiply_32fc_neon(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:190
static void volk_32fc_32f_multiply_32fc_generic(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:170
static void volk_32fc_32f_multiply_32fc_a_avx(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:64
static void volk_32fc_32f_multiply_32fc_a_sse(lv_32fc_t *cVector, const lv_32fc_t *aVector, const float *bVector, unsigned int num_points)
Definition: volk_32fc_32f_multiply_32fc.h:120
float complex lv_32fc_t
Definition: volk_complex.h:65