Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16ic_x2_multiply_16ic.h
/* -*- c++ -*- */
/*
 * Copyright 2016 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_16ic_x2_multiply_16ic_H
#define INCLUDED_volk_16ic_x2_multiply_16ic_H

#include <volk/volk_common.h>
#include <volk/volk_complex.h>

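/*
 * Usage sketch (illustrative only): calling the kernel through the public VOLK
 * dispatcher, with buffers from volk_malloc() so that an aligned kernel can be
 * selected. Assumes <volk/volk.h>; values and sizes are arbitrary.
 *
 *   unsigned int num_points = 1024;
 *   size_t alignment = volk_get_alignment();
 *   lv_16sc_t* in_a = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t* in_b = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *   lv_16sc_t* out = (lv_16sc_t*)volk_malloc(num_points * sizeof(lv_16sc_t), alignment);
 *
 *   for (unsigned int i = 0; i < num_points; i++) {
 *       in_a[i] = lv_cmake(3, 2);  // 3 + 2j
 *       in_b[i] = lv_cmake(1, -1); // 1 - 1j
 *   }
 *
 *   volk_16ic_x2_multiply_16ic(out, in_a, in_b, num_points); // each out[i] = 5 - 1j
 *
 *   volk_free(in_a);
 *   volk_free(in_b);
 *   volk_free(out);
 */
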
#ifdef LV_HAVE_GENERIC

static inline void volk_16ic_x2_multiply_16ic_generic(lv_16sc_t* result,
                                                      const lv_16sc_t* in_a,
                                                      const lv_16sc_t* in_b,
                                                      unsigned int num_points)
{
    unsigned int n;
    for (n = 0; n < num_points; n++) {
        result[n] = in_a[n] * in_b[n];
    }
}

#endif /*LV_HAVE_GENERIC*/
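
/*
 * All kernels in this file compute the same element-wise complex product,
 * expanded into real 16-bit arithmetic. A scalar sketch of one element (the
 * helper name is illustrative, not part of this header):
 *
 *   static inline lv_16sc_t complex_mul_16ic_scalar(lv_16sc_t a, lv_16sc_t b)
 *   {
 *       int16_t ar = lv_creal(a), ai = lv_cimag(a);
 *       int16_t br = lv_creal(b), bi = lv_cimag(b);
 *       // (ar + j*ai) * (br + j*bi) = (ar*br - ai*bi) + j*(ar*bi + ai*br)
 *       return lv_cmake((int16_t)(ar * br - ai * bi), (int16_t)(ar * bi + ai * br));
 *   }
 *
 * Overflow handling differs between kernels: the SSE2/AVX2 paths keep the low
 * 16 bits of each product and use saturating add/sub, while the NEON path
 * wraps throughout.
 */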


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    // masks selecting the imaginary (upper) and real (lower) 16 bits of each complex lane
    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_load_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_load_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes while shifting in zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a2.r*b2.r - a2.i*b2.i, 0, ...

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_store_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */


#ifdef LV_HAVE_SSE2
#include <emmintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_sse2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    const unsigned int sse_iters = num_points / 4;
    __m128i a, b, c, c_sr, mask_imag, mask_real, real, imag, imag1, imag2, b_sl, a_sl,
        result;

    // masks selecting the imaginary (upper) and real (lower) 16 bits of each complex lane
    mask_imag = _mm_set_epi8(
        0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    mask_real = _mm_set_epi8(
        0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;
    unsigned int number;

    for (number = 0; number < sse_iters; number++) {
        a = _mm_loadu_si128(
            (__m128i*)_in_a); // load (2 byte imag, 2 byte real) x 4 into 128 bits reg
        b = _mm_loadu_si128((__m128i*)_in_b);
        c = _mm_mullo_epi16(a, b); // a3.i*b3.i, a3.r*b3.r, ....

        c_sr = _mm_srli_si128(c, 2); // shift c right by 2 bytes while shifting in zeros
        real = _mm_subs_epi16(c, c_sr);
        real = _mm_and_si128(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a2.r*b2.r - a2.i*b2.i, 0, ...

        b_sl = _mm_slli_si128(b, 2); // b3.r, b2.i ....
        a_sl = _mm_slli_si128(a, 2); // a3.r, a2.i ....

        imag1 = _mm_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm_adds_epi16(imag1, imag2);
        imag = _mm_and_si128(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm_or_si128(real, imag);

        _mm_storeu_si128((__m128i*)_out, result);

        _in_a += 4;
        _in_b += 4;
        _out += 4;
    }

    for (number = sse_iters * 4; number < num_points; ++number) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_SSE2 */
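
/*
 * The _a_ kernels above require buffers aligned to the SIMD width (16 bytes
 * for SSE2); the _u_ kernels accept any alignment. The VOLK dispatcher
 * normally chooses for you, but calling a specific variant directly could
 * look like the sketch below (illustrative; assumes volk_is_aligned() from
 * <volk/volk.h>):
 *
 *   if (volk_is_aligned(out) && volk_is_aligned(in_a) && volk_is_aligned(in_b)) {
 *       volk_16ic_x2_multiply_16ic_a_sse2(out, in_a, in_b, num_points);
 *   } else {
 *       volk_16ic_x2_multiply_16ic_u_sse2(out, in_a, in_b, num_points);
 *   }
 */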


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_u_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    // masks selecting the imaginary (upper) and real (lower) 16 bits of each complex lane
    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_loadu_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_loadu_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift c right by 2 bytes while shifting in zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a2.r*b2.r - a2.i*b2.i, 0, ...

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_storeu_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_16ic_x2_multiply_16ic_a_avx2(lv_16sc_t* out,
                                                     const lv_16sc_t* in_a,
                                                     const lv_16sc_t* in_b,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    const unsigned int avx2_points = num_points / 8;

    const lv_16sc_t* _in_a = in_a;
    const lv_16sc_t* _in_b = in_b;
    lv_16sc_t* _out = out;

    __m256i a, b, c, c_sr, real, imag, imag1, imag2, b_sl, a_sl, result;

    // masks selecting the imaginary (upper) and real (lower) 16 bits of each complex lane
    const __m256i mask_imag = _mm256_set_epi8(0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0,
                                              0xFF, 0xFF, 0, 0, 0xFF, 0xFF, 0, 0);
    const __m256i mask_real = _mm256_set_epi8(0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF,
                                              0, 0, 0xFF, 0xFF, 0, 0, 0xFF, 0xFF);

    for (; number < avx2_points; number++) {
        a = _mm256_load_si256(
            (__m256i*)_in_a); // Load the ar + ai, br + bi as ar,ai,br,bi
        b = _mm256_load_si256(
            (__m256i*)_in_b); // Load the cr + ci, dr + di as cr,ci,dr,di
        c = _mm256_mullo_epi16(a, b);

        c_sr = _mm256_srli_si256(c, 2); // shift c right by 2 bytes while shifting in zeros
        real = _mm256_subs_epi16(c, c_sr);
        real = _mm256_and_si256(
            real, mask_real); // a3.r*b3.r - a3.i*b3.i, 0, a2.r*b2.r - a2.i*b2.i, 0, ...

        b_sl = _mm256_slli_si256(b, 2); // b3.r, b2.i ....
        a_sl = _mm256_slli_si256(a, 2); // a3.r, a2.i ....

        imag1 = _mm256_mullo_epi16(a, b_sl); // a3.i*b3.r, ....
        imag2 = _mm256_mullo_epi16(b, a_sl); // b3.i*a3.r, ....

        imag = _mm256_adds_epi16(imag1, imag2);
        imag = _mm256_and_si256(imag, mask_imag); // a3.i*b3.r + b3.i*a3.r, 0, ...

        result = _mm256_or_si256(real, imag);

        _mm256_store_si256((__m256i*)_out, result);

        _in_a += 8;
        _in_b += 8;
        _out += 8;
    }

    number = avx2_points * 8;
    for (; number < num_points; number++) {
        *_out++ = (*_in_a++) * (*_in_b++);
    }
}
#endif /* LV_HAVE_AVX2 */


#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16ic_x2_multiply_16ic_neon(lv_16sc_t* out,
                                                   const lv_16sc_t* in_a,
                                                   const lv_16sc_t* in_b,
                                                   unsigned int num_points)
{
    lv_16sc_t* a_ptr = (lv_16sc_t*)in_a;
    lv_16sc_t* b_ptr = (lv_16sc_t*)in_b;
    unsigned int quarter_points = num_points / 4;
    int16x4x2_t a_val, b_val, c_val;
    int16x4x2_t tmp_real, tmp_imag;
    unsigned int number = 0;

    for (number = 0; number < quarter_points; ++number) {
        // vld2 deinterleaves: val[0] holds the real parts, val[1] the imaginary parts
        a_val = vld2_s16((int16_t*)a_ptr); // a0r|a1r|a2r|a3r || a0i|a1i|a2i|a3i
        b_val = vld2_s16((int16_t*)b_ptr); // b0r|b1r|b2r|b3r || b0i|b1i|b2i|b3i
        __VOLK_PREFETCH(a_ptr + 4);
        __VOLK_PREFETCH(b_ptr + 4);

        // multiply the real*real and imag*imag to get real result
        // a0r*b0r|a1r*b1r|a2r*b2r|a3r*b3r
        tmp_real.val[0] = vmul_s16(a_val.val[0], b_val.val[0]);
        // a0i*b0i|a1i*b1i|a2i*b2i|a3i*b3i
        tmp_real.val[1] = vmul_s16(a_val.val[1], b_val.val[1]);

        // Multiply cross terms to get the imaginary result
        // a0r*b0i|a1r*b1i|a2r*b2i|a3r*b3i
        tmp_imag.val[0] = vmul_s16(a_val.val[0], b_val.val[1]);
        // a0i*b0r|a1i*b1r|a2i*b2r|a3i*b3r
        tmp_imag.val[1] = vmul_s16(a_val.val[1], b_val.val[0]);

        // combine and store; vsub_s16/vadd_s16 wrap on overflow, unlike the
        // saturating adds/subs used in the SSE2/AVX2 kernels
        c_val.val[0] = vsub_s16(tmp_real.val[0], tmp_real.val[1]);
        c_val.val[1] = vadd_s16(tmp_imag.val[0], tmp_imag.val[1]);
        vst2_s16((int16_t*)out, c_val);

        a_ptr += 4;
        b_ptr += 4;
        out += 4;
    }

    for (number = quarter_points * 4; number < num_points; number++) {
        *out++ = (*a_ptr++) * (*b_ptr++);
    }
}
#endif /* LV_HAVE_NEON */

#endif /*INCLUDED_volk_16ic_x2_multiply_16ic_H*/