Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_16i_x5_add_quad_16i_x4.h
/* -*- c++ -*- */
/*
 * Copyright 2012, 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

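/*
 * What this kernel computes (as implemented by every variant below):
 * the common input src0 is added element-wise to each of src1..src4,
 * producing four output vectors of num_points 16-bit shorts:
 *
 *   target0[i] = src0[i] + src1[i]
 *   target1[i] = src0[i] + src2[i]
 *   target2[i] = src0[i] + src3[i]
 *   target3[i] = src0[i] + src4[i]
 *
 * All additions are plain (wrapping) 16-bit integer adds, not
 * saturating ones.
 */
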
#ifndef INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H
#define INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H

#include <inttypes.h>
#include <stdio.h>

#ifdef LV_HAVE_SSE2
#include <emmintrin.h>
#include <xmmintrin.h>

static inline void volk_16i_x5_add_quad_16i_x4_a_sse2(short* target0,
                                                      short* target1,
                                                      short* target2,
                                                      short* target3,
                                                      short* src0,
                                                      short* src1,
                                                      short* src2,
                                                      short* src3,
                                                      short* src4,
                                                      unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    __m128i xmm0, xmm1, xmm2, xmm3, xmm4;
    __m128i *p_target0, *p_target1, *p_target2, *p_target3, *p_src0, *p_src1, *p_src2,
        *p_src3, *p_src4;
    p_target0 = (__m128i*)target0;
    p_target1 = (__m128i*)target1;
    p_target2 = (__m128i*)target2;
    p_target3 = (__m128i*)target3;

    p_src0 = (__m128i*)src0;
    p_src1 = (__m128i*)src1;
    p_src2 = (__m128i*)src2;
    p_src3 = (__m128i*)src3;
    p_src4 = (__m128i*)src4;

    int i = 0;

    /* Each vector iteration consumes 16 bytes (8 shorts); the scalar
     * tail loop below handles the remaining num_points % 8 elements. */
    int bound = (num_bytes >> 4);
    int leftovers = (num_bytes >> 1) & 7;

    for (; i < bound; ++i) {
        xmm0 = _mm_load_si128(p_src0);
        xmm1 = _mm_load_si128(p_src1);
        xmm2 = _mm_load_si128(p_src2);
        xmm3 = _mm_load_si128(p_src3);
        xmm4 = _mm_load_si128(p_src4);

        p_src0 += 1;
        p_src1 += 1;

        xmm1 = _mm_add_epi16(xmm0, xmm1);
        xmm2 = _mm_add_epi16(xmm0, xmm2);
        xmm3 = _mm_add_epi16(xmm0, xmm3);
        xmm4 = _mm_add_epi16(xmm0, xmm4);

        p_src2 += 1;
        p_src3 += 1;
        p_src4 += 1;

        _mm_store_si128(p_target0, xmm1);
        _mm_store_si128(p_target1, xmm2);
        _mm_store_si128(p_target2, xmm3);
        _mm_store_si128(p_target3, xmm4);

        p_target0 += 1;
        p_target1 += 1;
        p_target2 += 1;
        p_target3 += 1;
    }
    /*__VOLK_ASM __VOLK_VOLATILE
    (
    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1:\n\t"
    "cmp $0, %[bound]\n\t"
    "je .%=volk_16i_x5_add_quad_16i_x4_a_sse2_END\n\t"
    "movaps (%[src0]), %%xmm1\n\t"
    "movaps (%[src1]), %%xmm2\n\t"
    "movaps (%[src2]), %%xmm3\n\t"
    "movaps (%[src3]), %%xmm4\n\t"
    "movaps (%[src4]), %%xmm5\n\t"
    "add $16, %[src0]\n\t"
    "add $16, %[src1]\n\t"
    "add $16, %[src2]\n\t"
    "add $16, %[src3]\n\t"
    "add $16, %[src4]\n\t"
    "paddw %%xmm1, %%xmm2\n\t"
    "paddw %%xmm1, %%xmm3\n\t"
    "paddw %%xmm1, %%xmm4\n\t"
    "paddw %%xmm1, %%xmm5\n\t"
    "add $-1, %[bound]\n\t"
    "movaps %%xmm2, (%[target0])\n\t"
    "movaps %%xmm3, (%[target1])\n\t"
    "movaps %%xmm4, (%[target2])\n\t"
    "movaps %%xmm5, (%[target3])\n\t"
    "add $16, %[target0]\n\t"
    "add $16, %[target1]\n\t"
    "add $16, %[target2]\n\t"
    "add $16, %[target3]\n\t"
    "jmp .%=volk_16i_x5_add_quad_16i_x4_a_sse2_L1\n\t"
    ".%=volk_16i_x5_add_quad_16i_x4_a_sse2_END:\n\t"
    :
    :[bound]"r"(bound), [src0]"r"(src0), [src1]"r"(src1), [src2]"r"(src2),
    [src3]"r"(src3), [src4]"r"(src4), [target0]"r"(target0), [target1]"r"(target1),
    [target2]"r"(target2), [target3]"r"(target3)
    :"xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
    );
    */

    /* Scalar tail: finish the num_points % 8 elements that did not
     * fill a full 128-bit block. */
    for (i = bound * 8; i < (bound * 8) + leftovers; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}
#endif /*LV_HAVE_SSE2*/

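/*
 * Usage sketch (illustrative, not part of the kernel itself): the
 * _a_sse2 variant uses the aligned _mm_load_si128/_mm_store_si128
 * intrinsics, so all nine buffers must be 16-byte aligned. VOLK's
 * volk_malloc()/volk_free()/volk_get_alignment() helpers can provide
 * such buffers; the include path and variable names below are
 * assumptions for illustration.
 *
 *   #include <volk/volk.h>
 *
 *   unsigned int num_points = 64;
 *   size_t alignment = volk_get_alignment();
 *   short* src[5];
 *   short* tgt[4];
 *   for (int n = 0; n < 5; ++n)
 *       src[n] = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *   for (int n = 0; n < 4; ++n)
 *       tgt[n] = (short*)volk_malloc(num_points * sizeof(short), alignment);
 *   // ... fill src[0..4] ...
 *   volk_16i_x5_add_quad_16i_x4_a_sse2(tgt[0], tgt[1], tgt[2], tgt[3],
 *                                      src[0], src[1], src[2], src[3],
 *                                      src[4], num_points);
 *   // tgt[k][i] now holds src[0][i] + src[k+1][i]
 *   for (int n = 0; n < 5; ++n) volk_free(src[n]);
 *   for (int n = 0; n < 4; ++n) volk_free(tgt[n]);
 */
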
#ifdef LV_HAVE_NEON
#include <arm_neon.h>

static inline void volk_16i_x5_add_quad_16i_x4_neon(short* target0,
                                                    short* target1,
                                                    short* target2,
                                                    short* target3,
                                                    short* src0,
                                                    short* src1,
                                                    short* src2,
                                                    short* src3,
                                                    short* src4,
                                                    unsigned int num_points)
{
    const unsigned int eighth_points = num_points / 8;
    unsigned int number = 0;

    int16x8_t src0_vec, src1_vec, src2_vec, src3_vec, src4_vec;
    int16x8_t target0_vec, target1_vec, target2_vec, target3_vec;
    /* Process eight 16-bit elements per iteration. */
    for (number = 0; number < eighth_points; ++number) {
        src0_vec = vld1q_s16(src0);
        src1_vec = vld1q_s16(src1);
        src2_vec = vld1q_s16(src2);
        src3_vec = vld1q_s16(src3);
        src4_vec = vld1q_s16(src4);

        target0_vec = vaddq_s16(src0_vec, src1_vec);
        target1_vec = vaddq_s16(src0_vec, src2_vec);
        target2_vec = vaddq_s16(src0_vec, src3_vec);
        target3_vec = vaddq_s16(src0_vec, src4_vec);

        vst1q_s16(target0, target0_vec);
        vst1q_s16(target1, target1_vec);
        vst1q_s16(target2, target2_vec);
        vst1q_s16(target3, target3_vec);
        src0 += 8;
        src1 += 8;
        src2 += 8;
        src3 += 8;
        src4 += 8;
        target0 += 8;
        target1 += 8;
        target2 += 8;
        target3 += 8;
    }

    /* Scalar tail: src0 is read four times per element, so it is only
     * advanced on its last use in each iteration. */
    for (number = eighth_points * 8; number < num_points; ++number) {
        *target0++ = *src0 + *src1++;
        *target1++ = *src0 + *src2++;
        *target2++ = *src0 + *src3++;
        *target3++ = *src0++ + *src4++;
    }
}

#endif /* LV_HAVE_NEON */

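/*
 * Note: unlike the SSE2 variant above, the NEON variant uses
 * vld1q_s16/vst1q_s16, which do not require 16-byte-aligned pointers
 * (element alignment suffices), and it advances the plain short*
 * arguments directly instead of casting to vector-pointer types.
 */
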
#ifdef LV_HAVE_GENERIC

static inline void volk_16i_x5_add_quad_16i_x4_generic(short* target0,
                                                       short* target1,
                                                       short* target2,
                                                       short* target3,
                                                       short* src0,
                                                       short* src1,
                                                       short* src2,
                                                       short* src3,
                                                       short* src4,
                                                       unsigned int num_points)
{
    const unsigned int num_bytes = num_points * 2;

    int i = 0;

    /* num_bytes is num_points * 2, so this bound is simply num_points. */
    int bound = num_bytes >> 1;

    for (i = 0; i < bound; ++i) {
        target0[i] = src0[i] + src1[i];
        target1[i] = src0[i] + src2[i];
        target2[i] = src0[i] + src3[i];
        target3[i] = src0[i] + src4[i];
    }
}

#endif /* LV_HAVE_GENERIC */

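/*
 * Minimal self-check sketch for the generic variant (main() and the
 * literal test values are made up for illustration; only this header
 * and the C standard library are assumed):
 *
 *   #include <assert.h>
 *
 *   int main(void)
 *   {
 *       short s0[4] = { 1, 2, 3, 4 };
 *       short s1[4] = { 10, 10, 10, 10 };
 *       short s2[4] = { 20, 20, 20, 20 };
 *       short s3[4] = { 30, 30, 30, 30 };
 *       short s4[4] = { 40, 40, 40, 40 };
 *       short t0[4], t1[4], t2[4], t3[4];
 *
 *       volk_16i_x5_add_quad_16i_x4_generic(t0, t1, t2, t3,
 *                                           s0, s1, s2, s3, s4, 4);
 *       assert(t0[0] == 11 && t1[1] == 22 && t2[2] == 33 && t3[3] == 44);
 *       return 0;
 *   }
 */
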
#endif /*INCLUDED_volk_16i_x5_add_quad_16i_x4_a_H*/