Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_avx2_intrinsics.h
/* -*- c++ -*- */
/*
 * Copyright 2015 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

/*
 * This file is intended to hold composite AVX2 intrinsics, i.e. helper
 * functions built from raw AVX2 intrinsics. They should be used in VOLK
 * kernels to avoid copy-paste.
 */

#ifndef INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#define INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_
#include "volk_avx_intrinsics.h" // provides _mm256_polar_deinterleave()
#include <immintrin.h>

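/*
 * Build a sign-flip mask from 8 bits packed one per byte (bytes 0..7) of
 * 'fbits': for every byte that is > 0, the corresponding 32-bit lane of the
 * result has only its sign bit (0x80000000) set; all other lanes are zero.
 * XOR-ing this mask with a vector of floats negates exactly the lanes whose
 * bit is set.
 */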
static inline __m256 _mm256_polar_sign_mask_avx2(__m128i fbits)
{
    const __m128i zeros = _mm_set1_epi8(0x00);
    const __m128i sign_extract = _mm_set1_epi8(0x80);
    const __m256i shuffle_mask = _mm256_setr_epi8(0xff, 0xff, 0xff, 0x00,
                                                  0xff, 0xff, 0xff, 0x01,
                                                  0xff, 0xff, 0xff, 0x02,
                                                  0xff, 0xff, 0xff, 0x03,
                                                  0xff, 0xff, 0xff, 0x04,
                                                  0xff, 0xff, 0xff, 0x05,
                                                  0xff, 0xff, 0xff, 0x06,
                                                  0xff, 0xff, 0xff, 0x07);
    __m256i sign_bits = _mm256_setzero_si256();

    fbits = _mm_cmpgt_epi8(fbits, zeros);
    fbits = _mm_and_si128(fbits, sign_extract);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 0);
    sign_bits = _mm256_insertf128_si256(sign_bits, fbits, 1);
    sign_bits = _mm256_shuffle_epi8(sign_bits, shuffle_mask);

    return _mm256_castsi256_ps(sign_bits);
}

static inline __m256
_mm256_polar_fsign_add_llrs_avx2(__m256 src0, __m256 src1, __m128i fbits)
{
    // prepare sign mask for correct +-
    __m256 sign_mask = _mm256_polar_sign_mask_avx2(fbits);

    __m256 llr0, llr1;
    _mm256_polar_deinterleave(&llr0, &llr1, src0, src1);

    // calculate result
    llr0 = _mm256_xor_ps(llr0, sign_mask);
    __m256 dst = _mm256_add_ps(llr0, llr1);
    return dst;
}
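
/*
 * A minimal scalar sketch of what _mm256_polar_fsign_add_llrs_avx2 computes,
 * assuming _mm256_polar_deinterleave() splits the two input vectors into the
 * even-indexed LLRs (llr0) and the odd-indexed LLRs (llr1) of 8 interleaved
 * pairs. The function and parameter names below are illustrative only and are
 * not part of VOLK.
 */
static inline void polar_fsign_add_llrs_scalar_sketch(float* dst,
                                                      const float* llr_pairs,
                                                      const signed char* fbits)
{
    for (unsigned i = 0; i < 8; ++i) {
        const float llr0 = llr_pairs[2 * i];     // even LLR of pair i
        const float llr1 = llr_pairs[2 * i + 1]; // odd LLR of pair i
        // flip the sign of llr0 wherever the corresponding fbit is set
        dst[i] = (fbits[i] > 0 ? -llr0 : llr0) + llr1;
    }
}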

static inline __m256 _mm256_magnitudesquared_ps_avx2(const __m256 cplxValue0,
                                                     const __m256 cplxValue1)
{
    const __m256i idx = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256 squared0 = _mm256_mul_ps(cplxValue0, cplxValue0); // Square the values
    const __m256 squared1 = _mm256_mul_ps(cplxValue1, cplxValue1); // Square the values
    const __m256 complex_result = _mm256_hadd_ps(squared0, squared1);
    return _mm256_permutevar8x32_ps(complex_result, idx);
}
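
/*
 * A minimal usage sketch, assuming the input is stored as interleaved re/im
 * float pairs (the usual complex-float layout): compute |z|^2 for 8 complex
 * values. The function and pointer names are illustrative, not VOLK API.
 */
static inline void magnitudesquared_8_sketch(float* result, const float* complex_in)
{
    // first 4 complex values (8 floats) and next 4 complex values (8 floats)
    const __m256 cplx0 = _mm256_loadu_ps(complex_in);
    const __m256 cplx1 = _mm256_loadu_ps(complex_in + 8);
    // re^2 + im^2 for all 8 values, returned in natural order
    const __m256 mag_sq = _mm256_magnitudesquared_ps_avx2(cplx0, cplx1);
    _mm256_storeu_ps(result, mag_sq);
}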

static inline __m256 _mm256_scaled_norm_dist_ps_avx2(const __m256 symbols0,
                                                     const __m256 symbols1,
                                                     const __m256 points0,
                                                     const __m256 points1,
                                                     const __m256 scalar)
{
    /*
     * Calculate: |y - x|^2 * SNR_lin
     * Consider 'symbolsX' and 'pointsX' to be complex float
     * 'symbolsX' are 'y' and 'pointsX' are 'x'
     */
    const __m256 diff0 = _mm256_sub_ps(symbols0, points0);
    const __m256 diff1 = _mm256_sub_ps(symbols1, points1);
    const __m256 norms = _mm256_magnitudesquared_ps_avx2(diff0, diff1);
    return _mm256_mul_ps(norms, scalar);
}
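
/*
 * Scalar sketch of the per-element result of _mm256_scaled_norm_dist_ps_avx2
 * for one complex symbol/point pair; 'scalar' is typically the linear SNR.
 * The function and parameter names are illustrative only.
 */
static inline float scaled_norm_dist_scalar_sketch(
    float sym_re, float sym_im, float point_re, float point_im, float scalar)
{
    const float diff_re = sym_re - point_re;
    const float diff_im = sym_im - point_im;
    return (diff_re * diff_re + diff_im * diff_im) * scalar; // |y - x|^2 * scalar
}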

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float max_values[8] = {0.f};
 * unsigned max_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared > max_values[j];
 *         max_values[j] = compare ? abs_squared : max_values[j];
 *         max_indices[j] = compare ? current_indices[j] : max_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_max_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined maximum values. cmp_ps(a, b) determines
     * a > b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared > max_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *max_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /* Select maximum by blending. This is the only line which differs from variant1 */
    *max_values = _mm256_blendv_ps(*max_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * max_indices = compare_mask ? current_indices : max_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
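
/*
 * A minimal usage sketch for the helper above, assuming num_points is a
 * nonzero multiple of 8 and the input is stored as interleaved re/im floats:
 * find the index of the complex value with the largest |z|^2. The function
 * name is illustrative, not the actual VOLK kernel, and the scalar tail
 * handling of the real kernels is omitted.
 */
static inline unsigned index_max_avx2_sketch(const float* src, unsigned num_points)
{
    __m256 max_values = _mm256_setzero_ps();
    __m256i max_indices = _mm256_setzero_si256();
    /* Initial indices in the order produced by _mm256_hadd_ps; see the comment
     * inside vector_32fc_index_max_variant0(). */
    __m256i current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
    const __m256i indices_increment = _mm256_set1_epi32(8);

    for (unsigned i = 0; i < num_points / 8; ++i) {
        const __m256 in0 = _mm256_loadu_ps(src);     // complex values 8*i .. 8*i+3
        const __m256 in1 = _mm256_loadu_ps(src + 8); // complex values 8*i+4 .. 8*i+7
        vector_32fc_index_max_variant0(
            in0, in1, &max_values, &max_indices, &current_indices, indices_increment);
        src += 16;
    }

    // reduce the 8 per-lane maxima to a single result
    float values[8];
    unsigned indices[8];
    _mm256_storeu_ps(values, max_values);
    _mm256_storeu_si256((__m256i*)indices, max_indices);
    unsigned best = indices[0];
    float best_value = values[0];
    for (unsigned j = 1; j < 8; ++j) {
        if (values[j] > best_value) {
            best_value = values[j];
            best = indices[j];
        }
    }
    return best;
}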

/* See _variant0 for details */
static inline void vector_32fc_index_max_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* max_values,
                                                  __m256i* max_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *max_values, _CMP_GT_OS);

    /*
     * This is the only line which differs from variant0. Using maxps instead of
     * blendvps is faster on Intel CPUs (at least on the ones tested with).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from max_indices.
     */
    *max_values = _mm256_max_ps(abs_squared, *max_values);

    *max_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*max_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/*
 * The function below vectorizes the inner loop of the following code:
 *
 * float min_values[8] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX,
 *                        FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
 * unsigned min_indices[8] = {0};
 * unsigned current_indices[8] = {0, 1, 2, 3, 4, 5, 6, 7};
 * for (unsigned i = 0; i < num_points / 8; ++i) {
 *     for (unsigned j = 0; j < 8; ++j) {
 *         float abs_squared = real(src0) * real(src0) + imag(src0) * imag(src0);
 *         bool compare = abs_squared < min_values[j];
 *         min_values[j] = compare ? abs_squared : min_values[j];
 *         min_indices[j] = compare ? current_indices[j] : min_indices[j];
 *         current_indices[j] += 8; // update for next outer loop iteration
 *         ++src0;
 *     }
 * }
 */
static inline void vector_32fc_index_min_variant0(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    /*
     * Given the vectors a = (a_7, a_6, …, a_1, a_0) and b = (b_7, b_6, …, b_1, b_0)
     * hadd_ps(a, b) computes
     * (b_7 + b_6,
     *  b_5 + b_4,
     *  ---------
     *  a_7 + a_6,
     *  a_5 + a_4,
     *  ---------
     *  b_3 + b_2,
     *  b_1 + b_0,
     *  ---------
     *  a_3 + a_2,
     *  a_1 + a_0).
     * The result is the squared absolute value of complex numbers at index
     * offsets (7, 6, 3, 2, 5, 4, 1, 0). This must be the initial value of
     * current_indices!
     */
    __m256 abs_squared = _mm256_hadd_ps(in0, in1);

    /*
     * Compare the recently computed squared absolute values with the
     * previously determined minimum values. cmp_ps(a, b) determines
     * a < b ? 0xFFFFFFFF : 0 for each element in the vectors =>
     * compare_mask = abs_squared < min_values ? 0xFFFFFFFF : 0
     *
     * If either operand is NaN, 0 is returned as an “ordered” comparison is
     * used => the blend operation will select the value from *min_values.
     */
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /* Select minimum by blending. This is the only line which differs from variant1 */
    *min_values = _mm256_blendv_ps(*min_values, abs_squared, compare_mask);

    /*
     * Update indices: blendv_ps(a, b, mask) determines mask ? b : a for
     * each element in the vectors =>
     * min_indices = compare_mask ? current_indices : min_indices
     *
     * Note: The casting of data types is required to make the compiler happy
     * and does not change values.
     */
    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    /* compute indices of complex numbers which will be loaded in the next iteration */
    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}

/* See _variant0 for details */
static inline void vector_32fc_index_min_variant1(__m256 in0,
                                                  __m256 in1,
                                                  __m256* min_values,
                                                  __m256i* min_indices,
                                                  __m256i* current_indices,
                                                  __m256i indices_increment)
{
    in0 = _mm256_mul_ps(in0, in0);
    in1 = _mm256_mul_ps(in1, in1);

    __m256 abs_squared = _mm256_hadd_ps(in0, in1);
    __m256 compare_mask = _mm256_cmp_ps(abs_squared, *min_values, _CMP_LT_OS);

    /*
     * This is the only line which differs from variant0. Using minps instead of
     * blendvps is faster on Intel CPUs (at least on the ones tested with).
     *
     * Note: The order of arguments matters if a NaN is encountered, in which
     * case the value of the second argument is selected. This is consistent
     * with the “ordered” comparison and the blend operation: the comparison
     * returns false if a NaN is encountered and the blend operation
     * consequently selects the value from min_indices.
     */
    *min_values = _mm256_min_ps(abs_squared, *min_values);

    *min_indices =
        _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(*min_indices),
                                             _mm256_castsi256_ps(*current_indices),
                                             compare_mask));

    *current_indices = _mm256_add_epi32(*current_indices, indices_increment);
}
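
/*
 * For the min variants the value accumulator has to start at FLT_MAX (not 0)
 * in every lane, otherwise no input could ever be smaller. A minimal
 * initialization sketch follows; the function name is illustrative only.
 */
#include <float.h> // FLT_MAX
static inline void init_min_search_sketch(__m256* min_values,
                                          __m256i* min_indices,
                                          __m256i* current_indices)
{
    *min_values = _mm256_set1_ps(FLT_MAX);
    *min_indices = _mm256_setzero_si256();
    // same lane order as for the max variants, see vector_32fc_index_max_variant0()
    *current_indices = _mm256_set_epi32(7, 6, 3, 2, 5, 4, 1, 0);
}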

#endif /* INCLUDE_VOLK_VOLK_AVX2_INTRINSICS_H_ */