/* -*- c++ -*- */
/*
 * Copyright 2014 Free Software Foundation, Inc.
 *
 * This file is part of GNU Radio
 *
 * GNU Radio is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3, or (at your option)
 * any later version.
 *
 * GNU Radio is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Radio; see the file COPYING. If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street,
 * Boston, MA 02110-1301, USA.
 */

#ifndef INCLUDED_volk_8u_x4_conv_k7_r2_8u_H
#define INCLUDED_volk_8u_x4_conv_k7_r2_8u_H

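/* decision_t holds the 64 survivor decision bits produced by one trellis
 * step (one bit per state). The union exposes those bits as bytes, shorts,
 * or ints so the scalar and SIMD code paths can store them at their natural
 * width. */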
typedef union {
    unsigned char /*DECISIONTYPE*/ t[64 /*NUMSTATES*/ / 8 /*DECISIONTYPE_BITSIZE*/];
    unsigned int w[64 /*NUMSTATES*/ / 32];
    unsigned short s[64 /*NUMSTATES*/ / 16];
    unsigned char c[64 /*NUMSTATES*/ / 8];
#ifdef _MSC_VER
} decision_t;
#else
} decision_t __attribute__((aligned(16)));
#endif


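/* Subtract the smallest of the 64 path metrics from all of them so the
 * unsigned 8-bit metrics cannot wrap. The threshold test is commented out
 * in the body below, so the reduction runs on every call. */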
static inline void renormalize(unsigned char* X, unsigned char threshold)
{
    int NUMSTATES = 64;
    int i;

    unsigned char min = X[0];
    // if(min > threshold) {
    for (i = 0; i < NUMSTATES; i++)
        if (min > X[i])
            min = X[i];
    for (i = 0; i < NUMSTATES; i++)
        X[i] -= min;
    //}
}


// helper BFLY for GENERIC version
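// One add-compare-select (ACS) butterfly of the K=7, rate-1/2 Viterbi
// decoder: predecessor states i and i + 32 at step s fan out to successor
// states 2i and 2i + 1. The smaller candidate metric survives into Y, and
// the two decision bits are packed into d's bit vector for the traceback.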
static inline void BFLY(int i,
                        int s,
                        unsigned char* syms,
                        unsigned char* Y,
                        unsigned char* X,
                        decision_t* d,
                        unsigned char* Branchtab)
{
    int j, decision0, decision1;
    unsigned char metric, m0, m1, m2, m3;

    int NUMSTATES = 64;
    int RATE = 2;
    int METRICSHIFT = 2;
    int PRECISIONSHIFT = 2;

    // Branch metric: accumulated (shifted) XOR distance between the received
    // symbols and the expected symbols for this state pair.
    metric = 0;
    for (j = 0; j < RATE; j++)
        metric += (Branchtab[i + j * NUMSTATES / 2] ^ syms[s * RATE + j]) >> METRICSHIFT;
    metric = metric >> PRECISIONSHIFT;

    unsigned char max = ((RATE * ((256 - 1) >> METRICSHIFT)) >> PRECISIONSHIFT);

    // Add: the four candidate path metrics of the butterfly.
    m0 = X[i] + metric;
    m1 = X[i + NUMSTATES / 2] + (max - metric);
    m2 = X[i] + (max - metric);
    m3 = X[i + NUMSTATES / 2] + metric;

    // Compare: choose the survivor for each of the two successor states.
    decision0 = (signed int)(m0 - m1) > 0;
    decision1 = (signed int)(m2 - m3) > 0;

    // Select: keep the surviving metrics ...
    Y[2 * i] = decision0 ? m1 : m0;
    Y[2 * i + 1] = decision1 ? m3 : m2;

    // ... and pack the two decision bits into the decision word for step s.
    d->w[i / (sizeof(unsigned int) * 8 / 2) +
         s * (sizeof(decision_t) / sizeof(unsigned int))] |=
        (decision0 | decision1 << 1) << ((2 * i) & (sizeof(unsigned int) * 8 - 1));
}


//#if LV_HAVE_AVX2
//
//#include <immintrin.h>
//#include <stdio.h>
//
// static inline void volk_8u_x4_conv_k7_r2_8u_avx2(unsigned char* Y,
//                                                  unsigned char* X,
//                                                  unsigned char* syms,
//                                                  unsigned char* dec,
//                                                  unsigned int framebits,
//                                                  unsigned int excess,
//                                                  unsigned char* Branchtab)
//{
//    unsigned int i9;
//    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
//        unsigned char a75, a81;
//        int a73, a92;
//        int s20, s21;
//        unsigned char *a80, *b6;
//        int *a110, *a91, *a93;
//        __m256i *a112, *a71, *a72, *a77, *a83, *a95;
//        __m256i a86, a87;
//        __m256i a76, a78, a79, a82, a84, a85, a88, a89, a90, d10, d9, m23, m24, m25,
//            m26, s18, s19, s22, s23, s24, s25, t13, t14, t15;
//        a71 = ((__m256i*)X);
//        s18 = *(a71);
//        a72 = (a71 + 1);
//        s19 = *(a72);
//        s22 = _mm256_permute2x128_si256(s18, s19, 0x20);
//        s19 = _mm256_permute2x128_si256(s18, s19, 0x31);
//        s18 = s22;
//        a73 = (4 * i9);
//        b6 = (syms + a73);
//        a75 = *(b6);
//        a76 = _mm256_set1_epi8(a75);
//        a77 = ((__m256i*)Branchtab);
//        a78 = *(a77);
//        a79 = _mm256_xor_si256(a76, a78);
//        a80 = (b6 + 1);
//        a81 = *(a80);
//        a82 = _mm256_set1_epi8(a81);
//        a83 = (a77 + 1);
//        a84 = *(a83);
//        a85 = _mm256_xor_si256(a82, a84);
//        t13 = _mm256_avg_epu8(a79, a85);
//        a86 = ((__m256i)t13);
//        a87 = _mm256_srli_epi16(a86, 2);
//        a88 = ((__m256i)a87);
//        t14 = _mm256_and_si256(a88, _mm256_set1_epi8(63));
//        t15 = _mm256_subs_epu8(_mm256_set1_epi8(63), t14);
//        m23 = _mm256_adds_epu8(s18, t14);
//        m24 = _mm256_adds_epu8(s19, t15);
//        m25 = _mm256_adds_epu8(s18, t15);
//        m26 = _mm256_adds_epu8(s19, t14);
//        a89 = _mm256_min_epu8(m24, m23);
//        d9 = _mm256_cmpeq_epi8(a89, m24);
//        a90 = _mm256_min_epu8(m26, m25);
//        d10 = _mm256_cmpeq_epi8(a90, m26);
//        s22 = _mm256_unpacklo_epi8(d9, d10);
//        s23 = _mm256_unpackhi_epi8(d9, d10);
//        s20 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x20));
//        a91 = ((int*)dec);
//        a92 = (4 * i9);
//        a93 = (a91 + a92);
//        *(a93) = s20;
//        s21 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s22, s23, 0x31));
//        a110 = (a93 + 1);
//        *(a110) = s21;
//        s22 = _mm256_unpacklo_epi8(a89, a90);
//        s23 = _mm256_unpackhi_epi8(a89, a90);
//        a95 = ((__m256i*)Y);
//        s24 = _mm256_permute2x128_si256(s22, s23, 0x20);
//        *(a95) = s24;
//        s23 = _mm256_permute2x128_si256(s22, s23, 0x31);
//        a112 = (a95 + 1);
//        *(a112) = s23;
//        if ((((unsigned char*)Y)[0] > 210)) {
//            __m256i m5, m6;
//            m5 = ((__m256i*)Y)[0];
//            m5 = _mm256_min_epu8(m5, ((__m256i*)Y)[1]);
//            __m256i m7;
//            m7 = _mm256_min_epu8(_mm256_srli_si256(m5, 8), m5);
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 32)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 16)),
//                                           ((__m256i)m7)));
//            m7 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m7, 8)),
//                                           ((__m256i)m7)));
//            m7 = _mm256_unpacklo_epi8(m7, m7);
//            m7 = _mm256_shufflelo_epi16(m7, 0);
//            m6 = _mm256_unpacklo_epi64(m7, m7);
//            m6 = _mm256_permute2x128_si256(
//                m6, m6, 0); // copy lower half of m6 to upper half, since above ops
//                            // operate on 128 bit lanes
//            ((__m256i*)Y)[0] = _mm256_subs_epu8(((__m256i*)Y)[0], m6);
//            ((__m256i*)Y)[1] = _mm256_subs_epu8(((__m256i*)Y)[1], m6);
//        }
//        unsigned char a188, a194;
//        int a205;
//        int s48, s54;
//        unsigned char *a187, *a193;
//        int *a204, *a206, *a223, *b16;
//        __m256i *a184, *a185, *a190, *a196, *a208, *a225;
//        __m256i a199, a200;
//        __m256i a189, a191, a192, a195, a197, a198, a201, a202, a203, d17, d18, m39,
//            m40, m41, m42, s46, s47, s50, s51, t25, t26, t27;
//        a184 = ((__m256i*)Y);
//        s46 = *(a184);
//        a185 = (a184 + 1);
//        s47 = *(a185);
//        s50 = _mm256_permute2x128_si256(s46, s47, 0x20);
//        s47 = _mm256_permute2x128_si256(s46, s47, 0x31);
//        s46 = s50;
//        a187 = (b6 + 2);
//        a188 = *(a187);
//        a189 = _mm256_set1_epi8(a188);
//        a190 = ((__m256i*)Branchtab);
//        a191 = *(a190);
//        a192 = _mm256_xor_si256(a189, a191);
//        a193 = (b6 + 3);
//        a194 = *(a193);
//        a195 = _mm256_set1_epi8(a194);
//        a196 = (a190 + 1);
//        a197 = *(a196);
//        a198 = _mm256_xor_si256(a195, a197);
//        t25 = _mm256_avg_epu8(a192, a198);
//        a199 = ((__m256i)t25);
//        a200 = _mm256_srli_epi16(a199, 2);
//        a201 = ((__m256i)a200);
//        t26 = _mm256_and_si256(a201, _mm256_set1_epi8(63));
//        t27 = _mm256_subs_epu8(_mm256_set1_epi8(63), t26);
//        m39 = _mm256_adds_epu8(s46, t26);
//        m40 = _mm256_adds_epu8(s47, t27);
//        m41 = _mm256_adds_epu8(s46, t27);
//        m42 = _mm256_adds_epu8(s47, t26);
//        a202 = _mm256_min_epu8(m40, m39);
//        d17 = _mm256_cmpeq_epi8(a202, m40);
//        a203 = _mm256_min_epu8(m42, m41);
//        d18 = _mm256_cmpeq_epi8(a203, m42);
//        s24 = _mm256_unpacklo_epi8(d17, d18);
//        s25 = _mm256_unpackhi_epi8(d17, d18);
//        s48 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x20));
//        a204 = ((int*)dec);
//        a205 = (4 * i9);
//        b16 = (a204 + a205);
//        a206 = (b16 + 2);
//        *(a206) = s48;
//        s54 = _mm256_movemask_epi8(_mm256_permute2x128_si256(s24, s25, 0x31));
//        a223 = (b16 + 3);
//        *(a223) = s54;
//        s50 = _mm256_unpacklo_epi8(a202, a203);
//        s51 = _mm256_unpackhi_epi8(a202, a203);
//        s25 = _mm256_permute2x128_si256(s50, s51, 0x20);
//        s51 = _mm256_permute2x128_si256(s50, s51, 0x31);
//        a208 = ((__m256i*)X);
//        *(a208) = s25;
//        a225 = (a208 + 1);
//        *(a225) = s51;
//
//        if ((((unsigned char*)X)[0] > 210)) {
//            __m256i m12, m13;
//            m12 = ((__m256i*)X)[0];
//            m12 = _mm256_min_epu8(m12, ((__m256i*)X)[1]);
//            __m256i m14;
//            m14 = _mm256_min_epu8(_mm256_srli_si256(m12, 8), m12);
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 32)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 16)),
//                                            ((__m256i)m14)));
//            m14 = ((__m256i)_mm256_min_epu8(((__m256i)_mm256_srli_epi64(m14, 8)),
//                                            ((__m256i)m14)));
//            m14 = _mm256_unpacklo_epi8(m14, m14);
//            m14 = _mm256_shufflelo_epi16(m14, 0);
//            m13 = _mm256_unpacklo_epi64(m14, m14);
//            m13 = _mm256_permute2x128_si256(m13, m13, 0);
//            ((__m256i*)X)[0] = _mm256_subs_epu8(((__m256i*)X)[0], m13);
//            ((__m256i*)X)[1] = _mm256_subs_epu8(((__m256i*)X)[1], m13);
//        }
//    }
//
//    renormalize(X, 210);
//
//    unsigned int j;
//    for (j = 0; j < (framebits + excess) % 2; ++j) {
//        int i;
//        for (i = 0; i < 64 / 2; i++) {
//            BFLY(i,
//                 (((framebits + excess) >> 1) << 1) + j,
//                 syms,
//                 Y,
//                 X,
//                 (decision_t*)dec,
//                 Branchtab);
//        }
//
//        renormalize(Y, 210);
//    }
//    /*skip*/
//}
//
//#endif /*LV_HAVE_AVX2*/


#if LV_HAVE_SSE3

#include <emmintrin.h>
#include <mmintrin.h>
#include <pmmintrin.h>
#include <stdio.h>
#include <xmmintrin.h>

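// Spiral-generated SSE3 kernel, hence the opaque temporaries. Each loop
// iteration decodes two trellis steps: four 16-wide butterfly passes that
// ping-pong the metrics X -> Y -> X, with an inline renormalization whenever
// state 0's metric exceeds 210. Any odd trailing step is handled by the
// scalar BFLY loop at the end.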
static inline void volk_8u_x4_conv_k7_r2_8u_spiral(unsigned char* Y,
                                                   unsigned char* X,
                                                   unsigned char* syms,
                                                   unsigned char* dec,
                                                   unsigned int framebits,
                                                   unsigned int excess,
                                                   unsigned char* Branchtab)
{
    unsigned int i9;
    for (i9 = 0; i9 < ((framebits + excess) >> 1); i9++) {
        unsigned char a75, a81;
        int a73, a92;
        short int s20, s21, s26, s27;
        unsigned char *a74, *a80, *b6;
        short int *a110, *a111, *a91, *a93, *a94;
        __m128i *a102, *a112, *a113, *a71, *a72, *a77, *a83, *a95, *a96, *a97, *a98, *a99;
        __m128i a105, a106, a86, a87;
        __m128i a100, a101, a103, a104, a107, a108, a109, a76, a78, a79, a82, a84, a85,
            a88, a89, a90, d10, d11, d12, d9, m23, m24, m25, m26, m27, m28, m29, m30, s18,
            s19, s22, s23, s24, s25, s28, s29, t13, t14, t15, t16, t17, t18;
        // First trellis step of the pair: read metrics from X, write to Y.
        a71 = ((__m128i*)X);
        s18 = *(a71);
        a72 = (a71 + 2);
        s19 = *(a72);
        a73 = (4 * i9);
        a74 = (syms + a73);
        a75 = *(a74);
        a76 = _mm_set1_epi8(a75);
        a77 = ((__m128i*)Branchtab);
        a78 = *(a77);
        a79 = _mm_xor_si128(a76, a78);
        b6 = (a73 + syms);
        a80 = (b6 + 1);
        a81 = *(a80);
        a82 = _mm_set1_epi8(a81);
        a83 = (a77 + 2);
        a84 = *(a83);
        a85 = _mm_xor_si128(a82, a84);
        t13 = _mm_avg_epu8(a79, a85);
        a86 = ((__m128i)t13);
        a87 = _mm_srli_epi16(a86, 2);
        a88 = ((__m128i)a87);
        t14 = _mm_and_si128(
            a88,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t15 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t14);
        m23 = _mm_adds_epu8(s18, t14);
        m24 = _mm_adds_epu8(s19, t15);
        m25 = _mm_adds_epu8(s18, t15);
        m26 = _mm_adds_epu8(s19, t14);
        a89 = _mm_min_epu8(m24, m23);
        d9 = _mm_cmpeq_epi8(a89, m24);
        a90 = _mm_min_epu8(m26, m25);
        d10 = _mm_cmpeq_epi8(a90, m26);
        s20 = _mm_movemask_epi8(_mm_unpacklo_epi8(d9, d10));
        a91 = ((short int*)dec);
        a92 = (8 * i9);
        a93 = (a91 + a92);
        *(a93) = s20;
        s21 = _mm_movemask_epi8(_mm_unpackhi_epi8(d9, d10));
        a94 = (a93 + 1);
        *(a94) = s21;
        s22 = _mm_unpacklo_epi8(a89, a90);
        s23 = _mm_unpackhi_epi8(a89, a90);
        a95 = ((__m128i*)Y);
        *(a95) = s22;
        a96 = (a95 + 1);
        *(a96) = s23;
        // Butterflies 16..31 of the same step.
        a97 = (a71 + 1);
        s24 = *(a97);
        a98 = (a71 + 3);
        s25 = *(a98);
        a99 = (a77 + 1);
        a100 = *(a99);
        a101 = _mm_xor_si128(a76, a100);
        a102 = (a77 + 3);
        a103 = *(a102);
        a104 = _mm_xor_si128(a82, a103);
        t16 = _mm_avg_epu8(a101, a104);
        a105 = ((__m128i)t16);
        a106 = _mm_srli_epi16(a105, 2);
        a107 = ((__m128i)a106);
        t17 = _mm_and_si128(
            a107,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t18 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t17);
        m27 = _mm_adds_epu8(s24, t17);
        m28 = _mm_adds_epu8(s25, t18);
        m29 = _mm_adds_epu8(s24, t18);
        m30 = _mm_adds_epu8(s25, t17);
        a108 = _mm_min_epu8(m28, m27);
        d11 = _mm_cmpeq_epi8(a108, m28);
        a109 = _mm_min_epu8(m30, m29);
        d12 = _mm_cmpeq_epi8(a109, m30);
        s26 = _mm_movemask_epi8(_mm_unpacklo_epi8(d11, d12));
        a110 = (a93 + 2);
        *(a110) = s26;
        s27 = _mm_movemask_epi8(_mm_unpackhi_epi8(d11, d12));
        a111 = (a93 + 3);
        *(a111) = s27;
        s28 = _mm_unpacklo_epi8(a108, a109);
        s29 = _mm_unpackhi_epi8(a108, a109);
        a112 = (a95 + 2);
        *(a112) = s28;
        a113 = (a95 + 3);
        *(a113) = s29;
        // Renormalize Y: if state 0's metric exceeds 210, subtract the
        // minimum metric from all 64 states to avoid 8-bit saturation.
        if ((((unsigned char*)Y)[0] > 210)) {
            __m128i m5, m6;
            m5 = ((__m128i*)Y)[0];
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[1]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[2]);
            m5 = _mm_min_epu8(m5, ((__m128i*)Y)[3]);
            __m128i m7;
            m7 = _mm_min_epu8(_mm_srli_si128(m5, 8), m5);
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 32)), ((__m128i)m7)));
            m7 =
                ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 16)), ((__m128i)m7)));
            m7 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m7, 8)), ((__m128i)m7)));
            m7 = _mm_unpacklo_epi8(m7, m7);
            m7 = _mm_shufflelo_epi16(m7, _MM_SHUFFLE(0, 0, 0, 0));
            m6 = _mm_unpacklo_epi64(m7, m7);
            ((__m128i*)Y)[0] = _mm_subs_epu8(((__m128i*)Y)[0], m6);
            ((__m128i*)Y)[1] = _mm_subs_epu8(((__m128i*)Y)[1], m6);
            ((__m128i*)Y)[2] = _mm_subs_epu8(((__m128i*)Y)[2], m6);
            ((__m128i*)Y)[3] = _mm_subs_epu8(((__m128i*)Y)[3], m6);
        }
        // Second trellis step of the pair: read metrics from Y, write to X.
        unsigned char a188, a194;
        int a186, a205;
        short int s48, s49, s54, s55;
        unsigned char *a187, *a193, *b15;
        short int *a204, *a206, *a207, *a223, *a224, *b16;
        __m128i *a184, *a185, *a190, *a196, *a208, *a209, *a210, *a211, *a212, *a215,
            *a225, *a226;
        __m128i a199, a200, a218, a219;
        __m128i a189, a191, a192, a195, a197, a198, a201, a202, a203, a213, a214, a216,
            a217, a220, a221, a222, d17, d18, d19, d20, m39, m40, m41, m42, m43, m44, m45,
            m46, s46, s47, s50, s51, s52, s53, s56, s57, t25, t26, t27, t28, t29, t30;
        a184 = ((__m128i*)Y);
        s46 = *(a184);
        a185 = (a184 + 2);
        s47 = *(a185);
        a186 = (4 * i9);
        b15 = (a186 + syms);
        a187 = (b15 + 2);
        a188 = *(a187);
        a189 = _mm_set1_epi8(a188);
        a190 = ((__m128i*)Branchtab);
        a191 = *(a190);
        a192 = _mm_xor_si128(a189, a191);
        a193 = (b15 + 3);
        a194 = *(a193);
        a195 = _mm_set1_epi8(a194);
        a196 = (a190 + 2);
        a197 = *(a196);
        a198 = _mm_xor_si128(a195, a197);
        t25 = _mm_avg_epu8(a192, a198);
        a199 = ((__m128i)t25);
        a200 = _mm_srli_epi16(a199, 2);
        a201 = ((__m128i)a200);
        t26 = _mm_and_si128(
            a201,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t27 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t26);
        m39 = _mm_adds_epu8(s46, t26);
        m40 = _mm_adds_epu8(s47, t27);
        m41 = _mm_adds_epu8(s46, t27);
        m42 = _mm_adds_epu8(s47, t26);
        a202 = _mm_min_epu8(m40, m39);
        d17 = _mm_cmpeq_epi8(a202, m40);
        a203 = _mm_min_epu8(m42, m41);
        d18 = _mm_cmpeq_epi8(a203, m42);
        s48 = _mm_movemask_epi8(_mm_unpacklo_epi8(d17, d18));
        a204 = ((short int*)dec);
        a205 = (8 * i9);
        b16 = (a204 + a205);
        a206 = (b16 + 4);
        *(a206) = s48;
        s49 = _mm_movemask_epi8(_mm_unpackhi_epi8(d17, d18));
        a207 = (b16 + 5);
        *(a207) = s49;
        s50 = _mm_unpacklo_epi8(a202, a203);
        s51 = _mm_unpackhi_epi8(a202, a203);
        a208 = ((__m128i*)X);
        *(a208) = s50;
        a209 = (a208 + 1);
        *(a209) = s51;
        a210 = (a184 + 1);
        s52 = *(a210);
        a211 = (a184 + 3);
        s53 = *(a211);
        a212 = (a190 + 1);
        a213 = *(a212);
        a214 = _mm_xor_si128(a189, a213);
        a215 = (a190 + 3);
        a216 = *(a215);
        a217 = _mm_xor_si128(a195, a216);
        t28 = _mm_avg_epu8(a214, a217);
        a218 = ((__m128i)t28);
        a219 = _mm_srli_epi16(a218, 2);
        a220 = ((__m128i)a219);
        t29 = _mm_and_si128(
            a220,
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63));
        t30 = _mm_subs_epu8(
            _mm_set_epi8(63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63),
            t29);
        m43 = _mm_adds_epu8(s52, t29);
        m44 = _mm_adds_epu8(s53, t30);
        m45 = _mm_adds_epu8(s52, t30);
        m46 = _mm_adds_epu8(s53, t29);
        a221 = _mm_min_epu8(m44, m43);
        d19 = _mm_cmpeq_epi8(a221, m44);
        a222 = _mm_min_epu8(m46, m45);
        d20 = _mm_cmpeq_epi8(a222, m46);
        s54 = _mm_movemask_epi8(_mm_unpacklo_epi8(d19, d20));
        a223 = (b16 + 6);
        *(a223) = s54;
        s55 = _mm_movemask_epi8(_mm_unpackhi_epi8(d19, d20));
        a224 = (b16 + 7);
        *(a224) = s55;
        s56 = _mm_unpacklo_epi8(a221, a222);
        s57 = _mm_unpackhi_epi8(a221, a222);
        a225 = (a208 + 2);
        *(a225) = s56;
        a226 = (a208 + 3);
        *(a226) = s57;
        // Same conditional renormalization, applied to the X metrics.
        if ((((unsigned char*)X)[0] > 210)) {
            __m128i m12, m13;
            m12 = ((__m128i*)X)[0];
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[1]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[2]);
            m12 = _mm_min_epu8(m12, ((__m128i*)X)[3]);
            __m128i m14;
            m14 = _mm_min_epu8(_mm_srli_si128(m12, 8), m12);
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 32)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 16)),
                                         ((__m128i)m14)));
            m14 = ((__m128i)_mm_min_epu8(((__m128i)_mm_srli_epi64(m14, 8)),
                                         ((__m128i)m14)));
            m14 = _mm_unpacklo_epi8(m14, m14);
            m14 = _mm_shufflelo_epi16(m14, _MM_SHUFFLE(0, 0, 0, 0));
            m13 = _mm_unpacklo_epi64(m14, m14);
            ((__m128i*)X)[0] = _mm_subs_epu8(((__m128i*)X)[0], m13);
            ((__m128i*)X)[1] = _mm_subs_epu8(((__m128i*)X)[1], m13);
            ((__m128i*)X)[2] = _mm_subs_epu8(((__m128i*)X)[2], m13);
            ((__m128i*)X)[3] = _mm_subs_epu8(((__m128i*)X)[3], m13);
        }
    }

    renormalize(X, 210);

    /*int ch;
    for(ch = 0; ch < 64; ch++) {
        printf("%d,", X[ch]);
    }
    printf("\n");*/

    // Handle the odd trailing step, if any, with the scalar butterfly.
    unsigned int j;
    for (j = 0; j < (framebits + excess) % 2; ++j) {
        int i;
        for (i = 0; i < 64 / 2; i++) {
            BFLY(i,
                 (((framebits + excess) >> 1) << 1) + j,
                 syms,
                 Y,
                 X,
                 (decision_t*)dec,
                 Branchtab);
        }

        renormalize(Y, 210);

        /*printf("\n");
        for(ch = 0; ch < 64; ch++) {
            printf("%d,", Y[ch]);
        }
        printf("\n");*/
    }
    /*skip*/
}

#endif /*LV_HAVE_SSE3*/


#if LV_HAVE_GENERIC

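// Portable reference implementation: for each of the framebits + excess
// steps, run all 32 scalar butterflies, renormalize, then swap the old and
// new metric buffers.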
static inline void volk_8u_x4_conv_k7_r2_8u_generic(unsigned char* Y,
                                                    unsigned char* X,
                                                    unsigned char* syms,
                                                    unsigned char* dec,
                                                    unsigned int framebits,
                                                    unsigned int excess,
                                                    unsigned char* Branchtab)
{
    int nbits = framebits + excess;
    int NUMSTATES = 64;
    int RENORMALIZE_THRESHOLD = 210;

    int s, i;
    for (s = 0; s < nbits; s++) {
        void* tmp;
        for (i = 0; i < NUMSTATES / 2; i++) {
            BFLY(i, s, syms, Y, X, (decision_t*)dec, Branchtab);
        }

        renormalize(Y, RENORMALIZE_THRESHOLD);

        // Swap pointers to old and new metrics.
        tmp = (void*)X;
        X = Y;
        Y = (unsigned char*)tmp;
    }
}

#endif /* LV_HAVE_GENERIC */

#endif /*INCLUDED_volk_8u_x4_conv_k7_r2_8u_H*/
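

/* A minimal usage sketch (not part of the original header), assuming
 * LV_HAVE_GENERIC is enabled: one plausible way to drive the kernel. The
 * polynomial pair 0x6d/0x4f, the helper names, and the all-zero metric
 * initialization are illustrative assumptions; a real decoder (e.g. gr-fec's
 * cc_decoder) also biases the known start state and runs a traceback over
 * dec, which is not shown. dec must be zero-initialized by the caller and
 * hold 8 bytes (one decision_t) per decoded bit.
 */
#include <string.h>

static int example_parity(unsigned int x) /* hypothetical helper */
{
    x ^= x >> 16;
    x ^= x >> 8;
    x ^= x >> 4;
    x ^= x >> 2;
    x ^= x >> 1;
    return (int)(x & 1);
}

static void example_decode(unsigned char* syms, unsigned char* dec, unsigned int framebits)
{
    const unsigned int polys[2] = { 0x6d, 0x4f }; /* assumed K=7, r=1/2 polynomials */
    unsigned char Branchtab[2 * 32];
    unsigned char X[64];
    unsigned char Y[64];
    int state, j;

    /* Expected encoder output per (polynomial, state) pair: 0 or 255. */
    for (j = 0; j < 2; j++)
        for (state = 0; state < 32; state++)
            Branchtab[j * 32 + state] =
                example_parity((2 * state) & polys[j]) ? 255 : 0;

    memset(X, 0, sizeof(X)); /* all paths start equally likely (assumption) */
    memset(Y, 0, sizeof(Y));

    /* excess = K - 1 = 6 tail bits that flush the encoder at frame end */
    volk_8u_x4_conv_k7_r2_8u_generic(Y, X, syms, dec, framebits, 6, Branchtab);
}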