#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_
#include <string.h>
static inline unsigned int log2_of_power_of_2(unsigned int val)
{
    // integer log2 of a power of two, via parallel bit tests
    static const unsigned int b[] = {
        0xAAAAAAAA, 0xCCCCCCCC, 0xF0F0F0F0, 0xFF00FF00, 0xFFFF0000
    };

    unsigned int res = (val & b[0]) != 0;
    res |= ((val & b[4]) != 0) << 4;
    res |= ((val & b[3]) != 0) << 3;
    res |= ((val & b[2]) != 0) << 2;
    res |= ((val & b[1]) != 0) << 1;
    return res;
}

static inline void encodepolar_single_stage(unsigned char* frame_ptr,
                                            const unsigned char* temp_ptr,
                                            const unsigned int num_branches,
                                            const unsigned int frame_half)
{
    unsigned int branch, bit;
    for (branch = 0; branch < num_branches; ++branch) {
        for (bit = 0; bit < frame_half; ++bit) {
            // butterfly: upper output is u XOR v, lower output is v
            *frame_ptr = *temp_ptr ^ *(temp_ptr + 1);
            *(frame_ptr + frame_half) = *(temp_ptr + 1);
            ++frame_ptr;
            temp_ptr += 2;
        }
        frame_ptr += frame_half;
    }
}

#ifdef LV_HAVE_GENERIC

static inline void volk_8u_x2_encodeframepolar_8u_generic(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    unsigned int stage = log2_of_power_of_2(frame_size);
    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;

    while (stage) {
        // encode the current stage, then copy the result back as input for the next one
        encodepolar_single_stage(frame, temp, num_branches, frame_half);
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        // update the stage parameters
        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }
}
#endif /* LV_HAVE_GENERIC */
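/* Usage sketch (added for illustration, not part of the original header): encode one
 * 16-element frame with the generic kernel. The caller owns both buffers; "temp" holds
 * the unpacked input bits (one bit per byte) and is clobbered as working storage, while
 * the encoded codeword is written to "frame". Applications would normally call the VOLK
 * dispatcher volk_8u_x2_encodeframepolar_8u() instead of a specific implementation.
 *
 *   unsigned char temp[16] = { 0 };
 *   unsigned char frame[16] = { 0 };
 *   temp[0] = 1; // a single set input bit
 *   volk_8u_x2_encodeframepolar_8u_generic(frame, temp, 16);
 *   // frame[] now contains the polar-encoded codeword.
 */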

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // apply the shifted XOR only at even byte positions (the 'u' slots of each pair)
    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    // registers and the shuffle that separates XORed and copied bytes
    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // large stages: every branch half still spans at least one full 16-byte vector
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;

        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp1 = _mm_loadu_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted = _mm_srli_si128(r_temp0, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp0 = _mm_xor_si128(shifted, r_temp0);
                r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm_srli_si128(r_temp1, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp1 = _mm_xor_si128(shifted, r_temp1);
                r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);

                r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 16;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    // The remaining stages (4 .. 1) fit into a single 16-byte register per branch.
    // Reset the pointers to the start of the buffers.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_loadu_si128((__m128i*)temp_ptr);
        temp_ptr += 16;

        // reorder the bytes so the four in-register stages become shift/mask/xor steps
        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_storeu_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */
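/* Note on the loop above (added comment): after the shuffle_stage4 permutation, each
 * byte's butterfly partner for stages 4, 3, 2 and 1 lies 8, 4, 2 and 1 byte positions
 * higher in the register, so every remaining stage reduces to one byte shift, one mask
 * and one XOR without any further memory traffic. */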

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_u_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // 256-bit and 128-bit variants of the alternating stage-1 mask
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // large stages, processed 32 bytes at a time where possible
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;

        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) { // only 16 bytes left in this branch half
                    r_temp2 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp3 = _mm_loadu_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted2 = _mm_srli_si128(r_temp2, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                    r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                    shifted2 = _mm_srli_si128(r_temp3, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                    r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                    r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                    _mm_storeu_si128((__m128i*)frame_ptr, r_frame2);

                    r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                    _mm_storeu_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                    frame_ptr += 16;
                    break; // a 16-byte branch half is handled completely
                }

                r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_loadu_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_storeu_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    // The remaining stages fit into the 128-bit lanes; reset to the buffer starts.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);
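    /* The loop below runs num_branches / 2 times because a 256-bit register holds two
     * adjacent 16-byte branches, and _mm256_srli_si256() shifts each 128-bit lane
     * independently, so both branches finish their last four stages per iteration
     * (added comment). */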

    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_loadu_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_storeu_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_U_H_ */

#ifndef VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#define VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_
#include <string.h>

#ifdef LV_HAVE_SSSE3
#include <tmmintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_ssse3(unsigned char* frame,
                                                          unsigned char* temp,
                                                          unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    const __m128i mask_stage1 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m128i r_frame0, r_temp0, shifted;
    __m128i r_frame1, r_temp1;
    const __m128i shuffle_separate =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;

        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 16) {
                r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;
                r_temp1 = _mm_load_si128((__m128i*)temp_ptr);
                temp_ptr += 16;

                shifted = _mm_srli_si128(r_temp0, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp0 = _mm_xor_si128(shifted, r_temp0);
                r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm_srli_si128(r_temp1, 1);
                shifted = _mm_and_si128(shifted, mask_stage1);
                r_temp1 = _mm_xor_si128(shifted, r_temp1);
                r_temp1 = _mm_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm_unpacklo_epi64(r_temp0, r_temp1);
                _mm_store_si128((__m128i*)frame_ptr, r_frame0);

                r_frame1 = _mm_unpackhi_epi64(r_temp0, r_temp1);
                _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 16;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    // The remaining stages (4 .. 1) fit into a single 16-byte register per branch.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m128i shuffle_stage4 =
        _mm_setr_epi8(0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m128i mask_stage4 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                                             0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage3 = _mm_set_epi8(0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
                                             0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m128i mask_stage2 = _mm_set_epi8(0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
                                             0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    for (branch = 0; branch < num_branches; ++branch) {
        r_temp0 = _mm_load_si128((__m128i*)temp_ptr);
        temp_ptr += 16;

        r_temp0 = _mm_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm_srli_si128(r_temp0, 8);
        shifted = _mm_and_si128(shifted, mask_stage4);
        r_frame0 = _mm_xor_si128(shifted, r_temp0);

        shifted = _mm_srli_si128(r_frame0, 4);
        shifted = _mm_and_si128(shifted, mask_stage3);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 2);
        shifted = _mm_and_si128(shifted, mask_stage2);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        shifted = _mm_srli_si128(r_frame0, 1);
        shifted = _mm_and_si128(shifted, mask_stage1);
        r_frame0 = _mm_xor_si128(shifted, r_frame0);

        _mm_store_si128((__m128i*)frame_ptr, r_frame0);
        frame_ptr += 16;
    }
}
#endif /* LV_HAVE_SSSE3 */
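/* Alignment note with a small sketch (added, not from the original header): the _a_
 * variants use _mm_load_si128/_mm_store_si128 and therefore need buffers aligned to the
 * machine alignment (16 bytes here, 32 for AVX2). VOLK's allocator provides that:
 *
 *   size_t alignment = volk_get_alignment();
 *   unsigned char* frame = (unsigned char*)volk_malloc(frame_size, alignment);
 *   unsigned char* temp = (unsigned char*)volk_malloc(frame_size, alignment);
 *   // ... fill temp, call volk_8u_x2_encodeframepolar_8u(frame, temp, frame_size) ...
 *   volk_free(temp);
 *   volk_free(frame);
 */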

#ifdef LV_HAVE_AVX2
#include <immintrin.h>

static inline void volk_8u_x2_encodeframepolar_8u_a_avx2(unsigned char* frame,
                                                         unsigned char* temp,
                                                         unsigned int frame_size)
{
    const unsigned int po2 = log2_of_power_of_2(frame_size);

    unsigned int stage = po2;
    unsigned char* frame_ptr = frame;
    unsigned char* temp_ptr = temp;

    unsigned int frame_half = frame_size >> 1;
    unsigned int num_branches = 1;
    unsigned int branch, bit;

    // 256-bit and 128-bit variants of the alternating stage-1 mask
    const __m256i mask_stage1 = _mm256_set_epi8(
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
        0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);
    const __m128i mask_stage0 = _mm_set_epi8(0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF,
                                             0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF, 0x0, 0xFF);

    __m256i r_frame0, r_temp0, shifted;
    __m128i r_temp2, r_frame2, shifted2;
    __m256i r_frame1, r_temp1;
    __m128i r_frame3, r_temp3;
    const __m256i shuffle_separate = _mm256_setr_epi8(
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
        0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);
    const __m128i shuffle_separate128 =
        _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15);

    // large stages, processed 32 bytes at a time where possible
    while (stage > 4) {
        frame_ptr = frame;
        temp_ptr = temp;

        for (branch = 0; branch < num_branches; ++branch) {
            for (bit = 0; bit < frame_half; bit += 32) {
                if ((frame_half - bit) < 32) { // only 16 bytes left in this branch half
                    r_temp2 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;
                    r_temp3 = _mm_load_si128((__m128i*)temp_ptr);
                    temp_ptr += 16;

                    shifted2 = _mm_srli_si128(r_temp2, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp2 = _mm_xor_si128(shifted2, r_temp2);
                    r_temp2 = _mm_shuffle_epi8(r_temp2, shuffle_separate128);

                    shifted2 = _mm_srli_si128(r_temp3, 1);
                    shifted2 = _mm_and_si128(shifted2, mask_stage0);
                    r_temp3 = _mm_xor_si128(shifted2, r_temp3);
                    r_temp3 = _mm_shuffle_epi8(r_temp3, shuffle_separate128);

                    r_frame2 = _mm_unpacklo_epi64(r_temp2, r_temp3);
                    _mm_store_si128((__m128i*)frame_ptr, r_frame2);

                    r_frame3 = _mm_unpackhi_epi64(r_temp2, r_temp3);
                    _mm_store_si128((__m128i*)(frame_ptr + frame_half), r_frame3);
                    frame_ptr += 16;
                    break; // a 16-byte branch half is handled completely
                }

                r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;
                r_temp1 = _mm256_load_si256((__m256i*)temp_ptr);
                temp_ptr += 32;

                shifted = _mm256_srli_si256(r_temp0, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp0 = _mm256_xor_si256(shifted, r_temp0);
                r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_separate);

                shifted = _mm256_srli_si256(r_temp1, 1);
                shifted = _mm256_and_si256(shifted, mask_stage1);
                r_temp1 = _mm256_xor_si256(shifted, r_temp1);
                r_temp1 = _mm256_shuffle_epi8(r_temp1, shuffle_separate);

                r_frame0 = _mm256_unpacklo_epi64(r_temp0, r_temp1);
                r_temp1 = _mm256_unpackhi_epi64(r_temp0, r_temp1);
                r_frame0 = _mm256_permute4x64_epi64(r_frame0, 0xd8);
                r_frame1 = _mm256_permute4x64_epi64(r_temp1, 0xd8);

                _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
                _mm256_store_si256((__m256i*)(frame_ptr + frame_half), r_frame1);
                frame_ptr += 32;
            }
            frame_ptr += frame_half;
        }
        memcpy(temp, frame, sizeof(unsigned char) * frame_size);

        num_branches = num_branches << 1;
        frame_half = frame_half >> 1;
        --stage;
    }

    // The remaining stages fit into the 128-bit lanes; reset to the buffer starts.
    frame_ptr = frame;
    temp_ptr = temp;

    const __m256i shuffle_stage4 = _mm256_setr_epi8(
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
        0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15);
    const __m256i mask_stage4 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage3 = _mm256_set_epi8(
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF,
        0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF, 0x0, 0x0, 0x0, 0x0, 0xFF, 0xFF, 0xFF, 0xFF);
    const __m256i mask_stage2 = _mm256_set_epi8(
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF,
        0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF, 0x0, 0x0, 0xFF, 0xFF);

    // one 256-bit register finishes the last four stages of two 16-byte branches
    for (branch = 0; branch < num_branches / 2; ++branch) {
        r_temp0 = _mm256_load_si256((__m256i*)temp_ptr);
        temp_ptr += 32;

        r_temp0 = _mm256_shuffle_epi8(r_temp0, shuffle_stage4);

        shifted = _mm256_srli_si256(r_temp0, 8);
        shifted = _mm256_and_si256(shifted, mask_stage4);
        r_frame0 = _mm256_xor_si256(shifted, r_temp0);

        shifted = _mm256_srli_si256(r_frame0, 4);
        shifted = _mm256_and_si256(shifted, mask_stage3);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 2);
        shifted = _mm256_and_si256(shifted, mask_stage2);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        shifted = _mm256_srli_si256(r_frame0, 1);
        shifted = _mm256_and_si256(shifted, mask_stage1);
        r_frame0 = _mm256_xor_si256(shifted, r_frame0);

        _mm256_store_si256((__m256i*)frame_ptr, r_frame0);
        frame_ptr += 32;
    }
}
#endif /* LV_HAVE_AVX2 */
#endif /* VOLK_KERNELS_VOLK_VOLK_8U_X2_ENCODEFRAMEPOLAR_8U_A_H_ */