Vector Optimized Library of Kernels 2.5.1
Architecture-tuned implementations of math kernels
volk_32u_reverse_32u.h
Go to the documentation of this file.
/* -*- c++ -*- */
/*
  Copyright (C) 2018 Free Software Foundation, Inc.

  This file is part of libVOLK

  All rights reserved.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU Lesser General Public License version 2.1, as
  published by the Free Software Foundation. This program is
  distributed in the hope that it will be useful, but WITHOUT ANY
  WARRANTY; without even the implied warranty of MERCHANTABILITY or
  FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
  License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
20
41#ifndef INCLUDED_VOLK_32u_REVERSE_32u_U_H
/*
 * View of a 32-bit quantity as 32 individually named one-bit fields
 * (b00 .. b31).
 *
 * NOTE: which numeric bit each field occupies, and whether a plain `int`
 * bit-field is signed, are implementation-defined in C. A signed one-bit
 * field can only hold 0 and -1, so treat the fields as "zero / non-zero".
 */
struct dword_split {
    int b00 : 1;
    int b01 : 1;
    int b02 : 1;
    int b03 : 1;
    int b04 : 1;
    int b05 : 1;
    int b06 : 1;
    int b07 : 1;
    int b08 : 1;
    int b09 : 1;
    int b10 : 1;
    int b11 : 1;
    int b12 : 1;
    int b13 : 1;
    int b14 : 1;
    int b15 : 1;
    int b16 : 1;
    int b17 : 1;
    int b18 : 1;
    int b19 : 1;
    int b20 : 1;
    int b21 : 1;
    int b22 : 1;
    int b23 : 1;
    int b24 : 1;
    int b25 : 1;
    int b26 : 1;
    int b27 : 1;
    int b28 : 1;
    int b29 : 1;
    int b30 : 1;
    int b31 : 1;
};
/*
 * View of a single byte as eight individually named one-bit fields
 * (b00 .. b07).
 *
 * NOTE: `uint8_t` as a bit-field type and the mapping of fields to numeric
 * bits are implementation-defined in C; the field types are kept as they
 * are so the structure layout does not change.
 */
struct char_split {
    uint8_t b00 : 1;
    uint8_t b01 : 1;
    uint8_t b02 : 1;
    uint8_t b03 : 1;
    uint8_t b04 : 1;
    uint8_t b05 : 1;
    uint8_t b06 : 1;
    uint8_t b07 : 1;
};
86
// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
//
// BitReverseTable256[v] is the byte v with its eight bits in reverse order.
// Laid out 16 entries per row: row r holds the entries for inputs
// 16 * r .. 16 * r + 15. The explicit bound makes the compiler reject any
// extra initializer.
static const unsigned char BitReverseTable256[256] = {
    0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0, 0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0, 0x30, 0xB0, 0x70, 0xF0,
    0x08, 0x88, 0x48, 0xC8, 0x28, 0xA8, 0x68, 0xE8, 0x18, 0x98, 0x58, 0xD8, 0x38, 0xB8, 0x78, 0xF8,
    0x04, 0x84, 0x44, 0xC4, 0x24, 0xA4, 0x64, 0xE4, 0x14, 0x94, 0x54, 0xD4, 0x34, 0xB4, 0x74, 0xF4,
    0x0C, 0x8C, 0x4C, 0xCC, 0x2C, 0xAC, 0x6C, 0xEC, 0x1C, 0x9C, 0x5C, 0xDC, 0x3C, 0xBC, 0x7C, 0xFC,
    0x02, 0x82, 0x42, 0xC2, 0x22, 0xA2, 0x62, 0xE2, 0x12, 0x92, 0x52, 0xD2, 0x32, 0xB2, 0x72, 0xF2,
    0x0A, 0x8A, 0x4A, 0xCA, 0x2A, 0xAA, 0x6A, 0xEA, 0x1A, 0x9A, 0x5A, 0xDA, 0x3A, 0xBA, 0x7A, 0xFA,
    0x06, 0x86, 0x46, 0xC6, 0x26, 0xA6, 0x66, 0xE6, 0x16, 0x96, 0x56, 0xD6, 0x36, 0xB6, 0x76, 0xF6,
    0x0E, 0x8E, 0x4E, 0xCE, 0x2E, 0xAE, 0x6E, 0xEE, 0x1E, 0x9E, 0x5E, 0xDE, 0x3E, 0xBE, 0x7E, 0xFE,
    0x01, 0x81, 0x41, 0xC1, 0x21, 0xA1, 0x61, 0xE1, 0x11, 0x91, 0x51, 0xD1, 0x31, 0xB1, 0x71, 0xF1,
    0x09, 0x89, 0x49, 0xC9, 0x29, 0xA9, 0x69, 0xE9, 0x19, 0x99, 0x59, 0xD9, 0x39, 0xB9, 0x79, 0xF9,
    0x05, 0x85, 0x45, 0xC5, 0x25, 0xA5, 0x65, 0xE5, 0x15, 0x95, 0x55, 0xD5, 0x35, 0xB5, 0x75, 0xF5,
    0x0D, 0x8D, 0x4D, 0xCD, 0x2D, 0xAD, 0x6D, 0xED, 0x1D, 0x9D, 0x5D, 0xDD, 0x3D, 0xBD, 0x7D, 0xFD,
    0x03, 0x83, 0x43, 0xC3, 0x23, 0xA3, 0x63, 0xE3, 0x13, 0x93, 0x53, 0xD3, 0x33, 0xB3, 0x73, 0xF3,
    0x0B, 0x8B, 0x4B, 0xCB, 0x2B, 0xAB, 0x6B, 0xEB, 0x1B, 0x9B, 0x5B, 0xDB, 0x3B, 0xBB, 0x7B, 0xFB,
    0x07, 0x87, 0x47, 0xC7, 0x27, 0xA7, 0x67, 0xE7, 0x17, 0x97, 0x57, 0xD7, 0x37, 0xB7, 0x77, 0xF7,
    0x0F, 0x8F, 0x4F, 0xCF, 0x2F, 0xAF, 0x6F, 0xEF, 0x1F, 0x9F, 0x5F, 0xDF, 0x3F, 0xBF, 0x7F, 0xFF
};
110#ifdef LV_HAVE_GENERIC
/*!
 * \brief Reverse the bit order of each 32-bit word, moving one bit at a time.
 *
 * Bit i of in[k] becomes bit (31 - i) of out[k].
 *
 * Each input word is read into a local before the matching output word is
 * written, so calling with out == in is safe. The bits are moved with
 * shifts on the word value instead of through a bit-field overlay, which
 * avoids accessing uint32_t storage through an incompatible struct type and
 * does not depend on the compiler's bit-field layout.
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void volk_32u_reverse_32u_dword_shuffle(uint32_t* out,
                                                      const uint32_t* in,
                                                      unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint32_t word = in[number];
        uint32_t reversed = 0;
        unsigned int bit = 0;
        for (; bit < 32; ++bit) {
            /* source bit (31 - bit) lands on destination bit `bit` */
            reversed |= (uint32_t)((word >> (31u - bit)) & 1u) << bit;
        }
        out[number] = reversed;
    }
}
155#endif /* LV_HAVE_GENERIC */
156
157#ifdef LV_HAVE_GENERIC
/*!
 * \brief Reverse the bit order of each 32-bit word, one byte at a time.
 *
 * Every byte of the input word has its eight bits reversed bit by bit and is
 * then placed in the mirrored byte position (byte 0 <-> byte 3,
 * byte 1 <-> byte 2), which together reverses all 32 bits.
 *
 * The input word is read into a local before the output word is written, so
 * calling with out == in is safe. Working on the word value rather than on a
 * bit-field overlay avoids accessing uint32_t storage through an
 * incompatible struct type and does not depend on bit-field layout.
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void volk_32u_reverse_32u_byte_shuffle(uint32_t* out,
                                                     const uint32_t* in,
                                                     unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint32_t word = in[number];
        uint32_t reversed = 0;
        unsigned int byte = 0;
        for (; byte < 4; ++byte) {
            const uint32_t src = (word >> (8u * byte)) & 0xFFu;
            uint32_t flipped = 0;
            unsigned int bit = 0;
            for (; bit < 8; ++bit) {
                /* source bit (7 - bit) lands on destination bit `bit` */
                flipped |= ((src >> (7u - bit)) & 1u) << bit;
            }
            /* byte k of the input ends up as byte (3 - k) of the output */
            reversed |= flipped << (8u * (3u - byte));
        }
        out[number] = reversed;
    }
}
208#endif /* LV_HAVE_GENERIC */
209
// Idea from "Bit Twiddling Hacks", which dedicates this method to public domain
// http://graphics.stanford.edu/~seander/bithacks.html#BitReverseTable
212#ifdef LV_HAVE_GENERIC
213static inline void
214volk_32u_reverse_32u_lut(uint32_t* out, const uint32_t* in, unsigned int num_points)
215{
216 const uint32_t* in_ptr = in;
217 uint32_t* out_ptr = out;
218 unsigned int number = 0;
219 for (; number < num_points; ++number) {
220 *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
221 (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
222 (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
223 (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
224 ++in_ptr;
225 ++out_ptr;
226 }
227}
228#endif /* LV_HAVE_GENERIC */
229
// Single-Byte code from "Bit Twiddling Hacks", which dedicates this method to public
// domain http://graphics.stanford.edu/~seander/bithacks.html#ReverseByteWith64Bits
232#ifdef LV_HAVE_GENERIC
/*!
 * \brief Reverse the bit order of each 32-bit word using a 64-bit
 *        multiply / mask / multiply sequence per byte.
 *
 * For every byte of the input word, the 64-bit expression
 *   ((b * 0x80200802) & 0x0884422110) * 0x0101010101 >> 32
 * is evaluated and truncated to 8 bits, which gives the bit-reversed byte.
 * That byte is then placed in the mirrored byte position of the output word.
 *
 * The input word is read into a local before the output word is written, so
 * calling with out == in is safe (storing the output byte by byte through
 * aliased pointers would overwrite input bytes that have not been read yet).
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void
volk_32u_reverse_32u_2001magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint32_t word = in[number];
        uint32_t reversed = 0;
        unsigned int byte = 0;
        for (; byte < 4; ++byte) {
            const unsigned long long src = (word >> (8u * byte)) & 0xFFu;
            /* truncation to 8 bits is part of the trick */
            const uint8_t flipped =
                (uint8_t)(((src * 0x80200802ULL) & 0x0884422110ULL) * 0x0101010101ULL >> 32);
            /* byte k of the input ends up as byte (3 - k) of the output */
            reversed |= (uint32_t)flipped << (8u * (3u - byte));
        }
        out[number] = reversed;
    }
}
252#endif /* LV_HAVE_GENERIC */
253
254#ifdef LV_HAVE_GENERIC
// Current gr-pager implementation
/*!
 * \brief Reverse the bit order of each 32-bit word using a 64-bit
 *        multiply / mask / modulus sequence per byte.
 *
 * For every byte of the input word, the 64-bit expression
 *   (b * 0x0202020202 & 0x010884422010) % 1023
 * is evaluated, which gives the bit-reversed byte. That byte is then placed
 * in the mirrored byte position of the output word.
 *
 * The input word is read into a local before the output word is written, so
 * calling with out == in is safe (storing the output byte by byte through
 * aliased pointers would overwrite input bytes that have not been read yet).
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void
volk_32u_reverse_32u_1972magic(uint32_t* out, const uint32_t* in, unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        const uint32_t word = in[number];
        uint32_t reversed = 0;
        unsigned int byte = 0;
        for (; byte < 4; ++byte) {
            const unsigned long long src = (word >> (8u * byte)) & 0xFFu;
            const uint8_t flipped =
                (uint8_t)((src * 0x0202020202ULL & 0x010884422010ULL) % 1023);
            /* byte k of the input ends up as byte (3 - k) of the output */
            reversed |= (uint32_t)flipped << (8u * (3u - byte));
        }
        out[number] = reversed;
    }
}
275#endif /* LV_HAVE_GENERIC */
276
// After lengthy thought and quite a bit of whiteboarding:
278#ifdef LV_HAVE_GENERIC
/*!
 * \brief Reverse the bit order of each 32-bit word by swapping
 *        progressively smaller halves (16, 8, 4, 2, 1 bits).
 *
 * The swap masks are written as explicit unsigned 32-bit literals so that no
 * mask is assembled from shifted signed `int` constants.
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t* out,
                                                                 const uint32_t* in,
                                                                 unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        uint32_t tmp = in[number];
        /* permute uint16:
           shift the lower 16 bit up, and the upper 16 bit down.
        */
        tmp = (tmp << 16) | (tmp >> 16);
        /* permute bytes:
           keep the even bytes and shift them up by one byte, then OR with the
           odd bytes shifted down by one byte.
        */
        tmp = ((tmp & 0x00FF00FFu) << 8) | ((tmp >> 8) & 0x00FF00FFu);
        /* permute 4bit tuples:
           same idea; the mask selects the low nibble of every byte.
        */
        tmp = ((tmp & 0x0F0F0F0Fu) << 4) | ((tmp >> 4) & 0x0F0F0F0Fu);
        /* permute 2bit tuples:
           0b0011 = 3 is needed every 4 bit, which coincides with a hex digit.
        */
        tmp = ((tmp & 0x33333333u) << 2) | ((tmp >> 2) & 0x33333333u);
        /* permute odd/even:
           0b01 is needed every 2 bit: 0b0101 = 0x5 per hex digit.
        */
        tmp = ((tmp & 0x55555555u) << 1) | ((tmp >> 1) & 0x55555555u);

        out[number] = tmp;
    }
}
318#endif /* LV_HAVE_GENERIC */
319#ifdef LV_HAVE_GENERIC
/*!
 * \brief Reverse the bit order of each 32-bit word by swapping
 *        progressively larger groups (1, 2, 4, 8, 16 bits).
 *
 * Same five swap stages as the top-down variant, applied in the opposite
 * order. The swap masks are written as explicit unsigned 32-bit literals so
 * that no mask is assembled from shifted signed `int` constants.
 *
 * \param out        destination buffer, num_points words
 * \param in         source buffer, num_points words
 * \param num_points number of 32-bit words to process (0 is a no-op)
 */
static inline void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t* out,
                                                                  const uint32_t* in,
                                                                  unsigned int num_points)
{
    unsigned int number = 0;
    for (; number < num_points; ++number) {
        uint32_t tmp = in[number];
        /* swap neighbouring bits */
        tmp = ((tmp & 0x55555555u) << 1) | ((tmp >> 1) & 0x55555555u);
        /* swap neighbouring 2-bit pairs */
        tmp = ((tmp & 0x33333333u) << 2) | ((tmp >> 2) & 0x33333333u);
        /* swap neighbouring nibbles */
        tmp = ((tmp & 0x0F0F0F0Fu) << 4) | ((tmp >> 4) & 0x0F0F0F0Fu);
        /* swap neighbouring bytes */
        tmp = ((tmp & 0x00FF00FFu) << 8) | ((tmp >> 8) & 0x00FF00FFu);
        /* swap the two 16-bit halves */
        tmp = (tmp << 16) | (tmp >> 16);

        out[number] = tmp;
    }
}
342#endif /* LV_HAVE_GENERIC */
343
344#ifdef LV_HAVE_NEONV8
345#include <arm_neon.h>
346
347static inline void
348volk_32u_reverse_32u_neonv8(uint32_t* out, const uint32_t* in, unsigned int num_points)
349{
350 const uint32_t* in_ptr = in;
351 uint32_t* out_ptr = out;
352
353 const uint8x16_t idx = { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 };
354
355 const unsigned int quarterPoints = num_points / 4;
356 unsigned int number = 0;
357 for (; number < quarterPoints; ++number) {
358 __VOLK_PREFETCH(in_ptr + 4);
359 uint32x4_t x = vld1q_u32(in_ptr);
360 uint32x4_t z =
361 vreinterpretq_u32_u8(vqtbl1q_u8(vrbitq_u8(vreinterpretq_u8_u32(x)), idx));
362 vst1q_u32(out_ptr, z);
363 in_ptr += 4;
364 out_ptr += 4;
365 }
366 number = quarterPoints * 4;
367 for (; number < num_points; ++number) {
368 *out_ptr = (BitReverseTable256[*in_ptr & 0xff] << 24) |
369 (BitReverseTable256[(*in_ptr >> 8) & 0xff] << 16) |
370 (BitReverseTable256[(*in_ptr >> 16) & 0xff] << 8) |
371 (BitReverseTable256[(*in_ptr >> 24) & 0xff]);
372 ++in_ptr;
373 ++out_ptr;
374 }
375}
376
377#else
378#ifdef LV_HAVE_NEON
379#include <arm_neon.h>
380
381#define DO_RBIT \
382 __VOLK_ASM("rbit %[result], %[value]" \
383 : [result] "=r"(*out_ptr) \
384 : [value] "r"(*in_ptr) \
385 :); \
386 in_ptr++; \
387 out_ptr++;
388
389static inline void
390volk_32u_reverse_32u_arm(uint32_t* out, const uint32_t* in, unsigned int num_points)
391{
392
393 const uint32_t* in_ptr = in;
394 uint32_t* out_ptr = out;
395 const unsigned int eighthPoints = num_points / 8;
396 unsigned int number = 0;
397 for (; number < eighthPoints; ++number) {
398 __VOLK_PREFETCH(in_ptr + 8);
399 DO_RBIT;
400 DO_RBIT;
401 DO_RBIT;
402 DO_RBIT;
403 DO_RBIT;
404 DO_RBIT;
405 DO_RBIT;
406 DO_RBIT;
407 }
408 number = eighthPoints * 8;
409 for (; number < num_points; ++number) {
410 DO_RBIT;
411 }
412}
413#undef DO_RBIT
414#endif /* LV_HAVE_NEON */
415#endif /* LV_HAVE_NEONV8 */
416
417
418#endif /* INCLUDED_volk_32u_reverse_32u_u_H */
Definition: volk_32u_reverse_32u.h:76
uint8_t b02
Definition: volk_32u_reverse_32u.h:79
uint8_t b01
Definition: volk_32u_reverse_32u.h:78
uint8_t b05
Definition: volk_32u_reverse_32u.h:82
uint8_t b07
Definition: volk_32u_reverse_32u.h:84
uint8_t b04
Definition: volk_32u_reverse_32u.h:81
uint8_t b00
Definition: volk_32u_reverse_32u.h:77
uint8_t b06
Definition: volk_32u_reverse_32u.h:83
uint8_t b03
Definition: volk_32u_reverse_32u.h:80
Definition: volk_32u_reverse_32u.h:42
int b10
Definition: volk_32u_reverse_32u.h:53
int b02
Definition: volk_32u_reverse_32u.h:45
int b07
Definition: volk_32u_reverse_32u.h:50
int b16
Definition: volk_32u_reverse_32u.h:59
int b24
Definition: volk_32u_reverse_32u.h:67
int b06
Definition: volk_32u_reverse_32u.h:49
int b09
Definition: volk_32u_reverse_32u.h:52
int b28
Definition: volk_32u_reverse_32u.h:71
int b03
Definition: volk_32u_reverse_32u.h:46
int b11
Definition: volk_32u_reverse_32u.h:54
int b31
Definition: volk_32u_reverse_32u.h:74
int b23
Definition: volk_32u_reverse_32u.h:66
int b29
Definition: volk_32u_reverse_32u.h:72
int b25
Definition: volk_32u_reverse_32u.h:68
int b14
Definition: volk_32u_reverse_32u.h:57
int b15
Definition: volk_32u_reverse_32u.h:58
int b08
Definition: volk_32u_reverse_32u.h:51
int b21
Definition: volk_32u_reverse_32u.h:64
int b27
Definition: volk_32u_reverse_32u.h:70
int b19
Definition: volk_32u_reverse_32u.h:62
int b22
Definition: volk_32u_reverse_32u.h:65
int b30
Definition: volk_32u_reverse_32u.h:73
int b04
Definition: volk_32u_reverse_32u.h:47
int b18
Definition: volk_32u_reverse_32u.h:61
int b17
Definition: volk_32u_reverse_32u.h:60
int b12
Definition: volk_32u_reverse_32u.h:55
int b05
Definition: volk_32u_reverse_32u.h:48
int b00
Definition: volk_32u_reverse_32u.h:43
int b01
Definition: volk_32u_reverse_32u.h:44
int b26
Definition: volk_32u_reverse_32u.h:69
int b13
Definition: volk_32u_reverse_32u.h:56
int b20
Definition: volk_32u_reverse_32u.h:63
static void volk_32u_reverse_32u_1972magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:257
static void volk_32u_reverse_32u_dword_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:111
static void volk_32u_reverse_32u_2001magic(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:234
static void volk_32u_reverse_32u_lut(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:214
#define DO_RBIT
Definition: volk_32u_reverse_32u.h:381
static const unsigned char BitReverseTable256[]
Definition: volk_32u_reverse_32u.h:89
static void volk_32u_reverse_32u_bintree_permute_bottom_up(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:320
static void volk_32u_reverse_32u_bintree_permute_top_down(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:279
static void volk_32u_reverse_32u_arm(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:390
static void volk_32u_reverse_32u_byte_shuffle(uint32_t *out, const uint32_t *in, unsigned int num_points)
Definition: volk_32u_reverse_32u.h:158
#define __VOLK_PREFETCH(addr)
Definition: volk_common.h:62