10#ifndef EIGEN_PACKET_MATH_GPU_H
11#define EIGEN_PACKET_MATH_GPU_H
18#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
19#define EIGEN_GPU_HAS_LDG 1
23#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
24#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
27#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
28#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
34#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
36template<>
struct is_arithmetic<float4> {
enum { value =
true }; };
37template<>
struct is_arithmetic<double2> {
enum { value =
true }; };
39template<>
struct packet_traits<float> : default_packet_traits
66 HasGammaSampleDerAlpha = 1,
75template<>
struct packet_traits<double> : default_packet_traits
100 HasGammaSampleDerAlpha = 1,
110template<>
struct unpacket_traits<float4> {
typedef float type;
enum {size=4, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef float4 half; };
111template<>
struct unpacket_traits<double2> {
typedef double type;
enum {size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef double2 half; };
113template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(
const float& from) {
114 return make_float4(from, from, from, from);
116template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(
const double& from) {
117 return make_double2(from, from);
123#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
126EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_and(
const float& a,
128 return __int_as_float(__float_as_int(a) & __float_as_int(b));
130EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_and(
const double& a,
132 return __longlong_as_double(__double_as_longlong(a) &
133 __double_as_longlong(b));
136EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_or(
const float& a,
138 return __int_as_float(__float_as_int(a) | __float_as_int(b));
140EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_or(
const double& a,
142 return __longlong_as_double(__double_as_longlong(a) |
143 __double_as_longlong(b));
146EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_xor(
const float& a,
148 return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
150EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_xor(
const double& a,
152 return __longlong_as_double(__double_as_longlong(a) ^
153 __double_as_longlong(b));
156EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float bitwise_andnot(
const float& a,
158 return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
160EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bitwise_andnot(
const double& a,
162 return __longlong_as_double(__double_as_longlong(a) &
163 ~__double_as_longlong(b));
165EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float eq_mask(
const float& a,
167 return __int_as_float(a == b ? 0xffffffffu : 0u);
169EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double eq_mask(
const double& a,
171 return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
174EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
float lt_mask(
const float& a,
176 return __int_as_float(a < b ? 0xffffffffu : 0u);
178EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double lt_mask(
const double& a,
180 return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
186EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(
const float4& a,
188 return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
189 bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
192EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(
const double2& a,
194 return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
198EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(
const float4& a,
200 return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
201 bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
204EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(
const double2& a,
206 return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
210EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(
const float4& a,
212 return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
213 bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
216EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(
const double2& a,
218 return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
222EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(
const float4& a,
224 return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
225 bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
228EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
229pandnot<double2>(
const double2& a,
const double2& b) {
230 return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
234EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(
const float4& a,
236 return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
240EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(
const float4& a,
242 return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
246EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
247pcmp_eq<double2>(
const double2& a,
const double2& b) {
248 return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
251EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
252pcmp_lt<double2>(
const double2& a,
const double2& b) {
253 return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
257template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(
const float& a) {
258 return make_float4(a, a+1, a+2, a+3);
260template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(
const double& a) {
261 return make_double2(a, a+1);
264template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(
const float4& a,
const float4& b) {
265 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
267template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(
const double2& a,
const double2& b) {
268 return make_double2(a.x+b.x, a.y+b.y);
271template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(
const float4& a,
const float4& b) {
272 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
274template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(
const double2& a,
const double2& b) {
275 return make_double2(a.x-b.x, a.y-b.y);
278template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(
const float4& a) {
279 return make_float4(-a.x, -a.y, -a.z, -a.w);
281template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(
const double2& a) {
282 return make_double2(-a.x, -a.y);
285template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(
const float4& a) {
return a; }
286template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(
const double2& a) {
return a; }
288template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(
const float4& a,
const float4& b) {
289 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
291template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(
const double2& a,
const double2& b) {
292 return make_double2(a.x*b.x, a.y*b.y);
295template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(
const float4& a,
const float4& b) {
296 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
298template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(
const double2& a,
const double2& b) {
299 return make_double2(a.x/b.x, a.y/b.y);
302template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(
const float4& a,
const float4& b) {
303 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
305template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(
const double2& a,
const double2& b) {
306 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
309template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(
const float4& a,
const float4& b) {
310 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
312template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(
const double2& a,
const double2& b) {
313 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
316template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(
const float* from) {
317 return *
reinterpret_cast<const float4*
>(from);
320template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(
const double* from) {
321 return *
reinterpret_cast<const double2*
>(from);
324template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(
const float* from) {
325 return make_float4(from[0], from[1], from[2], from[3]);
327template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(
const double* from) {
328 return make_double2(from[0], from[1]);
331template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(
const float* from) {
332 return make_float4(from[0], from[0], from[1], from[1]);
334template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(
const double* from) {
335 return make_double2(from[0], from[0]);
338template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<float>(
float* to,
const float4& from) {
339 *
reinterpret_cast<float4*
>(to) = from;
342template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<double>(
double* to,
const double2& from) {
343 *
reinterpret_cast<double2*
>(to) = from;
346template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<float>(
float* to,
const float4& from) {
353template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<double>(
double* to,
const double2& from) {
359EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(
const float* from) {
360#if defined(EIGEN_GPU_HAS_LDG)
361 return __ldg((
const float4*)from);
363 return make_float4(from[0], from[1], from[2], from[3]);
367EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(
const double* from) {
368#if defined(EIGEN_GPU_HAS_LDG)
369 return __ldg((
const double2*)from);
371 return make_double2(from[0], from[1]);
376EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(
const float* from) {
377#if defined(EIGEN_GPU_HAS_LDG)
378 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
380 return make_float4(from[0], from[1], from[2], from[3]);
384EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(
const double* from) {
385#if defined(EIGEN_GPU_HAS_LDG)
386 return make_double2(__ldg(from+0), __ldg(from+1));
388 return make_double2(from[0], from[1]);
392template<> EIGEN_DEVICE_FUNC
inline float4 pgather<float, float4>(
const float* from,
Index stride) {
393 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
396template<> EIGEN_DEVICE_FUNC
inline double2 pgather<double, double2>(
const double* from,
Index stride) {
397 return make_double2(from[0*stride], from[1*stride]);
400template<> EIGEN_DEVICE_FUNC
inline void pscatter<float, float4>(
float* to,
const float4& from,
Index stride) {
401 to[stride*0] = from.x;
402 to[stride*1] = from.y;
403 to[stride*2] = from.z;
404 to[stride*3] = from.w;
406template<> EIGEN_DEVICE_FUNC
inline void pscatter<double, double2>(
double* to,
const double2& from,
Index stride) {
407 to[stride*0] = from.x;
408 to[stride*1] = from.y;
411template<> EIGEN_DEVICE_FUNC
inline float pfirst<float4>(
const float4& a) {
414template<> EIGEN_DEVICE_FUNC
inline double pfirst<double2>(
const double2& a) {
418template<> EIGEN_DEVICE_FUNC
inline float predux<float4>(
const float4& a) {
419 return a.x + a.y + a.z + a.w;
421template<> EIGEN_DEVICE_FUNC
inline double predux<double2>(
const double2& a) {
425template<> EIGEN_DEVICE_FUNC
inline float predux_max<float4>(
const float4& a) {
426 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
428template<> EIGEN_DEVICE_FUNC
inline double predux_max<double2>(
const double2& a) {
429 return fmax(a.x, a.y);
432template<> EIGEN_DEVICE_FUNC
inline float predux_min<float4>(
const float4& a) {
433 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
435template<> EIGEN_DEVICE_FUNC
inline double predux_min<double2>(
const double2& a) {
436 return fmin(a.x, a.y);
439template<> EIGEN_DEVICE_FUNC
inline float predux_mul<float4>(
const float4& a) {
440 return a.x * a.y * a.z * a.w;
442template<> EIGEN_DEVICE_FUNC
inline double predux_mul<double2>(
const double2& a) {
446template<> EIGEN_DEVICE_FUNC
inline float4 pabs<float4>(
const float4& a) {
447 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
449template<> EIGEN_DEVICE_FUNC
inline double2 pabs<double2>(
const double2& a) {
450 return make_double2(fabs(a.x), fabs(a.y));
453template<> EIGEN_DEVICE_FUNC
inline float4 pfloor<float4>(
const float4& a) {
454 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
456template<> EIGEN_DEVICE_FUNC
inline double2 pfloor<double2>(
const double2& a) {
457 return make_double2(floor(a.x), floor(a.y));
460EIGEN_DEVICE_FUNC
inline void
461ptranspose(PacketBlock<float4,4>& kernel) {
462 float tmp = kernel.packet[0].y;
463 kernel.packet[0].y = kernel.packet[1].x;
464 kernel.packet[1].x = tmp;
466 tmp = kernel.packet[0].z;
467 kernel.packet[0].z = kernel.packet[2].x;
468 kernel.packet[2].x = tmp;
470 tmp = kernel.packet[0].w;
471 kernel.packet[0].w = kernel.packet[3].x;
472 kernel.packet[3].x = tmp;
474 tmp = kernel.packet[1].z;
475 kernel.packet[1].z = kernel.packet[2].y;
476 kernel.packet[2].y = tmp;
478 tmp = kernel.packet[1].w;
479 kernel.packet[1].w = kernel.packet[3].y;
480 kernel.packet[3].y = tmp;
482 tmp = kernel.packet[2].w;
483 kernel.packet[2].w = kernel.packet[3].z;
484 kernel.packet[3].z = tmp;
487EIGEN_DEVICE_FUNC
inline void
488ptranspose(PacketBlock<double2,2>& kernel) {
489 double tmp = kernel.packet[0].y;
490 kernel.packet[0].y = kernel.packet[1].x;
491 kernel.packet[1].x = tmp;
498#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
500typedef ulonglong2 Packet4h2;
501template<>
struct unpacket_traits<Packet4h2> {
typedef Eigen::half type;
enum {size=8, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef Packet4h2 half; };
502template<>
struct is_arithmetic<Packet4h2> {
enum { value =
true }; };
504template<>
struct unpacket_traits<half2> {
typedef Eigen::half type;
enum {size=2, alignment=
Aligned16, vectorizable=
true, masked_load_available=
false, masked_store_available=
false};
typedef half2 half; };
505template<>
struct is_arithmetic<half2> {
enum { value =
true }; };
507template<>
struct packet_traits<
Eigen::half> : default_packet_traits
509 typedef Packet4h2 type;
510 typedef Packet4h2 half;
531EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(
const __half& a,
const __half& b) {
532#if defined(EIGEN_GPU_COMPILE_PHASE)
533 return __halves2half2(a, b);
536 return __floats2half2_rn(__half2float(a), __half2float(b));
540EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(
const half2& a) {
541#if defined(EIGEN_GPU_COMPILE_PHASE)
542 return __low2half(a);
544 return __float2half(__low2float(a));
548EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(
const half2& a) {
549#if defined(EIGEN_GPU_COMPILE_PHASE)
550 return __high2half(a);
552 return __float2half(__high2float(a));
558EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(
const Eigen::half& from) {
559#if defined(EIGEN_GPU_COMPILE_PHASE)
560 return __half2half2(from);
562 const float f = __half2float(from);
563 return __floats2half2_rn(f, f);
568EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
569pset1<Packet4h2>(
const Eigen::half& from) {
571 half2* p_alias =
reinterpret_cast<half2*
>(&r);
572 p_alias[0] = pset1<half2>(from);
573 p_alias[1] = pset1<half2>(from);
574 p_alias[2] = pset1<half2>(from);
575 p_alias[3] = pset1<half2>(from);
583EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(
const Eigen::half* from) {
584 return *
reinterpret_cast<const half2*
>(from);
587EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(
const Eigen::half* from) {
588 return combine_half(from[0], from[1]);
591EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(
const Eigen::half* from) {
592 return combine_half(from[0], from[0]);
595EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore(Eigen::half* to,
597 *
reinterpret_cast<half2*
>(to) = from;
600EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu(Eigen::half* to,
602 to[0] = get_half2_low(from);
603 to[1] = get_half2_high(from);
607EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
608 const Eigen::half* from) {
609#if defined(EIGEN_GPU_HAS_LDG)
611 return __ldg(
reinterpret_cast<const half2*
>(from));
613 return combine_half(*(from+0), *(from+1));
617EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
618 const Eigen::half* from) {
619#if defined(EIGEN_GPU_HAS_LDG)
620 return __halves2half2(__ldg(from+0), __ldg(from+1));
622 return combine_half(*(from+0), *(from+1));
626EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(
const Eigen::half* from,
628 return combine_half(from[0*stride], from[1*stride]);
631EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter(
632 Eigen::half* to,
const half2& from,
Index stride) {
633 to[stride*0] = get_half2_low(from);
634 to[stride*1] = get_half2_high(from);
637EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(
const half2& a) {
638 return get_half2_low(a);
641EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(
const half2& a) {
642 half a1 = get_half2_low(a);
643 half a2 = get_half2_high(a);
644 half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
645 half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
646 return combine_half(result1, result2);
649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(
const half2& ) {
650 half true_half = half_impl::raw_uint16_to_half(0xffffu);
651 return pset1<half2>(true_half);
654EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(
const half2& ) {
655 half false_half = half_impl::raw_uint16_to_half(0x0000u);
656 return pset1<half2>(false_half);
659EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
660ptranspose(PacketBlock<half2,2>& kernel) {
661 __half a1 = get_half2_low(kernel.packet[0]);
662 __half a2 = get_half2_high(kernel.packet[0]);
663 __half b1 = get_half2_low(kernel.packet[1]);
664 __half b2 = get_half2_high(kernel.packet[1]);
665 kernel.packet[0] = combine_half(a1, b1);
666 kernel.packet[1] = combine_half(a2, b2);
669EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(
const Eigen::half& a) {
670#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
671 return __halves2half2(a, __hadd(a, __float2half(1.0f)));
673 float f = __half2float(a) + 1.0f;
674 return combine_half(a, __float2half(f));
678EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(
const half2& mask,
681 half mask_low = get_half2_low(mask);
682 half mask_high = get_half2_high(mask);
683 half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
684 half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
685 return combine_half(result_low, result_high);
688EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(
const half2& a,
690 half true_half = half_impl::raw_uint16_to_half(0xffffu);
691 half false_half = half_impl::raw_uint16_to_half(0x0000u);
692 half a1 = get_half2_low(a);
693 half a2 = get_half2_high(a);
694 half b1 = get_half2_low(b);
695 half b2 = get_half2_high(b);
696 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
697 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
698 return combine_half(eq1, eq2);
701EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(
const half2& a,
703 half true_half = half_impl::raw_uint16_to_half(0xffffu);
704 half false_half = half_impl::raw_uint16_to_half(0x0000u);
705 half a1 = get_half2_low(a);
706 half a2 = get_half2_high(a);
707 half b1 = get_half2_low(b);
708 half b2 = get_half2_high(b);
709 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
710 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
711 return combine_half(eq1, eq2);
714EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(
const half2& a,
716 half a1 = get_half2_low(a);
717 half a2 = get_half2_high(a);
718 half b1 = get_half2_low(b);
719 half b2 = get_half2_high(b);
720 half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
721 half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
722 return combine_half(result1, result2);
725EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(
const half2& a,
727 half a1 = get_half2_low(a);
728 half a2 = get_half2_high(a);
729 half b1 = get_half2_low(b);
730 half b2 = get_half2_high(b);
731 half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
732 half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
733 return combine_half(result1, result2);
736EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(
const half2& a,
738 half a1 = get_half2_low(a);
739 half a2 = get_half2_high(a);
740 half b1 = get_half2_low(b);
741 half b2 = get_half2_high(b);
742 half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
743 half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
744 return combine_half(result1, result2);
747EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(
const half2& a,
749 half a1 = get_half2_low(a);
750 half a2 = get_half2_high(a);
751 half b1 = get_half2_low(b);
752 half b2 = get_half2_high(b);
753 half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
754 half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
755 return combine_half(result1, result2);
758EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(
const half2& a,
760#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
761 return __hadd2(a, b);
763 float a1 = __low2float(a);
764 float a2 = __high2float(a);
765 float b1 = __low2float(b);
766 float b2 = __high2float(b);
769 return __floats2half2_rn(r1, r2);
773EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(
const half2& a,
775#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
776 return __hsub2(a, b);
778 float a1 = __low2float(a);
779 float a2 = __high2float(a);
780 float b1 = __low2float(b);
781 float b2 = __high2float(b);
784 return __floats2half2_rn(r1, r2);
788EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(
const half2& a) {
789#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
792 float a1 = __low2float(a);
793 float a2 = __high2float(a);
794 return __floats2half2_rn(-a1, -a2);
798EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(
const half2& a) {
return a; }
800EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(
const half2& a,
802#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
803 return __hmul2(a, b);
805 float a1 = __low2float(a);
806 float a2 = __high2float(a);
807 float b1 = __low2float(b);
808 float b2 = __high2float(b);
811 return __floats2half2_rn(r1, r2);
815EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(
const half2& a,
818#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
819 return __hfma2(a, b, c);
821 float a1 = __low2float(a);
822 float a2 = __high2float(a);
823 float b1 = __low2float(b);
824 float b2 = __high2float(b);
825 float c1 = __low2float(c);
826 float c2 = __high2float(c);
827 float r1 = a1 * b1 + c1;
828 float r2 = a2 * b2 + c2;
829 return __floats2half2_rn(r1, r2);
833EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(
const half2& a,
835#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
836 return __h2div(a, b);
838 float a1 = __low2float(a);
839 float a2 = __high2float(a);
840 float b1 = __low2float(b);
841 float b2 = __high2float(b);
844 return __floats2half2_rn(r1, r2);
848EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(
const half2& a,
850 float a1 = __low2float(a);
851 float a2 = __high2float(a);
852 float b1 = __low2float(b);
853 float b2 = __high2float(b);
854 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
855 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
856 return combine_half(r1, r2);
859EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(
const half2& a,
861 float a1 = __low2float(a);
862 float a2 = __high2float(a);
863 float b1 = __low2float(b);
864 float b2 = __high2float(b);
865 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
866 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
867 return combine_half(r1, r2);
870EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(
const half2& a) {
871#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
872 return __hadd(__low2half(a), __high2half(a));
874 float a1 = __low2float(a);
875 float a2 = __high2float(a);
876 return Eigen::half(__float2half(a1 + a2));
880EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(
const half2& a) {
881#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
882 __half first = __low2half(a);
883 __half second = __high2half(a);
884 return __hgt(first, second) ? first : second;
886 float a1 = __low2float(a);
887 float a2 = __high2float(a);
888 return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
892EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(
const half2& a) {
893#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
894 __half first = __low2half(a);
895 __half second = __high2half(a);
896 return __hlt(first, second) ? first : second;
898 float a1 = __low2float(a);
899 float a2 = __high2float(a);
900 return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
904EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(
const half2& a) {
905#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
906 return __hmul(__low2half(a), __high2half(a));
908 float a1 = __low2float(a);
909 float a2 = __high2float(a);
910 return Eigen::half(__float2half(a1 * a2));
914EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(
const half2& a) {
915 float a1 = __low2float(a);
916 float a2 = __high2float(a);
917 float r1 = log1pf(a1);
918 float r2 = log1pf(a2);
919 return __floats2half2_rn(r1, r2);
922EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(
const half2& a) {
923 float a1 = __low2float(a);
924 float a2 = __high2float(a);
925 float r1 = expm1f(a1);
926 float r2 = expm1f(a2);
927 return __floats2half2_rn(r1, r2);
930#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
931 defined(EIGEN_HIP_DEVICE_COMPILE)
933EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
934half2 plog(
const half2& a) {
938 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
939half2 pexp(
const half2& a) {
943 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
944half2 psqrt(
const half2& a) {
948 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
949half2 prsqrt(
const half2& a) {
955EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(
const half2& a) {
956 float a1 = __low2float(a);
957 float a2 = __high2float(a);
960 return __floats2half2_rn(r1, r2);
963EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(
const half2& a) {
964 float a1 = __low2float(a);
965 float a2 = __high2float(a);
968 return __floats2half2_rn(r1, r2);
971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(
const half2& a) {
972 float a1 = __low2float(a);
973 float a2 = __high2float(a);
974 float r1 = sqrtf(a1);
975 float r2 = sqrtf(a2);
976 return __floats2half2_rn(r1, r2);
979EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(
const half2& a) {
980 float a1 = __low2float(a);
981 float a2 = __high2float(a);
982 float r1 = rsqrtf(a1);
983 float r2 = rsqrtf(a2);
984 return __floats2half2_rn(r1, r2);
990EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
991pload<Packet4h2>(
const Eigen::half* from) {
992 return *
reinterpret_cast<const Packet4h2*
>(from);
997EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
998ploadu<Packet4h2>(
const Eigen::half* from) {
1000 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1001 p_alias[0] = ploadu(from + 0);
1002 p_alias[1] = ploadu(from + 2);
1003 p_alias[2] = ploadu(from + 4);
1004 p_alias[3] = ploadu(from + 6);
1009EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1010ploaddup<Packet4h2>(
const Eigen::half* from) {
1012 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1013 p_alias[0] = ploaddup(from + 0);
1014 p_alias[1] = ploaddup(from + 1);
1015 p_alias[2] = ploaddup(from + 2);
1016 p_alias[3] = ploaddup(from + 3);
1021EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstore<Eigen::half>(
1022 Eigen::half* to,
const Packet4h2& from) {
1023 *
reinterpret_cast<Packet4h2*
>(to) = from;
1027EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pstoreu<Eigen::half>(
1028 Eigen::half* to,
const Packet4h2& from) {
1029 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1030 pstoreu(to + 0,from_alias[0]);
1031 pstoreu(to + 2,from_alias[1]);
1032 pstoreu(to + 4,from_alias[2]);
1033 pstoreu(to + 6,from_alias[3]);
1037EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1038ploadt_ro<Packet4h2, Aligned>(
const Eigen::half* from) {
1039#if defined(EIGEN_GPU_HAS_LDG)
1041 r = __ldg(
reinterpret_cast<const Packet4h2*
>(from));
1045 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1046 r_alias[0] = ploadt_ro_aligned(from + 0);
1047 r_alias[1] = ploadt_ro_aligned(from + 2);
1048 r_alias[2] = ploadt_ro_aligned(from + 4);
1049 r_alias[3] = ploadt_ro_aligned(from + 6);
1055EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1056ploadt_ro<Packet4h2, Unaligned>(
const Eigen::half* from) {
1058 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1059 r_alias[0] = ploadt_ro_unaligned(from + 0);
1060 r_alias[1] = ploadt_ro_unaligned(from + 2);
1061 r_alias[2] = ploadt_ro_unaligned(from + 4);
1062 r_alias[3] = ploadt_ro_unaligned(from + 6);
1067EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1068pgather<Eigen::half, Packet4h2>(
const Eigen::half* from,
Index stride) {
1070 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1071 p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072 p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073 p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074 p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1079EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void pscatter<Eigen::half, Packet4h2>(
1080 Eigen::half* to,
const Packet4h2& from,
Index stride) {
1081 const half2* from_alias =
reinterpret_cast<const half2*
>(&from);
1082 pscatter(to + stride * 0, from_alias[0], stride);
1083 pscatter(to + stride * 2, from_alias[1], stride);
1084 pscatter(to + stride * 4, from_alias[2], stride);
1085 pscatter(to + stride * 6, from_alias[3], stride);
1089EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
1090 const Packet4h2& a) {
1091 return pfirst(*(
reinterpret_cast<const half2*
>(&a)));
1095EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
1096 const Packet4h2& a) {
1098 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1099 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1100 p_alias[0] = pabs(a_alias[0]);
1101 p_alias[1] = pabs(a_alias[1]);
1102 p_alias[2] = pabs(a_alias[2]);
1103 p_alias[3] = pabs(a_alias[3]);
1108EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
1109 const Packet4h2& ) {
1110 half true_half = half_impl::raw_uint16_to_half(0xffffu);
1111 return pset1<Packet4h2>(true_half);
1115EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(
const Packet4h2& ) {
1116 half false_half = half_impl::raw_uint16_to_half(0x0000u);
1117 return pset1<Packet4h2>(false_half);
1120EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose_double(
1121 double* d_row0,
double* d_row1,
double* d_row2,
double* d_row3,
1122 double* d_row4,
double* d_row5,
double* d_row6,
double* d_row7) {
1125 d_row0[1] = d_row4[0];
1129 d_row1[1] = d_row5[0];
1133 d_row2[1] = d_row6[0];
1137 d_row3[1] = d_row7[0];
1141EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void ptranspose_half2(
1142 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1145 f_row0[1] = f_row2[0];
1149 f_row1[1] = f_row3[0];
1153EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
1154ptranspose_half(half2& f0, half2& f1) {
1155 __half a1 = get_half2_low(f0);
1156 __half a2 = get_half2_high(f0);
1157 __half b1 = get_half2_low(f1);
1158 __half b2 = get_half2_high(f1);
1159 f0 = combine_half(a1, b1);
1160 f1 = combine_half(a2, b2);
1163EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
void
1164ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1165 double* d_row0 =
reinterpret_cast<double*
>(&kernel.packet[0]);
1166 double* d_row1 =
reinterpret_cast<double*
>(&kernel.packet[1]);
1167 double* d_row2 =
reinterpret_cast<double*
>(&kernel.packet[2]);
1168 double* d_row3 =
reinterpret_cast<double*
>(&kernel.packet[3]);
1169 double* d_row4 =
reinterpret_cast<double*
>(&kernel.packet[4]);
1170 double* d_row5 =
reinterpret_cast<double*
>(&kernel.packet[5]);
1171 double* d_row6 =
reinterpret_cast<double*
>(&kernel.packet[6]);
1172 double* d_row7 =
reinterpret_cast<double*
>(&kernel.packet[7]);
1173 ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174 d_row4, d_row5, d_row6, d_row7);
1177 half2* f_row0 =
reinterpret_cast<half2*
>(d_row0);
1178 half2* f_row1 =
reinterpret_cast<half2*
>(d_row1);
1179 half2* f_row2 =
reinterpret_cast<half2*
>(d_row2);
1180 half2* f_row3 =
reinterpret_cast<half2*
>(d_row3);
1181 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1182 ptranspose_half(f_row0[0], f_row1[0]);
1183 ptranspose_half(f_row0[1], f_row1[1]);
1184 ptranspose_half(f_row2[0], f_row3[0]);
1185 ptranspose_half(f_row2[1], f_row3[1]);
1187 f_row0 =
reinterpret_cast<half2*
>(d_row0 + 1);
1188 f_row1 =
reinterpret_cast<half2*
>(d_row1 + 1);
1189 f_row2 =
reinterpret_cast<half2*
>(d_row2 + 1);
1190 f_row3 =
reinterpret_cast<half2*
>(d_row3 + 1);
1191 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1192 ptranspose_half(f_row0[0], f_row1[0]);
1193 ptranspose_half(f_row0[1], f_row1[1]);
1194 ptranspose_half(f_row2[0], f_row3[0]);
1195 ptranspose_half(f_row2[1], f_row3[1]);
1197 f_row0 =
reinterpret_cast<half2*
>(d_row4);
1198 f_row1 =
reinterpret_cast<half2*
>(d_row5);
1199 f_row2 =
reinterpret_cast<half2*
>(d_row6);
1200 f_row3 =
reinterpret_cast<half2*
>(d_row7);
1201 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1202 ptranspose_half(f_row0[0], f_row1[0]);
1203 ptranspose_half(f_row0[1], f_row1[1]);
1204 ptranspose_half(f_row2[0], f_row3[0]);
1205 ptranspose_half(f_row2[1], f_row3[1]);
1207 f_row0 =
reinterpret_cast<half2*
>(d_row4 + 1);
1208 f_row1 =
reinterpret_cast<half2*
>(d_row5 + 1);
1209 f_row2 =
reinterpret_cast<half2*
>(d_row6 + 1);
1210 f_row3 =
reinterpret_cast<half2*
>(d_row7 + 1);
1211 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1212 ptranspose_half(f_row0[0], f_row1[0]);
1213 ptranspose_half(f_row0[1], f_row1[1]);
1214 ptranspose_half(f_row2[0], f_row3[0]);
1215 ptranspose_half(f_row2[1], f_row3[1]);
1220EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1221plset<Packet4h2>(
const Eigen::half& a) {
1222#if defined(EIGEN_HIP_DEVICE_COMPILE)
1225 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1226 p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1227 p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
1228 __hadd(a, __float2half(3.0f)));
1229 p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
1230 __hadd(a, __float2half(5.0f)));
1231 p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
1232 __hadd(a, __float2half(7.0f)));
1234#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1236 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1238 half2 b = pset1<half2>(a);
1240 half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1241 half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1243 c = __hadd2(b, half_offset0);
1244 r_alias[0] = plset(__low2half(c));
1245 r_alias[1] = plset(__high2half(c));
1247 c = __hadd2(b, half_offset1);
1248 r_alias[2] = plset(__low2half(c));
1249 r_alias[3] = plset(__high2half(c));
1254 float f = __half2float(a);
1256 half2* p_alias =
reinterpret_cast<half2*
>(&r);
1257 p_alias[0] = combine_half(a, __float2half(f + 1.0f));
1258 p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
1259 p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
1260 p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
1266EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1267pselect<Packet4h2>(
const Packet4h2& mask,
const Packet4h2& a,
1268 const Packet4h2& b) {
1270 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1271 const half2* mask_alias =
reinterpret_cast<const half2*
>(&mask);
1272 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1273 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1274 r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1275 r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1276 r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1277 r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1282EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1283pcmp_eq<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1285 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1286 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1287 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1288 r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1289 r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1290 r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1291 r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1296EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
1297 const Packet4h2& a,
const Packet4h2& b) {
1299 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1300 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1301 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1302 r_alias[0] = pand(a_alias[0], b_alias[0]);
1303 r_alias[1] = pand(a_alias[1], b_alias[1]);
1304 r_alias[2] = pand(a_alias[2], b_alias[2]);
1305 r_alias[3] = pand(a_alias[3], b_alias[3]);
1310EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
1311 const Packet4h2& a,
const Packet4h2& b) {
1313 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1314 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1315 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1316 r_alias[0] = por(a_alias[0], b_alias[0]);
1317 r_alias[1] = por(a_alias[1], b_alias[1]);
1318 r_alias[2] = por(a_alias[2], b_alias[2]);
1319 r_alias[3] = por(a_alias[3], b_alias[3]);
1324EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
1325 const Packet4h2& a,
const Packet4h2& b) {
1327 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1328 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1329 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1330 r_alias[0] = pxor(a_alias[0], b_alias[0]);
1331 r_alias[1] = pxor(a_alias[1], b_alias[1]);
1332 r_alias[2] = pxor(a_alias[2], b_alias[2]);
1333 r_alias[3] = pxor(a_alias[3], b_alias[3]);
1338EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1339pandnot<Packet4h2>(
const Packet4h2& a,
const Packet4h2& b) {
1341 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1342 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1343 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1344 r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1345 r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1346 r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1347 r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1352EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
1353 const Packet4h2& a,
const Packet4h2& b) {
1355 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1356 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1357 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1358 r_alias[0] = padd(a_alias[0], b_alias[0]);
1359 r_alias[1] = padd(a_alias[1], b_alias[1]);
1360 r_alias[2] = padd(a_alias[2], b_alias[2]);
1361 r_alias[3] = padd(a_alias[3], b_alias[3]);
1366EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
1367 const Packet4h2& a,
const Packet4h2& b) {
1369 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1370 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1371 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1372 r_alias[0] = psub(a_alias[0], b_alias[0]);
1373 r_alias[1] = psub(a_alias[1], b_alias[1]);
1374 r_alias[2] = psub(a_alias[2], b_alias[2]);
1375 r_alias[3] = psub(a_alias[3], b_alias[3]);
1380EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(
const Packet4h2& a) {
1382 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1383 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1384 r_alias[0] = pnegate(a_alias[0]);
1385 r_alias[1] = pnegate(a_alias[1]);
1386 r_alias[2] = pnegate(a_alias[2]);
1387 r_alias[3] = pnegate(a_alias[3]);
1392EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(
const Packet4h2& a) {
1397EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
1398 const Packet4h2& a,
const Packet4h2& b) {
1400 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1401 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1402 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1403 r_alias[0] = pmul(a_alias[0], b_alias[0]);
1404 r_alias[1] = pmul(a_alias[1], b_alias[1]);
1405 r_alias[2] = pmul(a_alias[2], b_alias[2]);
1406 r_alias[3] = pmul(a_alias[3], b_alias[3]);
1411EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
1412 const Packet4h2& a,
const Packet4h2& b,
const Packet4h2& c) {
1414 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1415 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1416 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1417 const half2* c_alias =
reinterpret_cast<const half2*
>(&c);
1418 r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1419 r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1420 r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1421 r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1426EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
1427 const Packet4h2& a,
const Packet4h2& b) {
1429 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1430 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1431 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1432 r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1433 r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1434 r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1435 r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1440EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
1441 const Packet4h2& a,
const Packet4h2& b) {
1443 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1444 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1445 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1446 r_alias[0] = pmin(a_alias[0], b_alias[0]);
1447 r_alias[1] = pmin(a_alias[1], b_alias[1]);
1448 r_alias[2] = pmin(a_alias[2], b_alias[2]);
1449 r_alias[3] = pmin(a_alias[3], b_alias[3]);
1454EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
1455 const Packet4h2& a,
const Packet4h2& b) {
1457 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1458 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1459 const half2* b_alias =
reinterpret_cast<const half2*
>(&b);
1460 r_alias[0] = pmax(a_alias[0], b_alias[0]);
1461 r_alias[1] = pmax(a_alias[1], b_alias[1]);
1462 r_alias[2] = pmax(a_alias[2], b_alias[2]);
1463 r_alias[3] = pmax(a_alias[3], b_alias[3]);
1468EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
1469 const Packet4h2& a) {
1470 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1472 return predux(a_alias[0]) + predux(a_alias[1]) +
1473 predux(a_alias[2]) + predux(a_alias[3]);
1477EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
1478 const Packet4h2& a) {
1479 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1480 half2 m0 = combine_half(predux_max(a_alias[0]),
1481 predux_max(a_alias[1]));
1482 half2 m1 = combine_half(predux_max(a_alias[2]),
1483 predux_max(a_alias[3]));
1484 __half first = predux_max(m0);
1485 __half second = predux_max(m1);
1486#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1487 return (__hgt(first, second) ? first : second);
1489 float ffirst = __half2float(first);
1490 float fsecond = __half2float(second);
1491 return (ffirst > fsecond)? first: second;
1496EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
1497 const Packet4h2& a) {
1498 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1499 half2 m0 = combine_half(predux_min(a_alias[0]),
1500 predux_min(a_alias[1]));
1501 half2 m1 = combine_half(predux_min(a_alias[2]),
1502 predux_min(a_alias[3]));
1503 __half first = predux_min(m0);
1504 __half second = predux_min(m1);
1505#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1506 return (__hlt(first, second) ? first : second);
1508 float ffirst = __half2float(first);
1509 float fsecond = __half2float(second);
1510 return (ffirst < fsecond)? first: second;
1516EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
1517 const Packet4h2& a) {
1518 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1519 return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
1520 pmul(a_alias[2], a_alias[3])));
1524EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1525plog1p<Packet4h2>(
const Packet4h2& a) {
1527 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1528 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1529 r_alias[0] = plog1p(a_alias[0]);
1530 r_alias[1] = plog1p(a_alias[1]);
1531 r_alias[2] = plog1p(a_alias[2]);
1532 r_alias[3] = plog1p(a_alias[3]);
1537EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1538pexpm1<Packet4h2>(
const Packet4h2& a) {
1540 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1541 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1542 r_alias[0] = pexpm1(a_alias[0]);
1543 r_alias[1] = pexpm1(a_alias[1]);
1544 r_alias[2] = pexpm1(a_alias[2]);
1545 r_alias[3] = pexpm1(a_alias[3]);
1550EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(
const Packet4h2& a) {
1552 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1553 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1554 r_alias[0] = plog(a_alias[0]);
1555 r_alias[1] = plog(a_alias[1]);
1556 r_alias[2] = plog(a_alias[2]);
1557 r_alias[3] = plog(a_alias[3]);
1562EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(
const Packet4h2& a) {
1564 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1565 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1566 r_alias[0] = pexp(a_alias[0]);
1567 r_alias[1] = pexp(a_alias[1]);
1568 r_alias[2] = pexp(a_alias[2]);
1569 r_alias[3] = pexp(a_alias[3]);
1574EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(
const Packet4h2& a) {
1576 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1577 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1578 r_alias[0] = psqrt(a_alias[0]);
1579 r_alias[1] = psqrt(a_alias[1]);
1580 r_alias[2] = psqrt(a_alias[2]);
1581 r_alias[3] = psqrt(a_alias[3]);
1586EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1587prsqrt<Packet4h2>(
const Packet4h2& a) {
1589 half2* r_alias =
reinterpret_cast<half2*
>(&r);
1590 const half2* a_alias =
reinterpret_cast<const half2*
>(&a);
1591 r_alias[0] = prsqrt(a_alias[0]);
1592 r_alias[1] = prsqrt(a_alias[1]);
1593 r_alias[2] = prsqrt(a_alias[2]);
1594 r_alias[3] = prsqrt(a_alias[3]);
1601EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(
const half2& a,
1603#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1604 return __hadd2(a, b);
1606 float a1 = __low2float(a);
1607 float a2 = __high2float(a);
1608 float b1 = __low2float(b);
1609 float b2 = __high2float(b);
1612 return __floats2half2_rn(r1, r2);
1617EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(
const half2& a,
1619#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1620 return __hmul2(a, b);
1622 float a1 = __low2float(a);
1623 float a2 = __high2float(a);
1624 float b1 = __low2float(b);
1625 float b2 = __high2float(b);
1628 return __floats2half2_rn(r1, r2);
1633EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(
const half2& a,
1635#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1636 return __h2div(a, b);
1638 float a1 = __low2float(a);
1639 float a2 = __high2float(a);
1640 float b1 = __low2float(b);
1641 float b2 = __high2float(b);
1644 return __floats2half2_rn(r1, r2);
1649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(
const half2& a,
1651 float a1 = __low2float(a);
1652 float a2 = __high2float(a);
1653 float b1 = __low2float(b);
1654 float b2 = __high2float(b);
1655 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
1656 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
1657 return combine_half(r1, r2);
1661EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(
const half2& a,
1663 float a1 = __low2float(a);
1664 float a2 = __high2float(a);
1665 float b1 = __low2float(b);
1666 float b2 = __high2float(b);
1667 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
1668 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
1669 return combine_half(r1, r2);
1676#undef EIGEN_GPU_HAS_LDG
1677#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1678#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
@ Aligned16
Definition: Constants.h:235
Namespace containing all symbols from the Eigen library.
Definition: Core:141
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74