Please, help us to better know about our user community by answering the following short survey: https://forms.gle/wpyrxWi18ox9Z5ae9
Eigen  3.4.0
 
Loading...
Searching...
No Matches
GPU/PacketMath.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_PACKET_MATH_GPU_H
11#define EIGEN_PACKET_MATH_GPU_H
12
13namespace Eigen {
14
15namespace internal {
16
17// Read-only data cached load available.
18#if defined(EIGEN_HIP_DEVICE_COMPILE) || (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350)
19#define EIGEN_GPU_HAS_LDG 1
20#endif
21
22// FP16 math available.
23#if (defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530)
24#define EIGEN_CUDA_HAS_FP16_ARITHMETIC 1
25#endif
26
27#if defined(EIGEN_HIP_DEVICE_COMPILE) || defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
28#define EIGEN_GPU_HAS_FP16_ARITHMETIC 1
29#endif
30
31// Make sure this is only available when targeting a GPU: we don't want to
32// introduce conflicts between these packet_traits definitions and the ones
33// we'll use on the host side (SSE, AVX, ...)
34#if defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
35
36template<> struct is_arithmetic<float4> { enum { value = true }; };
37template<> struct is_arithmetic<double2> { enum { value = true }; };
38
39template<> struct packet_traits<float> : default_packet_traits
40{
41 typedef float4 type;
42 typedef float4 half;
43 enum {
44 Vectorizable = 1,
45 AlignedOnScalar = 1,
46 size=4,
47 HasHalfPacket = 0,
48
49 HasDiv = 1,
50 HasSin = 0,
51 HasCos = 0,
52 HasLog = 1,
53 HasExp = 1,
54 HasSqrt = 1,
55 HasRsqrt = 1,
56 HasLGamma = 1,
57 HasDiGamma = 1,
58 HasZeta = 1,
59 HasPolygamma = 1,
60 HasErf = 1,
61 HasErfc = 1,
62 HasNdtri = 1,
63 HasBessel = 1,
64 HasIGamma = 1,
65 HasIGammaDerA = 1,
66 HasGammaSampleDerAlpha = 1,
67 HasIGammac = 1,
68 HasBetaInc = 1,
69
70 HasBlend = 0,
71 HasFloor = 1,
72 };
73};
74
75template<> struct packet_traits<double> : default_packet_traits
76{
77 typedef double2 type;
78 typedef double2 half;
79 enum {
80 Vectorizable = 1,
81 AlignedOnScalar = 1,
82 size=2,
83 HasHalfPacket = 0,
84
85 HasDiv = 1,
86 HasLog = 1,
87 HasExp = 1,
88 HasSqrt = 1,
89 HasRsqrt = 1,
90 HasLGamma = 1,
91 HasDiGamma = 1,
92 HasZeta = 1,
93 HasPolygamma = 1,
94 HasErf = 1,
95 HasErfc = 1,
96 HasNdtri = 1,
97 HasBessel = 1,
98 HasIGamma = 1,
99 HasIGammaDerA = 1,
100 HasGammaSampleDerAlpha = 1,
101 HasIGammac = 1,
102 HasBetaInc = 1,
103
104 HasBlend = 0,
105 HasFloor = 1,
106 };
107};
108
109
110template<> struct unpacket_traits<float4> { typedef float type; enum {size=4, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef float4 half; };
111template<> struct unpacket_traits<double2> { typedef double type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef double2 half; };
112
113template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pset1<float4>(const float& from) {
114 return make_float4(from, from, from, from);
115}
116template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pset1<double2>(const double& from) {
117 return make_double2(from, from);
118}
119
120// We need to distinguish ‘clang as the CUDA compiler’ from ‘clang as the host compiler,
121// invoked by NVCC’ (e.g. on MacOS). The former needs to see both host and device implementation
122// of the functions, while the latter can only deal with one of them.
123#if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
124namespace {
125
126EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_and(const float& a,
127 const float& b) {
128 return __int_as_float(__float_as_int(a) & __float_as_int(b));
129}
130EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_and(const double& a,
131 const double& b) {
132 return __longlong_as_double(__double_as_longlong(a) &
133 __double_as_longlong(b));
134}
135
136EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_or(const float& a,
137 const float& b) {
138 return __int_as_float(__float_as_int(a) | __float_as_int(b));
139}
140EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_or(const double& a,
141 const double& b) {
142 return __longlong_as_double(__double_as_longlong(a) |
143 __double_as_longlong(b));
144}
145
146EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_xor(const float& a,
147 const float& b) {
148 return __int_as_float(__float_as_int(a) ^ __float_as_int(b));
149}
150EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_xor(const double& a,
151 const double& b) {
152 return __longlong_as_double(__double_as_longlong(a) ^
153 __double_as_longlong(b));
154}
155
156EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float bitwise_andnot(const float& a,
157 const float& b) {
158 return __int_as_float(__float_as_int(a) & ~__float_as_int(b));
159}
160EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bitwise_andnot(const double& a,
161 const double& b) {
162 return __longlong_as_double(__double_as_longlong(a) &
163 ~__double_as_longlong(b));
164}
165EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float eq_mask(const float& a,
166 const float& b) {
167 return __int_as_float(a == b ? 0xffffffffu : 0u);
168}
169EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double eq_mask(const double& a,
170 const double& b) {
171 return __longlong_as_double(a == b ? 0xffffffffffffffffull : 0ull);
172}
173
174EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float lt_mask(const float& a,
175 const float& b) {
176 return __int_as_float(a < b ? 0xffffffffu : 0u);
177}
178EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double lt_mask(const double& a,
179 const double& b) {
180 return __longlong_as_double(a < b ? 0xffffffffffffffffull : 0ull);
181}
182
183} // namespace
184
185template <>
186EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pand<float4>(const float4& a,
187 const float4& b) {
188 return make_float4(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y),
189 bitwise_and(a.z, b.z), bitwise_and(a.w, b.w));
190}
191template <>
192EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pand<double2>(const double2& a,
193 const double2& b) {
194 return make_double2(bitwise_and(a.x, b.x), bitwise_and(a.y, b.y));
195}
196
197template <>
198EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 por<float4>(const float4& a,
199 const float4& b) {
200 return make_float4(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y),
201 bitwise_or(a.z, b.z), bitwise_or(a.w, b.w));
202}
203template <>
204EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 por<double2>(const double2& a,
205 const double2& b) {
206 return make_double2(bitwise_or(a.x, b.x), bitwise_or(a.y, b.y));
207}
208
209template <>
210EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pxor<float4>(const float4& a,
211 const float4& b) {
212 return make_float4(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y),
213 bitwise_xor(a.z, b.z), bitwise_xor(a.w, b.w));
214}
215template <>
216EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pxor<double2>(const double2& a,
217 const double2& b) {
218 return make_double2(bitwise_xor(a.x, b.x), bitwise_xor(a.y, b.y));
219}
220
221template <>
222EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pandnot<float4>(const float4& a,
223 const float4& b) {
224 return make_float4(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y),
225 bitwise_andnot(a.z, b.z), bitwise_andnot(a.w, b.w));
226}
227template <>
228EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
229pandnot<double2>(const double2& a, const double2& b) {
230 return make_double2(bitwise_andnot(a.x, b.x), bitwise_andnot(a.y, b.y));
231}
232
233template <>
234EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_eq<float4>(const float4& a,
235 const float4& b) {
236 return make_float4(eq_mask(a.x, b.x), eq_mask(a.y, b.y), eq_mask(a.z, b.z),
237 eq_mask(a.w, b.w));
238}
239template <>
240EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pcmp_lt<float4>(const float4& a,
241 const float4& b) {
242 return make_float4(lt_mask(a.x, b.x), lt_mask(a.y, b.y), lt_mask(a.z, b.z),
243 lt_mask(a.w, b.w));
244}
245template <>
246EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
247pcmp_eq<double2>(const double2& a, const double2& b) {
248 return make_double2(eq_mask(a.x, b.x), eq_mask(a.y, b.y));
249}
250template <>
251EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2
252pcmp_lt<double2>(const double2& a, const double2& b) {
253 return make_double2(lt_mask(a.x, b.x), lt_mask(a.y, b.y));
254}
255#endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
256
257template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 plset<float4>(const float& a) {
258 return make_float4(a, a+1, a+2, a+3);
259}
260template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 plset<double2>(const double& a) {
261 return make_double2(a, a+1);
262}
263
264template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 padd<float4>(const float4& a, const float4& b) {
265 return make_float4(a.x+b.x, a.y+b.y, a.z+b.z, a.w+b.w);
266}
267template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 padd<double2>(const double2& a, const double2& b) {
268 return make_double2(a.x+b.x, a.y+b.y);
269}
270
271template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 psub<float4>(const float4& a, const float4& b) {
272 return make_float4(a.x-b.x, a.y-b.y, a.z-b.z, a.w-b.w);
273}
274template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 psub<double2>(const double2& a, const double2& b) {
275 return make_double2(a.x-b.x, a.y-b.y);
276}
277
278template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pnegate(const float4& a) {
279 return make_float4(-a.x, -a.y, -a.z, -a.w);
280}
281template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pnegate(const double2& a) {
282 return make_double2(-a.x, -a.y);
283}
284
285template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pconj(const float4& a) { return a; }
286template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pconj(const double2& a) { return a; }
287
288template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmul<float4>(const float4& a, const float4& b) {
289 return make_float4(a.x*b.x, a.y*b.y, a.z*b.z, a.w*b.w);
290}
291template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmul<double2>(const double2& a, const double2& b) {
292 return make_double2(a.x*b.x, a.y*b.y);
293}
294
295template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pdiv<float4>(const float4& a, const float4& b) {
296 return make_float4(a.x/b.x, a.y/b.y, a.z/b.z, a.w/b.w);
297}
298template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pdiv<double2>(const double2& a, const double2& b) {
299 return make_double2(a.x/b.x, a.y/b.y);
300}
301
302template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmin<float4>(const float4& a, const float4& b) {
303 return make_float4(fminf(a.x, b.x), fminf(a.y, b.y), fminf(a.z, b.z), fminf(a.w, b.w));
304}
305template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmin<double2>(const double2& a, const double2& b) {
306 return make_double2(fmin(a.x, b.x), fmin(a.y, b.y));
307}
308
309template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pmax<float4>(const float4& a, const float4& b) {
310 return make_float4(fmaxf(a.x, b.x), fmaxf(a.y, b.y), fmaxf(a.z, b.z), fmaxf(a.w, b.w));
311}
312template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pmax<double2>(const double2& a, const double2& b) {
313 return make_double2(fmax(a.x, b.x), fmax(a.y, b.y));
314}
315
316template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 pload<float4>(const float* from) {
317 return *reinterpret_cast<const float4*>(from);
318}
319
320template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 pload<double2>(const double* from) {
321 return *reinterpret_cast<const double2*>(from);
322}
323
324template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploadu<float4>(const float* from) {
325 return make_float4(from[0], from[1], from[2], from[3]);
326}
327template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploadu<double2>(const double* from) {
328 return make_double2(from[0], from[1]);
329}
330
331template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float4 ploaddup<float4>(const float* from) {
332 return make_float4(from[0], from[0], from[1], from[1]);
333}
334template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double2 ploaddup<double2>(const double* from) {
335 return make_double2(from[0], from[0]);
336}
337
338template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<float>(float* to, const float4& from) {
339 *reinterpret_cast<float4*>(to) = from;
340}
341
342template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<double>(double* to, const double2& from) {
343 *reinterpret_cast<double2*>(to) = from;
344}
345
346template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<float>(float* to, const float4& from) {
347 to[0] = from.x;
348 to[1] = from.y;
349 to[2] = from.z;
350 to[3] = from.w;
351}
352
353template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<double>(double* to, const double2& from) {
354 to[0] = from.x;
355 to[1] = from.y;
356}
357
358template<>
359EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Aligned>(const float* from) {
360#if defined(EIGEN_GPU_HAS_LDG)
361 return __ldg((const float4*)from);
362#else
363 return make_float4(from[0], from[1], from[2], from[3]);
364#endif
365}
366template<>
367EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Aligned>(const double* from) {
368#if defined(EIGEN_GPU_HAS_LDG)
369 return __ldg((const double2*)from);
370#else
371 return make_double2(from[0], from[1]);
372#endif
373}
374
375template<>
376EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float4 ploadt_ro<float4, Unaligned>(const float* from) {
377#if defined(EIGEN_GPU_HAS_LDG)
378 return make_float4(__ldg(from+0), __ldg(from+1), __ldg(from+2), __ldg(from+3));
379#else
380 return make_float4(from[0], from[1], from[2], from[3]);
381#endif
382}
383template<>
384EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double2 ploadt_ro<double2, Unaligned>(const double* from) {
385#if defined(EIGEN_GPU_HAS_LDG)
386 return make_double2(__ldg(from+0), __ldg(from+1));
387#else
388 return make_double2(from[0], from[1]);
389#endif
390}
391
392template<> EIGEN_DEVICE_FUNC inline float4 pgather<float, float4>(const float* from, Index stride) {
393 return make_float4(from[0*stride], from[1*stride], from[2*stride], from[3*stride]);
394}
395
396template<> EIGEN_DEVICE_FUNC inline double2 pgather<double, double2>(const double* from, Index stride) {
397 return make_double2(from[0*stride], from[1*stride]);
398}
399
400template<> EIGEN_DEVICE_FUNC inline void pscatter<float, float4>(float* to, const float4& from, Index stride) {
401 to[stride*0] = from.x;
402 to[stride*1] = from.y;
403 to[stride*2] = from.z;
404 to[stride*3] = from.w;
405}
406template<> EIGEN_DEVICE_FUNC inline void pscatter<double, double2>(double* to, const double2& from, Index stride) {
407 to[stride*0] = from.x;
408 to[stride*1] = from.y;
409}
410
411template<> EIGEN_DEVICE_FUNC inline float pfirst<float4>(const float4& a) {
412 return a.x;
413}
414template<> EIGEN_DEVICE_FUNC inline double pfirst<double2>(const double2& a) {
415 return a.x;
416}
417
418template<> EIGEN_DEVICE_FUNC inline float predux<float4>(const float4& a) {
419 return a.x + a.y + a.z + a.w;
420}
421template<> EIGEN_DEVICE_FUNC inline double predux<double2>(const double2& a) {
422 return a.x + a.y;
423}
424
425template<> EIGEN_DEVICE_FUNC inline float predux_max<float4>(const float4& a) {
426 return fmaxf(fmaxf(a.x, a.y), fmaxf(a.z, a.w));
427}
428template<> EIGEN_DEVICE_FUNC inline double predux_max<double2>(const double2& a) {
429 return fmax(a.x, a.y);
430}
431
432template<> EIGEN_DEVICE_FUNC inline float predux_min<float4>(const float4& a) {
433 return fminf(fminf(a.x, a.y), fminf(a.z, a.w));
434}
435template<> EIGEN_DEVICE_FUNC inline double predux_min<double2>(const double2& a) {
436 return fmin(a.x, a.y);
437}
438
439template<> EIGEN_DEVICE_FUNC inline float predux_mul<float4>(const float4& a) {
440 return a.x * a.y * a.z * a.w;
441}
442template<> EIGEN_DEVICE_FUNC inline double predux_mul<double2>(const double2& a) {
443 return a.x * a.y;
444}
445
446template<> EIGEN_DEVICE_FUNC inline float4 pabs<float4>(const float4& a) {
447 return make_float4(fabsf(a.x), fabsf(a.y), fabsf(a.z), fabsf(a.w));
448}
449template<> EIGEN_DEVICE_FUNC inline double2 pabs<double2>(const double2& a) {
450 return make_double2(fabs(a.x), fabs(a.y));
451}
452
453template<> EIGEN_DEVICE_FUNC inline float4 pfloor<float4>(const float4& a) {
454 return make_float4(floorf(a.x), floorf(a.y), floorf(a.z), floorf(a.w));
455}
456template<> EIGEN_DEVICE_FUNC inline double2 pfloor<double2>(const double2& a) {
457 return make_double2(floor(a.x), floor(a.y));
458}
459
460EIGEN_DEVICE_FUNC inline void
461ptranspose(PacketBlock<float4,4>& kernel) {
462 float tmp = kernel.packet[0].y;
463 kernel.packet[0].y = kernel.packet[1].x;
464 kernel.packet[1].x = tmp;
465
466 tmp = kernel.packet[0].z;
467 kernel.packet[0].z = kernel.packet[2].x;
468 kernel.packet[2].x = tmp;
469
470 tmp = kernel.packet[0].w;
471 kernel.packet[0].w = kernel.packet[3].x;
472 kernel.packet[3].x = tmp;
473
474 tmp = kernel.packet[1].z;
475 kernel.packet[1].z = kernel.packet[2].y;
476 kernel.packet[2].y = tmp;
477
478 tmp = kernel.packet[1].w;
479 kernel.packet[1].w = kernel.packet[3].y;
480 kernel.packet[3].y = tmp;
481
482 tmp = kernel.packet[2].w;
483 kernel.packet[2].w = kernel.packet[3].z;
484 kernel.packet[3].z = tmp;
485}
486
487EIGEN_DEVICE_FUNC inline void
488ptranspose(PacketBlock<double2,2>& kernel) {
489 double tmp = kernel.packet[0].y;
490 kernel.packet[0].y = kernel.packet[1].x;
491 kernel.packet[1].x = tmp;
492}
493
494#endif // defined(EIGEN_GPUCC) && defined(EIGEN_USE_GPU)
495
496// Packet4h2 must be defined in the macro without EIGEN_CUDA_ARCH, meaning
497// its corresponding packet_traits<Eigen::half> must be visible on host.
498#if defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
499
500typedef ulonglong2 Packet4h2;
501template<> struct unpacket_traits<Packet4h2> { typedef Eigen::half type; enum {size=8, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef Packet4h2 half; };
502template<> struct is_arithmetic<Packet4h2> { enum { value = true }; };
503
504template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=2, alignment=Aligned16, vectorizable=true, masked_load_available=false, masked_store_available=false}; typedef half2 half; };
505template<> struct is_arithmetic<half2> { enum { value = true }; };
506
507template<> struct packet_traits<Eigen::half> : default_packet_traits
508{
509 typedef Packet4h2 type;
510 typedef Packet4h2 half;
511 enum {
512 Vectorizable = 1,
513 AlignedOnScalar = 1,
514 size=8,
515 HasHalfPacket = 0,
516 HasAdd = 1,
517 HasSub = 1,
518 HasMul = 1,
519 HasDiv = 1,
520 HasSqrt = 1,
521 HasRsqrt = 1,
522 HasExp = 1,
523 HasExpm1 = 1,
524 HasLog = 1,
525 HasLog1p = 1
526 };
527};
528
529namespace {
530// This is equivalent to make_half2, which is undocumented and doesn't seem to always exist.
531EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 combine_half(const __half& a, const __half& b) {
532#if defined(EIGEN_GPU_COMPILE_PHASE)
533 return __halves2half2(a, b);
534#else
535 // Round-about way since __halves2half2 is a __device__ function.
536 return __floats2half2_rn(__half2float(a), __half2float(b));
537#endif
538}
539
540EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_low(const half2& a) {
541#if defined(EIGEN_GPU_COMPILE_PHASE)
542 return __low2half(a);
543#else
544 return __float2half(__low2float(a));
545#endif
546}
547
548EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE __half get_half2_high(const half2& a) {
549#if defined(EIGEN_GPU_COMPILE_PHASE)
550 return __high2half(a);
551#else
552 return __float2half(__high2float(a));
553#endif
554}
555} // namespace
556
557template<>
558EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
559#if defined(EIGEN_GPU_COMPILE_PHASE)
560 return __half2half2(from);
561#else
562 const float f = __half2float(from);
563 return __floats2half2_rn(f, f);
564#endif
565}
566
567template <>
568EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
569pset1<Packet4h2>(const Eigen::half& from) {
570 Packet4h2 r;
571 half2* p_alias = reinterpret_cast<half2*>(&r);
572 p_alias[0] = pset1<half2>(from);
573 p_alias[1] = pset1<half2>(from);
574 p_alias[2] = pset1<half2>(from);
575 p_alias[3] = pset1<half2>(from);
576 return r;
577}
578
579// We now need this visible on both host and device.
580// #if defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
581namespace {
582
583EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pload(const Eigen::half* from) {
584 return *reinterpret_cast<const half2*>(from);
585}
586
587EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploadu(const Eigen::half* from) {
588 return combine_half(from[0], from[1]);
589}
590
591EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ploaddup(const Eigen::half* from) {
592 return combine_half(from[0], from[0]);
593}
594
595EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore(Eigen::half* to,
596 const half2& from) {
597 *reinterpret_cast<half2*>(to) = from;
598}
599
600EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu(Eigen::half* to,
601 const half2& from) {
602 to[0] = get_half2_low(from);
603 to[1] = get_half2_high(from);
604}
605
606
607EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_aligned(
608 const Eigen::half* from) {
609#if defined(EIGEN_GPU_HAS_LDG)
610 // Input is guaranteed to be properly aligned.
611 return __ldg(reinterpret_cast<const half2*>(from));
612#else
613 return combine_half(*(from+0), *(from+1));
614#endif
615}
616
617EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE half2 ploadt_ro_unaligned(
618 const Eigen::half* from) {
619#if defined(EIGEN_GPU_HAS_LDG)
620 return __halves2half2(__ldg(from+0), __ldg(from+1));
621#else
622 return combine_half(*(from+0), *(from+1));
623#endif
624}
625
626EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pgather(const Eigen::half* from,
627 Index stride) {
628 return combine_half(from[0*stride], from[1*stride]);
629}
630
631EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter(
632 Eigen::half* to, const half2& from, Index stride) {
633 to[stride*0] = get_half2_low(from);
634 to[stride*1] = get_half2_high(from);
635}
636
637EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& a) {
638 return get_half2_low(a);
639}
640
641EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pabs(const half2& a) {
642 half a1 = get_half2_low(a);
643 half a2 = get_half2_high(a);
644 half result1 = half_impl::raw_uint16_to_half(a1.x & 0x7FFF);
645 half result2 = half_impl::raw_uint16_to_half(a2.x & 0x7FFF);
646 return combine_half(result1, result2);
647}
648
649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 ptrue(const half2& /*a*/) {
650 half true_half = half_impl::raw_uint16_to_half(0xffffu);
651 return pset1<half2>(true_half);
652}
653
654EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pzero(const half2& /*a*/) {
655 half false_half = half_impl::raw_uint16_to_half(0x0000u);
656 return pset1<half2>(false_half);
657}
658
659EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
660ptranspose(PacketBlock<half2,2>& kernel) {
661 __half a1 = get_half2_low(kernel.packet[0]);
662 __half a2 = get_half2_high(kernel.packet[0]);
663 __half b1 = get_half2_low(kernel.packet[1]);
664 __half b2 = get_half2_high(kernel.packet[1]);
665 kernel.packet[0] = combine_half(a1, b1);
666 kernel.packet[1] = combine_half(a2, b2);
667}
668
669EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plset(const Eigen::half& a) {
670#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
671 return __halves2half2(a, __hadd(a, __float2half(1.0f)));
672#else
673 float f = __half2float(a) + 1.0f;
674 return combine_half(a, __float2half(f));
675#endif
676}
677
678EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pselect(const half2& mask,
679 const half2& a,
680 const half2& b) {
681 half mask_low = get_half2_low(mask);
682 half mask_high = get_half2_high(mask);
683 half result_low = mask_low == half(0) ? get_half2_low(b) : get_half2_low(a);
684 half result_high = mask_high == half(0) ? get_half2_high(b) : get_half2_high(a);
685 return combine_half(result_low, result_high);
686}
687
688EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_eq(const half2& a,
689 const half2& b) {
690 half true_half = half_impl::raw_uint16_to_half(0xffffu);
691 half false_half = half_impl::raw_uint16_to_half(0x0000u);
692 half a1 = get_half2_low(a);
693 half a2 = get_half2_high(a);
694 half b1 = get_half2_low(b);
695 half b2 = get_half2_high(b);
696 half eq1 = __half2float(a1) == __half2float(b1) ? true_half : false_half;
697 half eq2 = __half2float(a2) == __half2float(b2) ? true_half : false_half;
698 return combine_half(eq1, eq2);
699}
700
701EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pcmp_lt(const half2& a,
702 const half2& b) {
703 half true_half = half_impl::raw_uint16_to_half(0xffffu);
704 half false_half = half_impl::raw_uint16_to_half(0x0000u);
705 half a1 = get_half2_low(a);
706 half a2 = get_half2_high(a);
707 half b1 = get_half2_low(b);
708 half b2 = get_half2_high(b);
709 half eq1 = __half2float(a1) < __half2float(b1) ? true_half : false_half;
710 half eq2 = __half2float(a2) < __half2float(b2) ? true_half : false_half;
711 return combine_half(eq1, eq2);
712}
713
714EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pand(const half2& a,
715 const half2& b) {
716 half a1 = get_half2_low(a);
717 half a2 = get_half2_high(a);
718 half b1 = get_half2_low(b);
719 half b2 = get_half2_high(b);
720 half result1 = half_impl::raw_uint16_to_half(a1.x & b1.x);
721 half result2 = half_impl::raw_uint16_to_half(a2.x & b2.x);
722 return combine_half(result1, result2);
723}
724
725EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 por(const half2& a,
726 const half2& b) {
727 half a1 = get_half2_low(a);
728 half a2 = get_half2_high(a);
729 half b1 = get_half2_low(b);
730 half b2 = get_half2_high(b);
731 half result1 = half_impl::raw_uint16_to_half(a1.x | b1.x);
732 half result2 = half_impl::raw_uint16_to_half(a2.x | b2.x);
733 return combine_half(result1, result2);
734}
735
736EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pxor(const half2& a,
737 const half2& b) {
738 half a1 = get_half2_low(a);
739 half a2 = get_half2_high(a);
740 half b1 = get_half2_low(b);
741 half b2 = get_half2_high(b);
742 half result1 = half_impl::raw_uint16_to_half(a1.x ^ b1.x);
743 half result2 = half_impl::raw_uint16_to_half(a2.x ^ b2.x);
744 return combine_half(result1, result2);
745}
746
747EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pandnot(const half2& a,
748 const half2& b) {
749 half a1 = get_half2_low(a);
750 half a2 = get_half2_high(a);
751 half b1 = get_half2_low(b);
752 half b2 = get_half2_high(b);
753 half result1 = half_impl::raw_uint16_to_half(a1.x & ~b1.x);
754 half result2 = half_impl::raw_uint16_to_half(a2.x & ~b2.x);
755 return combine_half(result1, result2);
756}
757
758EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd(const half2& a,
759 const half2& b) {
760#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
761 return __hadd2(a, b);
762#else
763 float a1 = __low2float(a);
764 float a2 = __high2float(a);
765 float b1 = __low2float(b);
766 float b2 = __high2float(b);
767 float r1 = a1 + b1;
768 float r2 = a2 + b2;
769 return __floats2half2_rn(r1, r2);
770#endif
771}
772
773EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psub(const half2& a,
774 const half2& b) {
775#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
776 return __hsub2(a, b);
777#else
778 float a1 = __low2float(a);
779 float a2 = __high2float(a);
780 float b1 = __low2float(b);
781 float b2 = __high2float(b);
782 float r1 = a1 - b1;
783 float r2 = a2 - b2;
784 return __floats2half2_rn(r1, r2);
785#endif
786}
787
788EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
789#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
790 return __hneg2(a);
791#else
792 float a1 = __low2float(a);
793 float a2 = __high2float(a);
794 return __floats2half2_rn(-a1, -a2);
795#endif
796}
797
798EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
799
800EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul(const half2& a,
801 const half2& b) {
802#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
803 return __hmul2(a, b);
804#else
805 float a1 = __low2float(a);
806 float a2 = __high2float(a);
807 float b1 = __low2float(b);
808 float b2 = __high2float(b);
809 float r1 = a1 * b1;
810 float r2 = a2 * b2;
811 return __floats2half2_rn(r1, r2);
812#endif
813}
814
815EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmadd(const half2& a,
816 const half2& b,
817 const half2& c) {
818#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
819 return __hfma2(a, b, c);
820#else
821 float a1 = __low2float(a);
822 float a2 = __high2float(a);
823 float b1 = __low2float(b);
824 float b2 = __high2float(b);
825 float c1 = __low2float(c);
826 float c2 = __high2float(c);
827 float r1 = a1 * b1 + c1;
828 float r2 = a2 * b2 + c2;
829 return __floats2half2_rn(r1, r2);
830#endif
831}
832
833EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv(const half2& a,
834 const half2& b) {
835#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
836 return __h2div(a, b);
837#else
838 float a1 = __low2float(a);
839 float a2 = __high2float(a);
840 float b1 = __low2float(b);
841 float b2 = __high2float(b);
842 float r1 = a1 / b1;
843 float r2 = a2 / b2;
844 return __floats2half2_rn(r1, r2);
845#endif
846}
847
848EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin(const half2& a,
849 const half2& b) {
850 float a1 = __low2float(a);
851 float a2 = __high2float(a);
852 float b1 = __low2float(b);
853 float b2 = __high2float(b);
854 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
855 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
856 return combine_half(r1, r2);
857}
858
859EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax(const half2& a,
860 const half2& b) {
861 float a1 = __low2float(a);
862 float a2 = __high2float(a);
863 float b1 = __low2float(b);
864 float b2 = __high2float(b);
865 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
866 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
867 return combine_half(r1, r2);
868}
869
870EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux(const half2& a) {
871#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
872 return __hadd(__low2half(a), __high2half(a));
873#else
874 float a1 = __low2float(a);
875 float a2 = __high2float(a);
876 return Eigen::half(__float2half(a1 + a2));
877#endif
878}
879
880EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max(const half2& a) {
881#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
882 __half first = __low2half(a);
883 __half second = __high2half(a);
884 return __hgt(first, second) ? first : second;
885#else
886 float a1 = __low2float(a);
887 float a2 = __high2float(a);
888 return a1 > a2 ? get_half2_low(a) : get_half2_high(a);
889#endif
890}
891
892EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min(const half2& a) {
893#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
894 __half first = __low2half(a);
895 __half second = __high2half(a);
896 return __hlt(first, second) ? first : second;
897#else
898 float a1 = __low2float(a);
899 float a2 = __high2float(a);
900 return a1 < a2 ? get_half2_low(a) : get_half2_high(a);
901#endif
902}
903
904EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul(const half2& a) {
905#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
906 return __hmul(__low2half(a), __high2half(a));
907#else
908 float a1 = __low2float(a);
909 float a2 = __high2float(a);
910 return Eigen::half(__float2half(a1 * a2));
911#endif
912}
913
914EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog1p(const half2& a) {
915 float a1 = __low2float(a);
916 float a2 = __high2float(a);
917 float r1 = log1pf(a1);
918 float r2 = log1pf(a2);
919 return __floats2half2_rn(r1, r2);
920}
921
922EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexpm1(const half2& a) {
923 float a1 = __low2float(a);
924 float a2 = __high2float(a);
925 float r1 = expm1f(a1);
926 float r2 = expm1f(a2);
927 return __floats2half2_rn(r1, r2);
928}
929
930#if (EIGEN_CUDA_SDK_VER >= 80000 && defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)) || \
931 defined(EIGEN_HIP_DEVICE_COMPILE)
932
933EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
934half2 plog(const half2& a) {
935 return h2log(a);
936}
937
938 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
939half2 pexp(const half2& a) {
940 return h2exp(a);
941}
942
943 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
944half2 psqrt(const half2& a) {
945 return h2sqrt(a);
946}
947
948 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
949half2 prsqrt(const half2& a) {
950 return h2rsqrt(a);
951}
952
953#else
954
955EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 plog(const half2& a) {
956 float a1 = __low2float(a);
957 float a2 = __high2float(a);
958 float r1 = logf(a1);
959 float r2 = logf(a2);
960 return __floats2half2_rn(r1, r2);
961}
962
963EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pexp(const half2& a) {
964 float a1 = __low2float(a);
965 float a2 = __high2float(a);
966 float r1 = expf(a1);
967 float r2 = expf(a2);
968 return __floats2half2_rn(r1, r2);
969}
970
971EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 psqrt(const half2& a) {
972 float a1 = __low2float(a);
973 float a2 = __high2float(a);
974 float r1 = sqrtf(a1);
975 float r2 = sqrtf(a2);
976 return __floats2half2_rn(r1, r2);
977}
978
979EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 prsqrt(const half2& a) {
980 float a1 = __low2float(a);
981 float a2 = __high2float(a);
982 float r1 = rsqrtf(a1);
983 float r2 = rsqrtf(a2);
984 return __floats2half2_rn(r1, r2);
985}
986#endif
987} // namespace
988
989template <>
990EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
991pload<Packet4h2>(const Eigen::half* from) {
992 return *reinterpret_cast<const Packet4h2*>(from);
993}
994
995// unaligned load;
996template <>
997EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
998ploadu<Packet4h2>(const Eigen::half* from) {
999 Packet4h2 r;
1000 half2* p_alias = reinterpret_cast<half2*>(&r);
1001 p_alias[0] = ploadu(from + 0);
1002 p_alias[1] = ploadu(from + 2);
1003 p_alias[2] = ploadu(from + 4);
1004 p_alias[3] = ploadu(from + 6);
1005 return r;
1006}
1007
1008template <>
1009EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1010ploaddup<Packet4h2>(const Eigen::half* from) {
1011 Packet4h2 r;
1012 half2* p_alias = reinterpret_cast<half2*>(&r);
1013 p_alias[0] = ploaddup(from + 0);
1014 p_alias[1] = ploaddup(from + 1);
1015 p_alias[2] = ploaddup(from + 2);
1016 p_alias[3] = ploaddup(from + 3);
1017 return r;
1018}
1019
1020template <>
1021EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstore<Eigen::half>(
1022 Eigen::half* to, const Packet4h2& from) {
1023 *reinterpret_cast<Packet4h2*>(to) = from;
1024}
1025
1026template <>
1027EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(
1028 Eigen::half* to, const Packet4h2& from) {
1029 const half2* from_alias = reinterpret_cast<const half2*>(&from);
1030 pstoreu(to + 0,from_alias[0]);
1031 pstoreu(to + 2,from_alias[1]);
1032 pstoreu(to + 4,from_alias[2]);
1033 pstoreu(to + 6,from_alias[3]);
1034}
1035
1036template <>
1037EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1038ploadt_ro<Packet4h2, Aligned>(const Eigen::half* from) {
1039#if defined(EIGEN_GPU_HAS_LDG)
1040 Packet4h2 r;
1041 r = __ldg(reinterpret_cast<const Packet4h2*>(from));
1042 return r;
1043#else
1044 Packet4h2 r;
1045 half2* r_alias = reinterpret_cast<half2*>(&r);
1046 r_alias[0] = ploadt_ro_aligned(from + 0);
1047 r_alias[1] = ploadt_ro_aligned(from + 2);
1048 r_alias[2] = ploadt_ro_aligned(from + 4);
1049 r_alias[3] = ploadt_ro_aligned(from + 6);
1050 return r;
1051#endif
1052}
1053
1054template <>
1055EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet4h2
1056ploadt_ro<Packet4h2, Unaligned>(const Eigen::half* from) {
1057 Packet4h2 r;
1058 half2* r_alias = reinterpret_cast<half2*>(&r);
1059 r_alias[0] = ploadt_ro_unaligned(from + 0);
1060 r_alias[1] = ploadt_ro_unaligned(from + 2);
1061 r_alias[2] = ploadt_ro_unaligned(from + 4);
1062 r_alias[3] = ploadt_ro_unaligned(from + 6);
1063 return r;
1064}
1065
1066template <>
1067EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1068pgather<Eigen::half, Packet4h2>(const Eigen::half* from, Index stride) {
1069 Packet4h2 r;
1070 half2* p_alias = reinterpret_cast<half2*>(&r);
1071 p_alias[0] = combine_half(from[0 * stride], from[1 * stride]);
1072 p_alias[1] = combine_half(from[2 * stride], from[3 * stride]);
1073 p_alias[2] = combine_half(from[4 * stride], from[5 * stride]);
1074 p_alias[3] = combine_half(from[6 * stride], from[7 * stride]);
1075 return r;
1076}
1077
1078template <>
1079EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h2>(
1080 Eigen::half* to, const Packet4h2& from, Index stride) {
1081 const half2* from_alias = reinterpret_cast<const half2*>(&from);
1082 pscatter(to + stride * 0, from_alias[0], stride);
1083 pscatter(to + stride * 2, from_alias[1], stride);
1084 pscatter(to + stride * 4, from_alias[2], stride);
1085 pscatter(to + stride * 6, from_alias[3], stride);
1086}
1087
1088template <>
1089EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h2>(
1090 const Packet4h2& a) {
1091 return pfirst(*(reinterpret_cast<const half2*>(&a)));
1092}
1093
1094template <>
1095EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pabs<Packet4h2>(
1096 const Packet4h2& a) {
1097 Packet4h2 r;
1098 half2* p_alias = reinterpret_cast<half2*>(&r);
1099 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1100 p_alias[0] = pabs(a_alias[0]);
1101 p_alias[1] = pabs(a_alias[1]);
1102 p_alias[2] = pabs(a_alias[2]);
1103 p_alias[3] = pabs(a_alias[3]);
1104 return r;
1105}
1106
1107template <>
1108EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 ptrue<Packet4h2>(
1109 const Packet4h2& /*a*/) {
1110 half true_half = half_impl::raw_uint16_to_half(0xffffu);
1111 return pset1<Packet4h2>(true_half);
1112}
1113
1114template <>
1115EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pzero<Packet4h2>(const Packet4h2& /*a*/) {
1116 half false_half = half_impl::raw_uint16_to_half(0x0000u);
1117 return pset1<Packet4h2>(false_half);
1118}
1119
1120EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_double(
1121 double* d_row0, double* d_row1, double* d_row2, double* d_row3,
1122 double* d_row4, double* d_row5, double* d_row6, double* d_row7) {
1123 double d_tmp;
1124 d_tmp = d_row0[1];
1125 d_row0[1] = d_row4[0];
1126 d_row4[0] = d_tmp;
1127
1128 d_tmp = d_row1[1];
1129 d_row1[1] = d_row5[0];
1130 d_row5[0] = d_tmp;
1131
1132 d_tmp = d_row2[1];
1133 d_row2[1] = d_row6[0];
1134 d_row6[0] = d_tmp;
1135
1136 d_tmp = d_row3[1];
1137 d_row3[1] = d_row7[0];
1138 d_row7[0] = d_tmp;
1139}
1140
1141EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void ptranspose_half2(
1142 half2* f_row0, half2* f_row1, half2* f_row2, half2* f_row3) {
1143 half2 f_tmp;
1144 f_tmp = f_row0[1];
1145 f_row0[1] = f_row2[0];
1146 f_row2[0] = f_tmp;
1147
1148 f_tmp = f_row1[1];
1149 f_row1[1] = f_row3[0];
1150 f_row3[0] = f_tmp;
1151}
1152
1153EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1154ptranspose_half(half2& f0, half2& f1) {
1155 __half a1 = get_half2_low(f0);
1156 __half a2 = get_half2_high(f0);
1157 __half b1 = get_half2_low(f1);
1158 __half b2 = get_half2_high(f1);
1159 f0 = combine_half(a1, b1);
1160 f1 = combine_half(a2, b2);
1161}
1162
1163EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void
1164ptranspose(PacketBlock<Packet4h2,8>& kernel) {
1165 double* d_row0 = reinterpret_cast<double*>(&kernel.packet[0]);
1166 double* d_row1 = reinterpret_cast<double*>(&kernel.packet[1]);
1167 double* d_row2 = reinterpret_cast<double*>(&kernel.packet[2]);
1168 double* d_row3 = reinterpret_cast<double*>(&kernel.packet[3]);
1169 double* d_row4 = reinterpret_cast<double*>(&kernel.packet[4]);
1170 double* d_row5 = reinterpret_cast<double*>(&kernel.packet[5]);
1171 double* d_row6 = reinterpret_cast<double*>(&kernel.packet[6]);
1172 double* d_row7 = reinterpret_cast<double*>(&kernel.packet[7]);
1173 ptranspose_double(d_row0, d_row1, d_row2, d_row3,
1174 d_row4, d_row5, d_row6, d_row7);
1175
1176
1177 half2* f_row0 = reinterpret_cast<half2*>(d_row0);
1178 half2* f_row1 = reinterpret_cast<half2*>(d_row1);
1179 half2* f_row2 = reinterpret_cast<half2*>(d_row2);
1180 half2* f_row3 = reinterpret_cast<half2*>(d_row3);
1181 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1182 ptranspose_half(f_row0[0], f_row1[0]);
1183 ptranspose_half(f_row0[1], f_row1[1]);
1184 ptranspose_half(f_row2[0], f_row3[0]);
1185 ptranspose_half(f_row2[1], f_row3[1]);
1186
1187 f_row0 = reinterpret_cast<half2*>(d_row0 + 1);
1188 f_row1 = reinterpret_cast<half2*>(d_row1 + 1);
1189 f_row2 = reinterpret_cast<half2*>(d_row2 + 1);
1190 f_row3 = reinterpret_cast<half2*>(d_row3 + 1);
1191 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1192 ptranspose_half(f_row0[0], f_row1[0]);
1193 ptranspose_half(f_row0[1], f_row1[1]);
1194 ptranspose_half(f_row2[0], f_row3[0]);
1195 ptranspose_half(f_row2[1], f_row3[1]);
1196
1197 f_row0 = reinterpret_cast<half2*>(d_row4);
1198 f_row1 = reinterpret_cast<half2*>(d_row5);
1199 f_row2 = reinterpret_cast<half2*>(d_row6);
1200 f_row3 = reinterpret_cast<half2*>(d_row7);
1201 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1202 ptranspose_half(f_row0[0], f_row1[0]);
1203 ptranspose_half(f_row0[1], f_row1[1]);
1204 ptranspose_half(f_row2[0], f_row3[0]);
1205 ptranspose_half(f_row2[1], f_row3[1]);
1206
1207 f_row0 = reinterpret_cast<half2*>(d_row4 + 1);
1208 f_row1 = reinterpret_cast<half2*>(d_row5 + 1);
1209 f_row2 = reinterpret_cast<half2*>(d_row6 + 1);
1210 f_row3 = reinterpret_cast<half2*>(d_row7 + 1);
1211 ptranspose_half2(f_row0, f_row1, f_row2, f_row3);
1212 ptranspose_half(f_row0[0], f_row1[0]);
1213 ptranspose_half(f_row0[1], f_row1[1]);
1214 ptranspose_half(f_row2[0], f_row3[0]);
1215 ptranspose_half(f_row2[1], f_row3[1]);
1216
1217}
1218
1219template <>
1220EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1221plset<Packet4h2>(const Eigen::half& a) {
1222#if defined(EIGEN_HIP_DEVICE_COMPILE)
1223
1224 Packet4h2 r;
1225 half2* p_alias = reinterpret_cast<half2*>(&r);
1226 p_alias[0] = __halves2half2(a, __hadd(a, __float2half(1.0f)));
1227 p_alias[1] = __halves2half2(__hadd(a, __float2half(2.0f)),
1228 __hadd(a, __float2half(3.0f)));
1229 p_alias[2] = __halves2half2(__hadd(a, __float2half(4.0f)),
1230 __hadd(a, __float2half(5.0f)));
1231 p_alias[3] = __halves2half2(__hadd(a, __float2half(6.0f)),
1232 __hadd(a, __float2half(7.0f)));
1233 return r;
1234#elif defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1235 Packet4h2 r;
1236 half2* r_alias = reinterpret_cast<half2*>(&r);
1237
1238 half2 b = pset1<half2>(a);
1239 half2 c;
1240 half2 half_offset0 = __halves2half2(__float2half(0.0f),__float2half(2.0f));
1241 half2 half_offset1 = __halves2half2(__float2half(4.0f),__float2half(6.0f));
1242
1243 c = __hadd2(b, half_offset0);
1244 r_alias[0] = plset(__low2half(c));
1245 r_alias[1] = plset(__high2half(c));
1246
1247 c = __hadd2(b, half_offset1);
1248 r_alias[2] = plset(__low2half(c));
1249 r_alias[3] = plset(__high2half(c));
1250
1251 return r;
1252
1253#else
1254 float f = __half2float(a);
1255 Packet4h2 r;
1256 half2* p_alias = reinterpret_cast<half2*>(&r);
1257 p_alias[0] = combine_half(a, __float2half(f + 1.0f));
1258 p_alias[1] = combine_half(__float2half(f + 2.0f), __float2half(f + 3.0f));
1259 p_alias[2] = combine_half(__float2half(f + 4.0f), __float2half(f + 5.0f));
1260 p_alias[3] = combine_half(__float2half(f + 6.0f), __float2half(f + 7.0f));
1261 return r;
1262#endif
1263}
1264
1265template <>
1266EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1267pselect<Packet4h2>(const Packet4h2& mask, const Packet4h2& a,
1268 const Packet4h2& b) {
1269 Packet4h2 r;
1270 half2* r_alias = reinterpret_cast<half2*>(&r);
1271 const half2* mask_alias = reinterpret_cast<const half2*>(&mask);
1272 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1273 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1274 r_alias[0] = pselect(mask_alias[0], a_alias[0], b_alias[0]);
1275 r_alias[1] = pselect(mask_alias[1], a_alias[1], b_alias[1]);
1276 r_alias[2] = pselect(mask_alias[2], a_alias[2], b_alias[2]);
1277 r_alias[3] = pselect(mask_alias[3], a_alias[3], b_alias[3]);
1278 return r;
1279}
1280
1281template <>
1282EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1283pcmp_eq<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1284 Packet4h2 r;
1285 half2* r_alias = reinterpret_cast<half2*>(&r);
1286 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1287 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1288 r_alias[0] = pcmp_eq(a_alias[0], b_alias[0]);
1289 r_alias[1] = pcmp_eq(a_alias[1], b_alias[1]);
1290 r_alias[2] = pcmp_eq(a_alias[2], b_alias[2]);
1291 r_alias[3] = pcmp_eq(a_alias[3], b_alias[3]);
1292 return r;
1293}
1294
1295template <>
1296EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pand<Packet4h2>(
1297 const Packet4h2& a, const Packet4h2& b) {
1298 Packet4h2 r;
1299 half2* r_alias = reinterpret_cast<half2*>(&r);
1300 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1301 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1302 r_alias[0] = pand(a_alias[0], b_alias[0]);
1303 r_alias[1] = pand(a_alias[1], b_alias[1]);
1304 r_alias[2] = pand(a_alias[2], b_alias[2]);
1305 r_alias[3] = pand(a_alias[3], b_alias[3]);
1306 return r;
1307}
1308
1309template <>
1310EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 por<Packet4h2>(
1311 const Packet4h2& a, const Packet4h2& b) {
1312 Packet4h2 r;
1313 half2* r_alias = reinterpret_cast<half2*>(&r);
1314 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1315 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1316 r_alias[0] = por(a_alias[0], b_alias[0]);
1317 r_alias[1] = por(a_alias[1], b_alias[1]);
1318 r_alias[2] = por(a_alias[2], b_alias[2]);
1319 r_alias[3] = por(a_alias[3], b_alias[3]);
1320 return r;
1321}
1322
1323template <>
1324EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pxor<Packet4h2>(
1325 const Packet4h2& a, const Packet4h2& b) {
1326 Packet4h2 r;
1327 half2* r_alias = reinterpret_cast<half2*>(&r);
1328 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1329 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1330 r_alias[0] = pxor(a_alias[0], b_alias[0]);
1331 r_alias[1] = pxor(a_alias[1], b_alias[1]);
1332 r_alias[2] = pxor(a_alias[2], b_alias[2]);
1333 r_alias[3] = pxor(a_alias[3], b_alias[3]);
1334 return r;
1335}
1336
1337template <>
1338EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1339pandnot<Packet4h2>(const Packet4h2& a, const Packet4h2& b) {
1340 Packet4h2 r;
1341 half2* r_alias = reinterpret_cast<half2*>(&r);
1342 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1343 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1344 r_alias[0] = pandnot(a_alias[0], b_alias[0]);
1345 r_alias[1] = pandnot(a_alias[1], b_alias[1]);
1346 r_alias[2] = pandnot(a_alias[2], b_alias[2]);
1347 r_alias[3] = pandnot(a_alias[3], b_alias[3]);
1348 return r;
1349}
1350
1351template <>
1352EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 padd<Packet4h2>(
1353 const Packet4h2& a, const Packet4h2& b) {
1354 Packet4h2 r;
1355 half2* r_alias = reinterpret_cast<half2*>(&r);
1356 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1357 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1358 r_alias[0] = padd(a_alias[0], b_alias[0]);
1359 r_alias[1] = padd(a_alias[1], b_alias[1]);
1360 r_alias[2] = padd(a_alias[2], b_alias[2]);
1361 r_alias[3] = padd(a_alias[3], b_alias[3]);
1362 return r;
1363}
1364
1365template <>
1366EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psub<Packet4h2>(
1367 const Packet4h2& a, const Packet4h2& b) {
1368 Packet4h2 r;
1369 half2* r_alias = reinterpret_cast<half2*>(&r);
1370 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1371 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1372 r_alias[0] = psub(a_alias[0], b_alias[0]);
1373 r_alias[1] = psub(a_alias[1], b_alias[1]);
1374 r_alias[2] = psub(a_alias[2], b_alias[2]);
1375 r_alias[3] = psub(a_alias[3], b_alias[3]);
1376 return r;
1377}
1378
1379template <>
1380EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pnegate(const Packet4h2& a) {
1381 Packet4h2 r;
1382 half2* r_alias = reinterpret_cast<half2*>(&r);
1383 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1384 r_alias[0] = pnegate(a_alias[0]);
1385 r_alias[1] = pnegate(a_alias[1]);
1386 r_alias[2] = pnegate(a_alias[2]);
1387 r_alias[3] = pnegate(a_alias[3]);
1388 return r;
1389}
1390
1391template <>
1392EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pconj(const Packet4h2& a) {
1393 return a;
1394}
1395
1396template <>
1397EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmul<Packet4h2>(
1398 const Packet4h2& a, const Packet4h2& b) {
1399 Packet4h2 r;
1400 half2* r_alias = reinterpret_cast<half2*>(&r);
1401 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1402 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1403 r_alias[0] = pmul(a_alias[0], b_alias[0]);
1404 r_alias[1] = pmul(a_alias[1], b_alias[1]);
1405 r_alias[2] = pmul(a_alias[2], b_alias[2]);
1406 r_alias[3] = pmul(a_alias[3], b_alias[3]);
1407 return r;
1408}
1409
1410template <>
1411EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmadd<Packet4h2>(
1412 const Packet4h2& a, const Packet4h2& b, const Packet4h2& c) {
1413 Packet4h2 r;
1414 half2* r_alias = reinterpret_cast<half2*>(&r);
1415 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1416 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1417 const half2* c_alias = reinterpret_cast<const half2*>(&c);
1418 r_alias[0] = pmadd(a_alias[0], b_alias[0], c_alias[0]);
1419 r_alias[1] = pmadd(a_alias[1], b_alias[1], c_alias[1]);
1420 r_alias[2] = pmadd(a_alias[2], b_alias[2], c_alias[2]);
1421 r_alias[3] = pmadd(a_alias[3], b_alias[3], c_alias[3]);
1422 return r;
1423}
1424
1425template <>
1426EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pdiv<Packet4h2>(
1427 const Packet4h2& a, const Packet4h2& b) {
1428 Packet4h2 r;
1429 half2* r_alias = reinterpret_cast<half2*>(&r);
1430 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1431 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1432 r_alias[0] = pdiv(a_alias[0], b_alias[0]);
1433 r_alias[1] = pdiv(a_alias[1], b_alias[1]);
1434 r_alias[2] = pdiv(a_alias[2], b_alias[2]);
1435 r_alias[3] = pdiv(a_alias[3], b_alias[3]);
1436 return r;
1437}
1438
1439template <>
1440EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmin<Packet4h2>(
1441 const Packet4h2& a, const Packet4h2& b) {
1442 Packet4h2 r;
1443 half2* r_alias = reinterpret_cast<half2*>(&r);
1444 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1445 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1446 r_alias[0] = pmin(a_alias[0], b_alias[0]);
1447 r_alias[1] = pmin(a_alias[1], b_alias[1]);
1448 r_alias[2] = pmin(a_alias[2], b_alias[2]);
1449 r_alias[3] = pmin(a_alias[3], b_alias[3]);
1450 return r;
1451}
1452
1453template <>
1454EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pmax<Packet4h2>(
1455 const Packet4h2& a, const Packet4h2& b) {
1456 Packet4h2 r;
1457 half2* r_alias = reinterpret_cast<half2*>(&r);
1458 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1459 const half2* b_alias = reinterpret_cast<const half2*>(&b);
1460 r_alias[0] = pmax(a_alias[0], b_alias[0]);
1461 r_alias[1] = pmax(a_alias[1], b_alias[1]);
1462 r_alias[2] = pmax(a_alias[2], b_alias[2]);
1463 r_alias[3] = pmax(a_alias[3], b_alias[3]);
1464 return r;
1465}
1466
1467template <>
1468EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux<Packet4h2>(
1469 const Packet4h2& a) {
1470 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1471
1472 return predux(a_alias[0]) + predux(a_alias[1]) +
1473 predux(a_alias[2]) + predux(a_alias[3]);
1474}
1475
1476template <>
1477EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_max<Packet4h2>(
1478 const Packet4h2& a) {
1479 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1480 half2 m0 = combine_half(predux_max(a_alias[0]),
1481 predux_max(a_alias[1]));
1482 half2 m1 = combine_half(predux_max(a_alias[2]),
1483 predux_max(a_alias[3]));
1484 __half first = predux_max(m0);
1485 __half second = predux_max(m1);
1486#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1487 return (__hgt(first, second) ? first : second);
1488#else
1489 float ffirst = __half2float(first);
1490 float fsecond = __half2float(second);
1491 return (ffirst > fsecond)? first: second;
1492#endif
1493}
1494
1495template <>
1496EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_min<Packet4h2>(
1497 const Packet4h2& a) {
1498 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1499 half2 m0 = combine_half(predux_min(a_alias[0]),
1500 predux_min(a_alias[1]));
1501 half2 m1 = combine_half(predux_min(a_alias[2]),
1502 predux_min(a_alias[3]));
1503 __half first = predux_min(m0);
1504 __half second = predux_min(m1);
1505#if defined(EIGEN_CUDA_HAS_FP16_ARITHMETIC)
1506 return (__hlt(first, second) ? first : second);
1507#else
1508 float ffirst = __half2float(first);
1509 float fsecond = __half2float(second);
1510 return (ffirst < fsecond)? first: second;
1511#endif
1512}
1513
1514// likely overflow/underflow
1515template <>
1516EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet4h2>(
1517 const Packet4h2& a) {
1518 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1519 return predux_mul(pmul(pmul(a_alias[0], a_alias[1]),
1520 pmul(a_alias[2], a_alias[3])));
1521}
1522
1523template <>
1524EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1525plog1p<Packet4h2>(const Packet4h2& a) {
1526 Packet4h2 r;
1527 half2* r_alias = reinterpret_cast<half2*>(&r);
1528 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1529 r_alias[0] = plog1p(a_alias[0]);
1530 r_alias[1] = plog1p(a_alias[1]);
1531 r_alias[2] = plog1p(a_alias[2]);
1532 r_alias[3] = plog1p(a_alias[3]);
1533 return r;
1534}
1535
1536template <>
1537EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1538pexpm1<Packet4h2>(const Packet4h2& a) {
1539 Packet4h2 r;
1540 half2* r_alias = reinterpret_cast<half2*>(&r);
1541 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1542 r_alias[0] = pexpm1(a_alias[0]);
1543 r_alias[1] = pexpm1(a_alias[1]);
1544 r_alias[2] = pexpm1(a_alias[2]);
1545 r_alias[3] = pexpm1(a_alias[3]);
1546 return r;
1547}
1548
1549template <>
1550EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 plog<Packet4h2>(const Packet4h2& a) {
1551 Packet4h2 r;
1552 half2* r_alias = reinterpret_cast<half2*>(&r);
1553 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1554 r_alias[0] = plog(a_alias[0]);
1555 r_alias[1] = plog(a_alias[1]);
1556 r_alias[2] = plog(a_alias[2]);
1557 r_alias[3] = plog(a_alias[3]);
1558 return r;
1559}
1560
1561template <>
1562EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 pexp<Packet4h2>(const Packet4h2& a) {
1563 Packet4h2 r;
1564 half2* r_alias = reinterpret_cast<half2*>(&r);
1565 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1566 r_alias[0] = pexp(a_alias[0]);
1567 r_alias[1] = pexp(a_alias[1]);
1568 r_alias[2] = pexp(a_alias[2]);
1569 r_alias[3] = pexp(a_alias[3]);
1570 return r;
1571}
1572
1573template <>
1574EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2 psqrt<Packet4h2>(const Packet4h2& a) {
1575 Packet4h2 r;
1576 half2* r_alias = reinterpret_cast<half2*>(&r);
1577 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1578 r_alias[0] = psqrt(a_alias[0]);
1579 r_alias[1] = psqrt(a_alias[1]);
1580 r_alias[2] = psqrt(a_alias[2]);
1581 r_alias[3] = psqrt(a_alias[3]);
1582 return r;
1583}
1584
1585template <>
1586EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet4h2
1587prsqrt<Packet4h2>(const Packet4h2& a) {
1588 Packet4h2 r;
1589 half2* r_alias = reinterpret_cast<half2*>(&r);
1590 const half2* a_alias = reinterpret_cast<const half2*>(&a);
1591 r_alias[0] = prsqrt(a_alias[0]);
1592 r_alias[1] = prsqrt(a_alias[1]);
1593 r_alias[2] = prsqrt(a_alias[2]);
1594 r_alias[3] = prsqrt(a_alias[3]);
1595 return r;
1596}
1597
1598// The following specialized padd, pmul, pdiv, pmin, pmax, pset1 are needed for
1599// the implementation of GPU half reduction.
1600template<>
1601EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a,
1602 const half2& b) {
1603#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1604 return __hadd2(a, b);
1605#else
1606 float a1 = __low2float(a);
1607 float a2 = __high2float(a);
1608 float b1 = __low2float(b);
1609 float b2 = __high2float(b);
1610 float r1 = a1 + b1;
1611 float r2 = a2 + b2;
1612 return __floats2half2_rn(r1, r2);
1613#endif
1614}
1615
1616template<>
1617EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a,
1618 const half2& b) {
1619#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1620 return __hmul2(a, b);
1621#else
1622 float a1 = __low2float(a);
1623 float a2 = __high2float(a);
1624 float b1 = __low2float(b);
1625 float b2 = __high2float(b);
1626 float r1 = a1 * b1;
1627 float r2 = a2 * b2;
1628 return __floats2half2_rn(r1, r2);
1629#endif
1630}
1631
1632template<>
1633EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a,
1634 const half2& b) {
1635#if defined(EIGEN_GPU_HAS_FP16_ARITHMETIC)
1636 return __h2div(a, b);
1637#else
1638 float a1 = __low2float(a);
1639 float a2 = __high2float(a);
1640 float b1 = __low2float(b);
1641 float b2 = __high2float(b);
1642 float r1 = a1 / b1;
1643 float r2 = a2 / b2;
1644 return __floats2half2_rn(r1, r2);
1645#endif
1646}
1647
1648template<>
1649EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a,
1650 const half2& b) {
1651 float a1 = __low2float(a);
1652 float a2 = __high2float(a);
1653 float b1 = __low2float(b);
1654 float b2 = __high2float(b);
1655 __half r1 = a1 < b1 ? get_half2_low(a) : get_half2_low(b);
1656 __half r2 = a2 < b2 ? get_half2_high(a) : get_half2_high(b);
1657 return combine_half(r1, r2);
1658}
1659
1660template<>
1661EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a,
1662 const half2& b) {
1663 float a1 = __low2float(a);
1664 float a2 = __high2float(a);
1665 float b1 = __low2float(b);
1666 float b2 = __high2float(b);
1667 __half r1 = a1 > b1 ? get_half2_low(a) : get_half2_low(b);
1668 __half r2 = a2 > b2 ? get_half2_high(a) : get_half2_high(b);
1669 return combine_half(r1, r2);
1670}
1671
1672// #endif // defined(EIGEN_CUDA_ARCH) || defined(EIGEN_HIPCC) || (defined(EIGEN_CUDACC) && EIGEN_COMP_CLANG && !EIGEN_COMP_NVCC)
1673
1674#endif // defined(EIGEN_HAS_CUDA_FP16) || defined(EIGEN_HAS_HIP_FP16)
1675
1676#undef EIGEN_GPU_HAS_LDG
1677#undef EIGEN_CUDA_HAS_FP16_ARITHMETIC
1678#undef EIGEN_GPU_HAS_FP16_ARITHMETIC
1679
1680} // end namespace internal
1681
1682} // end namespace Eigen
1683
1684
1685#endif // EIGEN_PACKET_MATH_GPU_H
@ Aligned16
Definition: Constants.h:235
Namespace containing all symbols from the Eigen library.
Definition: Core:141
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index
The Index type as used for the API.
Definition: Meta.h:74