Please help us get to know our user community better by answering the following short survey: https://forms.gle/wpyrxWi18ox9Z5ae9
Eigen  3.4.0
 
Loading...
Searching...
No Matches
GeneralBlockPanelKernel.h
1namespace Eigen {
2namespace internal {
3
4#if EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
5
6// Clang seems to excessively spill registers in the GEBP kernel on 32-bit arm.
7// Here we specialize gebp_traits to eliminate these register spills.
8// See #2138.
9template<>
10struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
11 : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
12{
13 EIGEN_STRONG_INLINE void acc(const AccPacket& c, const ResPacket& alpha, ResPacket& r) const
14 {
15 // This volatile inline ASM both acts as a barrier to prevent reordering,
16 // as well as enforces strict register use.
17 asm volatile(
18 "vmla.f32 %q[r], %q[c], %q[alpha]"
19 : [r] "+w" (r)
20 : [c] "w" (c),
21 [alpha] "w" (alpha)
22 : );
23 }
24
25 template <typename LaneIdType>
26 EIGEN_STRONG_INLINE void madd(const Packet4f& a, const Packet4f& b,
27 Packet4f& c, Packet4f& tmp,
28 const LaneIdType&) const {
29 acc(a, b, c);
30 }
31
32 template <typename LaneIdType>
33 EIGEN_STRONG_INLINE void madd(const Packet4f& a, const QuadPacket<Packet4f>& b,
34 Packet4f& c, Packet4f& tmp,
35 const LaneIdType& lane) const {
36 madd(a, b.get(lane), c, tmp, lane);
37 }
38};
39
40#endif // EIGEN_ARCH_ARM && EIGEN_COMP_CLANG
41
42#if EIGEN_ARCH_ARM64
43
44template<>
45struct gebp_traits <float,float,false,false,Architecture::NEON,GEBPPacketFull>
46 : gebp_traits<float,float,false,false,Architecture::Generic,GEBPPacketFull>
47{
48 typedef float RhsPacket;
49 typedef float32x4_t RhsPacketx4;
50
51 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
52 {
53 dest = *b;
54 }
55
56 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
57 {
58 dest = vld1q_f32(b);
59 }
60
61 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
62 {
63 dest = *b;
64 }
65
66 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
67 {}
68
69 EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
70 {
71 loadRhs(b,dest);
72 }
73
74 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
75 {
76 c = vfmaq_n_f32(c, a, b);
77 }
78
79 // NOTE: Template parameter inference failed when compiled with Android NDK:
80 // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
81
82 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
83 { madd_helper<0>(a, b, c); }
84 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
85 { madd_helper<1>(a, b, c); }
86 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
87 { madd_helper<2>(a, b, c); }
88 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
89 { madd_helper<3>(a, b, c); }
90
91 private:
92 template<int LaneID>
93 EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
94 {
95 #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
96 // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
97 // vfmaq_laneq_f32 is implemented through a costly dup
98 if(LaneID==0) asm("fmla %0.4s, %1.4s, %2.s[0]\n" : "+w" (c) : "w" (a), "w" (b) : );
99 else if(LaneID==1) asm("fmla %0.4s, %1.4s, %2.s[1]\n" : "+w" (c) : "w" (a), "w" (b) : );
100 else if(LaneID==2) asm("fmla %0.4s, %1.4s, %2.s[2]\n" : "+w" (c) : "w" (a), "w" (b) : );
101 else if(LaneID==3) asm("fmla %0.4s, %1.4s, %2.s[3]\n" : "+w" (c) : "w" (a), "w" (b) : );
102 #else
103 c = vfmaq_laneq_f32(c, a, b, LaneID);
104 #endif
105 }
106};
107
108
109template<>
110struct gebp_traits <double,double,false,false,Architecture::NEON>
111 : gebp_traits<double,double,false,false,Architecture::Generic>
112{
113 typedef double RhsPacket;
114
115 struct RhsPacketx4 {
116 float64x2_t B_0, B_1;
117 };
118
119 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacket& dest) const
120 {
121 dest = *b;
122 }
123
124 EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, RhsPacketx4& dest) const
125 {
126 dest.B_0 = vld1q_f64(b);
127 dest.B_1 = vld1q_f64(b+2);
128 }
129
130 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar* b, RhsPacket& dest) const
131 {
132 loadRhs(b,dest);
133 }
134
135 EIGEN_STRONG_INLINE void updateRhs(const RhsScalar*, RhsPacketx4&) const
136 {}
137
138 EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, RhsPacket& dest) const
139 {
140 loadRhs(b,dest);
141 }
142
143 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacket& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
144 {
145 c = vfmaq_n_f64(c, a, b);
146 }
147
148 // NOTE: Template parameter inference failed when compiled with Android NDK:
149 // "candidate template ignored: could not match 'FixedInt<N>' against 'Eigen::internal::FixedInt<0>".
150
151 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<0>&) const
152 { madd_helper<0>(a, b, c); }
153 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<1>&) const
154 { madd_helper<1>(a, b, c); }
155 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<2>&) const
156 { madd_helper<2>(a, b, c); }
157 EIGEN_STRONG_INLINE void madd(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c, RhsPacket& /*tmp*/, const FixedInt<3>&) const
158 { madd_helper<3>(a, b, c); }
159
160 private:
161 template <int LaneID>
162 EIGEN_STRONG_INLINE void madd_helper(const LhsPacket& a, const RhsPacketx4& b, AccPacket& c) const
163 {
164 #if EIGEN_COMP_GNUC_STRICT && !(EIGEN_GNUC_AT_LEAST(9,0))
165 // workaround gcc issue https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89101
166 // vfmaq_laneq_f64 is implemented through a costly dup
167 if(LaneID==0) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
168 else if(LaneID==1) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_0) : );
169 else if(LaneID==2) asm("fmla %0.2d, %1.2d, %2.d[0]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
170 else if(LaneID==3) asm("fmla %0.2d, %1.2d, %2.d[1]\n" : "+w" (c) : "w" (a), "w" (b.B_1) : );
171 #else
172 if(LaneID==0) c = vfmaq_laneq_f64(c, a, b.B_0, 0);
173 else if(LaneID==1) c = vfmaq_laneq_f64(c, a, b.B_0, 1);
174 else if(LaneID==2) c = vfmaq_laneq_f64(c, a, b.B_1, 0);
175 else if(LaneID==3) c = vfmaq_laneq_f64(c, a, b.B_1, 1);
176 #endif
177 }
178};
179
180#endif // EIGEN_ARCH_ARM64
181
182} // namespace internal
183} // namespace Eigen
Namespace containing all symbols from the Eigen library.
Definition: Core:141