TensorBroadcasting.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
11#define EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H
12
13namespace Eigen {
14
22namespace internal {
23template<typename Broadcast, typename XprType>
24struct traits<TensorBroadcastingOp<Broadcast, XprType> > : public traits<XprType>
25{
26 typedef typename XprType::Scalar Scalar;
27 typedef traits<XprType> XprTraits;
28 typedef typename XprTraits::StorageKind StorageKind;
29 typedef typename XprTraits::Index Index;
30 typedef typename XprType::Nested Nested;
31 typedef typename remove_reference<Nested>::type _Nested;
32 static const int NumDimensions = XprTraits::NumDimensions;
33 static const int Layout = XprTraits::Layout;
34 typedef typename XprTraits::PointerType PointerType;
35};
36
37template<typename Broadcast, typename XprType>
38struct eval<TensorBroadcastingOp<Broadcast, XprType>, Eigen::Dense>
39{
40 typedef const TensorBroadcastingOp<Broadcast, XprType> EIGEN_DEVICE_REF type;
41};
42
43template<typename Broadcast, typename XprType>
44struct nested<TensorBroadcastingOp<Broadcast, XprType>, 1, typename eval<TensorBroadcastingOp<Broadcast, XprType> >::type>
45{
46 typedef TensorBroadcastingOp<Broadcast, XprType> type;
47};
48
49template <typename Dims>
50struct is_input_scalar {
51 static const bool value = false;
52};
53template <>
54struct is_input_scalar<Sizes<> > {
55 static const bool value = true;
56};
57#ifndef EIGEN_EMULATE_CXX11_META_H
58template <typename std::ptrdiff_t... Indices>
59struct is_input_scalar<Sizes<Indices...> > {
60 static const bool value = (Sizes<Indices...>::total_size == 1);
61};
62#endif
63
64} // end namespace internal
65
66
67
68template<typename Broadcast, typename XprType>
69class TensorBroadcastingOp : public TensorBase<TensorBroadcastingOp<Broadcast, XprType>, ReadOnlyAccessors>
70{
71 public:
72 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Scalar Scalar;
73 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
74 typedef typename XprType::CoeffReturnType CoeffReturnType;
75 typedef typename Eigen::internal::nested<TensorBroadcastingOp>::type Nested;
76 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::StorageKind StorageKind;
77 typedef typename Eigen::internal::traits<TensorBroadcastingOp>::Index Index;
78
79 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBroadcastingOp(const XprType& expr, const Broadcast& broadcast)
80 : m_xpr(expr), m_broadcast(broadcast) {}
81
82 EIGEN_DEVICE_FUNC
83 const Broadcast& broadcast() const { return m_broadcast; }
84
85 EIGEN_DEVICE_FUNC
86 const typename internal::remove_all<typename XprType::Nested>::type&
87 expression() const { return m_xpr; }
88
89 protected:
90 typename XprType::Nested m_xpr;
91 const Broadcast m_broadcast;
92};
93
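// A minimal usage sketch (illustrative comment added for exposition; assumes
// the unsupported Eigen Tensor module API, e.g. TensorBase::broadcast):
// broadcasting multiplies each dimension of the input expression by the
// corresponding factor, so a 2x3 tensor broadcast by {2, 4} evaluates to 4x12.
//
//   Eigen::Tensor<float, 2> t(2, 3);
//   t.setRandom();
//   Eigen::array<Eigen::Index, 2> bcast{{2, 4}};
//   Eigen::Tensor<float, 2> b = t.broadcast(bcast);  // dimensions (4, 12)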
94
95// Eval as rvalue
96template<typename Broadcast, typename ArgType, typename Device>
97struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device>
98{
99 typedef TensorBroadcastingOp<Broadcast, ArgType> XprType;
100 typedef typename XprType::Index Index;
101 static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value;
102 typedef DSizes<Index, NumDims> Dimensions;
103 typedef typename XprType::Scalar Scalar;
104 typedef typename TensorEvaluator<ArgType, Device>::Dimensions InputDimensions;
105 typedef typename XprType::CoeffReturnType CoeffReturnType;
106 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
107 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
108 protected: // all the non-static fields must have the same access control, otherwise the TensorEvaluator won't be standard layout.
109 bool isCopy, nByOne, oneByN;
110 public:
111 typedef StorageMemory<CoeffReturnType, Device> Storage;
112 typedef typename Storage::Type EvaluatorPointerType;
113
114 enum {
115 IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
116 PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
117 BlockAccess = TensorEvaluator<ArgType, Device>::BlockAccess,
118 PreferBlockAccess = true,
119 Layout = TensorEvaluator<ArgType, Device>::Layout,
120 RawAccess = false
121 };
122
123 typedef typename internal::remove_const<Scalar>::type ScalarNoConst;
124
125 // We do block-based broadcasting using a trick with 2x tensor rank and 0
126 // strides. See the block() method implementation for details.
127 typedef DSizes<Index, 2 * NumDims> BroadcastDimensions;
128
129 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
130 typedef internal::TensorBlockDescriptor<NumDims, Index> TensorBlockDesc;
131 typedef internal::TensorBlockScratchAllocator<Device> TensorBlockScratch;
132
133 typedef typename TensorEvaluator<const ArgType, Device>::TensorBlock
134 ArgTensorBlock;
135
136 typedef typename internal::TensorMaterializedBlock<ScalarNoConst, NumDims,
137 Layout, Index>
138 TensorBlock;
139 //===--------------------------------------------------------------------===//
140
141 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
142 : isCopy(false), nByOne(false), oneByN(false),
143 m_device(device), m_broadcast(op.broadcast()), m_impl(op.expression(), device)
144 {
145
146 // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar
147 // and store the result in a scalar. Instead one should reshape the scalar into an N-D
148 // tensor (N >= 1) with a single element first and then broadcast.
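// Illustrative sketch (comment added for exposition; assumes the unsupported
// Eigen Tensor module API, e.g. TensorBase::reshape and TensorBase::broadcast):
// reshape a rank-0 tensor to rank 1 before broadcasting it.
//
//   Eigen::Tensor<float, 0> s;
//   s() = 3.0f;
//   Eigen::array<Eigen::Index, 1> one_d{{1}}, bcast{{5}};
//   auto copies = s.reshape(one_d).broadcast(bcast);  // rank-1, 5 copies of 3.0f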
149 EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE);
150 const InputDimensions& input_dims = m_impl.dimensions();
151 isCopy = true;
152 for (int i = 0; i < NumDims; ++i) {
153 eigen_assert(input_dims[i] > 0);
154 m_dimensions[i] = input_dims[i] * m_broadcast[i];
155 if (m_broadcast[i] != 1) {
156 isCopy = false;
157 }
158 }
159
160 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
161 m_inputStrides[0] = 1;
162 m_outputStrides[0] = 1;
163 for (int i = 1; i < NumDims; ++i) {
164 m_inputStrides[i] = m_inputStrides[i-1] * input_dims[i-1];
165 m_outputStrides[i] = m_outputStrides[i-1] * m_dimensions[i-1];
166 }
167 } else {
168 m_inputStrides[NumDims-1] = 1;
169 m_outputStrides[NumDims-1] = 1;
170 for (int i = NumDims-2; i >= 0; --i) {
171 m_inputStrides[i] = m_inputStrides[i+1] * input_dims[i+1];
172 m_outputStrides[i] = m_outputStrides[i+1] * m_dimensions[i+1];
173 }
174 }
175
176 if (input_dims[0] == 1) {
177 oneByN = true;
178 for (int i = 1; i < NumDims; ++i) {
179 if (m_broadcast[i] != 1) {
180 oneByN = false;
181 break;
182 }
183 }
184 } else if (input_dims[NumDims-1] == 1) {
185 nByOne = true;
186 for (int i = 0; i < NumDims-1; ++i) {
187 if (m_broadcast[i] != 1) {
188 nByOne = false;
189 break;
190 }
191 }
192 }
193
194 // Handle special formats like NCHW, whose input shape is '[1, N..., 1]' and
195 // broadcast shape is '[N, 1..., N]'.
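// For example (illustrative): input dims [1, 64, 1] with broadcast factors
// [32, 1, 7] give output dims [32, 64, 7]; both oneByN and nByOne are set
// below, and the packet path then uses packetOneByNByOne().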
196 if (!oneByN && !nByOne) {
197 if (input_dims[0] == 1 && input_dims[NumDims-1] == 1 && NumDims > 2) {
198 nByOne = true;
199 oneByN = true;
200 for (int i = 1; i < NumDims-1; ++i) {
201 if (m_broadcast[i] != 1) {
202 nByOne = false;
203 oneByN = false;
204 break;
205 }
206 }
207 }
208 }
209 }
210
211 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
212
213 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType) {
214 m_impl.evalSubExprsIfNeeded(NULL);
215 return true;
216 }
217
218#ifdef EIGEN_USE_THREADS
219 template <typename EvalSubExprsCallback>
220 EIGEN_STRONG_INLINE void evalSubExprsIfNeededAsync(
221 EvaluatorPointerType, EvalSubExprsCallback done) {
222 m_impl.evalSubExprsIfNeededAsync(nullptr, [done](bool) { done(true); });
223 }
224#endif // EIGEN_USE_THREADS
225
226 EIGEN_STRONG_INLINE void cleanup() {
227 m_impl.cleanup();
228 }
229
230 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE CoeffReturnType coeff(Index index) const
231 {
232 if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
233 return m_impl.coeff(0);
234 }
235
236 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
237 if (isCopy) {
238 return m_impl.coeff(index);
239 } else {
240 return coeffColMajor(index);
241 }
242 } else {
243 if (isCopy) {
244 return m_impl.coeff(index);
245 } else {
246 return coeffRowMajor(index);
247 }
248 }
249 }
250
251 // TODO: attempt to speed this up. The integer divisions and modulo are slow
252 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexColMajor(Index index) const {
253 Index inputIndex = 0;
254 EIGEN_UNROLL_LOOP
255 for (int i = NumDims - 1; i > 0; --i) {
256 const Index idx = index / m_outputStrides[i];
257 if (internal::index_statically_eq<Broadcast>(i, 1)) {
258 eigen_assert(idx < m_impl.dimensions()[i]);
259 inputIndex += idx * m_inputStrides[i];
260 } else {
261 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
262 eigen_assert(idx % m_impl.dimensions()[i] == 0);
263 } else {
264 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
265 }
266 }
267 index -= idx * m_outputStrides[i];
268 }
269 if (internal::index_statically_eq<Broadcast>(0, 1)) {
270 eigen_assert(index < m_impl.dimensions()[0]);
271 inputIndex += index;
272 } else {
273 if (internal::index_statically_eq<InputDimensions>(0, 1)) {
274 eigen_assert(index % m_impl.dimensions()[0] == 0);
275 } else {
276 inputIndex += (index % m_impl.dimensions()[0]);
277 }
278 }
279 return inputIndex;
280 }
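// Worked example (illustrative) for indexColMajor: with input dims [2, 3] and
// broadcast factors [2, 1], the output dims are [4, 3], the output strides
// [1, 4] and the input strides [1, 2]. Output index 6 is coordinate (2, 1),
// which wraps to input coordinate (2 % 2, 1 % 3) == (0, 1), i.e. input index 2.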
281
282 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffColMajor(Index index) const
283 {
284 return m_impl.coeff(indexColMajor(index));
285 }
286
287 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index indexRowMajor(Index index) const {
288 Index inputIndex = 0;
289 EIGEN_UNROLL_LOOP
290 for (int i = 0; i < NumDims - 1; ++i) {
291 const Index idx = index / m_outputStrides[i];
292 if (internal::index_statically_eq<Broadcast>(i, 1)) {
293 eigen_assert(idx < m_impl.dimensions()[i]);
294 inputIndex += idx * m_inputStrides[i];
295 } else {
296 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
297 eigen_assert(idx % m_impl.dimensions()[i] == 0);
298 } else {
299 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
300 }
301 }
302 index -= idx * m_outputStrides[i];
303 }
304 if (internal::index_statically_eq<Broadcast>(NumDims - 1, 1)) {
305 eigen_assert(index < m_impl.dimensions()[NumDims - 1]);
306 inputIndex += index;
307 } else {
308 if (internal::index_statically_eq<InputDimensions>(NumDims - 1, 1)) {
309 eigen_assert(index % m_impl.dimensions()[NumDims - 1] == 0);
310 } else {
311 inputIndex += (index % m_impl.dimensions()[NumDims - 1]);
312 }
313 }
314 return inputIndex;
315 }
316
317 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeffRowMajor(Index index) const
318 {
319 return m_impl.coeff(indexRowMajor(index));
320 }
321
322 template<int LoadMode>
323 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketReturnType packet(Index index) const
324 {
325 if (internal::is_input_scalar<typename internal::remove_all<InputDimensions>::type>::value) {
326 return internal::pset1<PacketReturnType>(m_impl.coeff(0));
327 }
328
329 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
330 if (isCopy) {
331 #ifdef EIGEN_GPU_COMPILE_PHASE
332 // See PR 437: on NVIDIA P100 and K20m we observed a 3-4x speedup by enforcing
333 // unaligned loads here. The reason is unclear though.
334 return m_impl.template packet<Unaligned>(index);
335 #else
336 return m_impl.template packet<LoadMode>(index);
337 #endif
338 } else if (oneByN && !nByOne) {
339 return packetNByOne<LoadMode>(index);
340 } else if (!oneByN && nByOne) {
341 return packetOneByN<LoadMode>(index);
342 } else if (oneByN && nByOne) {
343 return packetOneByNByOne<LoadMode>(index);
344 } else {
345 return packetColMajor<LoadMode>(index);
346 }
347 } else {
348 if (isCopy) {
349 #ifdef EIGEN_GPU_COMPILE_PHASE
350 // See above.
351 return m_impl.template packet<Unaligned>(index);
352 #else
353 return m_impl.template packet<LoadMode>(index);
354 #endif
355 } else if (oneByN && !nByOne) {
356 return packetOneByN<LoadMode>(index);
357 } else if (!oneByN && nByOne) {
358 return packetNByOne<LoadMode>(index);
359 } else if (oneByN && nByOne) {
360 return packetOneByNByOne<LoadMode>(index);
361 } else {
362 return packetRowMajor<LoadMode>(index);
363 }
364 }
365 }
366
367 template<int LoadMode>
368 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByNByOne
369 (Index index) const
370 {
371 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
372 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
373
374 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
375 Index startDim, endDim;
376 Index inputIndex, outputOffset, batchedIndex;
377
378 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
379 startDim = NumDims - 1;
380 endDim = 1;
381 } else {
382 startDim = 0;
383 endDim = NumDims - 2;
384 }
385
386 batchedIndex = index % m_outputStrides[startDim];
387 inputIndex = batchedIndex / m_outputStrides[endDim];
388 outputOffset = batchedIndex % m_outputStrides[endDim];
389
390 if (outputOffset + PacketSize <= m_outputStrides[endDim]) {
391 values[0] = m_impl.coeff(inputIndex);
392 return internal::pload1<PacketReturnType>(values);
393 } else {
394 EIGEN_UNROLL_LOOP
395 for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
396 if (outputOffset + cur < m_outputStrides[endDim]) {
397 values[i] = m_impl.coeff(inputIndex);
398 } else {
399 ++inputIndex;
400 inputIndex = (inputIndex == m_inputStrides[startDim] ? 0 : inputIndex);
401 values[i] = m_impl.coeff(inputIndex);
402 outputOffset = 0;
403 cur = 0;
404 }
405 }
406 return internal::pload<PacketReturnType>(values);
407 }
408 }
409
410 template<int LoadMode>
411 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetOneByN(Index index) const
412 {
413 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
414 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
415
416 Index dim, inputIndex;
417
418 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
419 dim = NumDims - 1;
420 } else {
421 dim = 0;
422 }
423
424 inputIndex = index % m_inputStrides[dim];
425 if (inputIndex + PacketSize <= m_inputStrides[dim]) {
426 return m_impl.template packet<Unaligned>(inputIndex);
427 } else {
428 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
429 EIGEN_UNROLL_LOOP
430 for (int i = 0; i < PacketSize; ++i) {
431 if (inputIndex > m_inputStrides[dim]-1) {
432 inputIndex = 0;
433 }
434 values[i] = m_impl.coeff(inputIndex++);
435 }
436 return internal::pload<PacketReturnType>(values);
437 }
438 }
439
440 template<int LoadMode>
441 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetNByOne(Index index) const
442 {
443 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
444 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
445
446 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
447 Index dim, inputIndex, outputOffset;
448
449 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
450 dim = 1;
451 } else {
452 dim = NumDims - 2;
453 }
454
455 inputIndex = index / m_outputStrides[dim];
456 outputOffset = index % m_outputStrides[dim];
457 if (outputOffset + PacketSize <= m_outputStrides[dim]) {
458 values[0] = m_impl.coeff(inputIndex);
459 return internal::pload1<PacketReturnType>(values);
460 } else {
461 EIGEN_UNROLL_LOOP
462 for (int i = 0, cur = 0; i < PacketSize; ++i, ++cur) {
463 if (outputOffset + cur < m_outputStrides[dim]) {
464 values[i] = m_impl.coeff(inputIndex);
465 } else {
466 values[i] = m_impl.coeff(++inputIndex);
467 outputOffset = 0;
468 cur = 0;
469 }
470 }
471 return internal::pload<PacketReturnType>(values);
472 }
473 }
474
475 // Ignore the LoadMode and always use unaligned loads since we can't guarantee
476 // the alignment at compile time.
477 template<int LoadMode>
478 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const
479 {
480 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
481 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
482
483 const Index originalIndex = index;
484
485 Index inputIndex = 0;
486 EIGEN_UNROLL_LOOP
487 for (int i = NumDims - 1; i > 0; --i) {
488 const Index idx = index / m_outputStrides[i];
489 if (internal::index_statically_eq<Broadcast>(i, 1)) {
490 eigen_assert(idx < m_impl.dimensions()[i]);
491 inputIndex += idx * m_inputStrides[i];
492 } else {
493 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
494 eigen_assert(idx % m_impl.dimensions()[i] == 0);
495 } else {
496 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
497 }
498 }
499 index -= idx * m_outputStrides[i];
500 }
501 Index innermostLoc;
502 if (internal::index_statically_eq<Broadcast>(0, 1)) {
503 eigen_assert(index < m_impl.dimensions()[0]);
504 innermostLoc = index;
505 } else {
506 if (internal::index_statically_eq<InputDimensions>(0, 1)) {
507 eigen_assert(index % m_impl.dimensions()[0] == 0);
508 innermostLoc = 0;
509 } else {
510 innermostLoc = index % m_impl.dimensions()[0];
511 }
512 }
513 inputIndex += innermostLoc;
514
515 // TODO: this could be extended to the second dimension if we're not
516 // broadcasting along the first dimension, and so on.
517 if (innermostLoc + PacketSize <= m_impl.dimensions()[0]) {
518 return m_impl.template packet<Unaligned>(inputIndex);
519 } else {
520 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
521 values[0] = m_impl.coeff(inputIndex);
522 EIGEN_UNROLL_LOOP
523 for (int i = 1; i < PacketSize; ++i) {
524 if (innermostLoc + i < m_impl.dimensions()[0]) {
525 values[i] = m_impl.coeff(inputIndex+i);
526 } else {
527 values[i] = coeffColMajor(originalIndex+i);
528 }
529 }
530 PacketReturnType rslt = internal::pload<PacketReturnType>(values);
531 return rslt;
532 }
533 }
534
535 template<int LoadMode>
536 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const
537 {
538 EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE)
539 eigen_assert(index+PacketSize-1 < dimensions().TotalSize());
540
541 const Index originalIndex = index;
542
543 Index inputIndex = 0;
544 EIGEN_UNROLL_LOOP
545 for (int i = 0; i < NumDims - 1; ++i) {
546 const Index idx = index / m_outputStrides[i];
547 if (internal::index_statically_eq<Broadcast>(i, 1)) {
548 eigen_assert(idx < m_impl.dimensions()[i]);
549 inputIndex += idx * m_inputStrides[i];
550 } else {
551 if (internal::index_statically_eq<InputDimensions>(i, 1)) {
552 eigen_assert(idx % m_impl.dimensions()[i] == 0);
553 } else {
554 inputIndex += (idx % m_impl.dimensions()[i]) * m_inputStrides[i];
555 }
556 }
557 index -= idx * m_outputStrides[i];
558 }
559 Index innermostLoc;
560 if (internal::index_statically_eq<Broadcast>(NumDims-1, 1)) {
561 eigen_assert(index < m_impl.dimensions()[NumDims-1]);
562 innermostLoc = index;
563 } else {
564 if (internal::index_statically_eq<InputDimensions>(NumDims-1, 1)) {
565 eigen_assert(index % m_impl.dimensions()[NumDims-1] == 0);
566 innermostLoc = 0;
567 } else {
568 innermostLoc = index % m_impl.dimensions()[NumDims-1];
569 }
570 }
571 inputIndex += innermostLoc;
572
573 // TODO: this could be extended to the second dimension if we're not
574 // broadcasting along the first dimension, and so on.
575 if (innermostLoc + PacketSize <= m_impl.dimensions()[NumDims-1]) {
576 return m_impl.template packet<Unaligned>(inputIndex);
577 } else {
578 EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize];
579 values[0] = m_impl.coeff(inputIndex);
580 EIGEN_UNROLL_LOOP
581 for (int i = 1; i < PacketSize; ++i) {
582 if (innermostLoc + i < m_impl.dimensions()[NumDims-1]) {
583 values[i] = m_impl.coeff(inputIndex+i);
584 } else {
585 values[i] = coeffRowMajor(originalIndex+i);
586 }
587 }
588 PacketReturnType rslt = internal::pload<PacketReturnType>(values);
589 return rslt;
590 }
591 }
592
593 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
594 costPerCoeff(bool vectorized) const {
595 double compute_cost = TensorOpCost::AddCost<Index>();
596 if (!isCopy && NumDims > 0) {
597 EIGEN_UNROLL_LOOP
598 for (int i = NumDims - 1; i > 0; --i) {
599 compute_cost += TensorOpCost::DivCost<Index>();
600 if (internal::index_statically_eq<Broadcast>(i, 1)) {
601 compute_cost +=
602 TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
603 } else {
604 if (!internal::index_statically_eq<InputDimensions>(i, 1)) {
605 compute_cost += TensorOpCost::MulCost<Index>() +
606 TensorOpCost::ModCost<Index>() +
607 TensorOpCost::AddCost<Index>();
608 }
609 }
610 compute_cost +=
611 TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>();
612 }
613 }
614 return m_impl.costPerCoeff(vectorized) +
615 TensorOpCost(0, 0, compute_cost, vectorized, PacketSize);
616 }
617
618 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
619 internal::TensorBlockResourceRequirements getResourceRequirements() const {
620 // TODO(wuke): Targeting L1 size is 30% faster than targeting L{-1} on large
621 // tensors. But this might need further tuning.
622 const size_t target_size = m_device.firstLevelCacheSize();
623 return internal::TensorBlockResourceRequirements::merge(
624 m_impl.getResourceRequirements(),
625 internal::TensorBlockResourceRequirements::skewed<Scalar>(target_size));
626 }
627
628 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock
629 block(TensorBlockDesc& desc, TensorBlockScratch& scratch,
630 bool /*root_of_expr_ast*/ = false) const {
631 BlockBroadcastingParams params = blockBroadcastingParams(desc);
632
633 if (params.inner_dim_size == 0 || params.bcast_dim_size == 0) {
634 return emptyBlock();
635 }
636
637 // Prepare storage for the materialized broadcasting result.
638 const typename TensorBlock::Storage block_storage =
639 TensorBlock::prepareStorage(desc, scratch);
640 ScalarNoConst* materialized_output = block_storage.data();
641
642 // We potentially will need to materialize input blocks.
643 size_t materialized_input_size = 0;
644 ScalarNoConst* materialized_input = NULL;
645
646 // Initialize block broadcasting iterator state for the outer dimensions (outer
647 // with regard to the bcast dimension). Dimensions in this array are always in
648 // inner_most -> outer_most order (col major layout).
649 array<BlockBroadcastingIteratorState, NumDims> it;
650 int idx = 0;
651
652 for (int i = params.inner_dim_count + 1; i < NumDims; ++i) {
653 const Index dim = IsColMajor ? i : NumDims - 1 - i;
654 it[idx].size = params.output_dims[dim];
655 it[idx].count = 0;
656 it[idx].output_stride = m_outputStrides[dim];
657 it[idx].output_span = it[idx].output_stride * (it[idx].size - 1);
658 idx++;
659 }
660
661 // Write output into the beginning of `materialized_output`.
662 Index output_offset = 0;
663
664 // We will fill the output block by broadcasting along the bcast dim and
665 // iterating over the outer dimensions.
666 const Index output_size = NumDims == 0 ? 1 : params.output_dims.TotalSize();
667
668 for (Index num_output_coeffs = 0; num_output_coeffs < output_size;) {
669 ScalarNoConst* bcast_output = materialized_output + num_output_coeffs;
670 Index bcast_offset = desc.offset() + output_offset;
671
672 // Broadcast along the bcast dimension.
673 num_output_coeffs += BroadcastBlockAlongBcastDim(
674 params, bcast_offset, scratch, bcast_output, &materialized_input,
675 &materialized_input_size);
676
677 // Switch to the next outer dimension.
678 for (int j = 0; j < idx; ++j) {
679 if (++it[j].count < it[j].size) {
680 output_offset += it[j].output_stride;
681 break;
682 }
683 it[j].count = 0;
684 output_offset -= it[j].output_span;
685 }
686 }
687
688 return block_storage.AsTensorMaterializedBlock();
689 }
690
691 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
692
693 const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; }
694
695 Broadcast functor() const { return m_broadcast; }
696#ifdef EIGEN_USE_SYCL
697 // binding placeholder accessors to a command group handler for SYCL
698 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(
699 cl::sycl::handler& cgh) const {
700 m_impl.bind(cgh);
701 }
702#endif
703 private:
704 static const bool IsColMajor =
705 static_cast<int>(Layout) == static_cast<int>(ColMajor);
706
707 // We will build the general case block broadcasting on top of a broadcasting
708 // primitive that does broadcasting only for the inner dimension(s) along
709 // the first dimension smaller than the input size (it's called `bcast_dim`).
710 //
711 // Example:
712 // dim: 0 1 2 (ColMajor)
713 // input size: [9, 3, 6]
714 // block size: [9, 2, 6]
715 //
716 // We will compute broadcasted block by iterating over the outer dimensions
717 // before `bcast_dim` (only dimension `2` in this example) and computing
718 // broadcasts along the `bcast_dim` (dimension `1` in this example).
719
720 // BlockBroadcastingParams holds precomputed parameters for broadcasting a
721 // single block along the broadcasting dimension. Sizes and strides along the
722 // `bcast_dim` might be invalid; they will be adjusted later in
723 // `BroadcastBlockAlongBcastDim`.
724 struct BlockBroadcastingParams {
725 Dimensions input_dims; // input expression dimensions
726 Dimensions output_dims; // output block sizes
727 Dimensions output_strides; // output block strides
728
729 int inner_dim_count; // count inner dimensions matching in size
730 int bcast_dim; // broadcasting dimension index
731 Index bcast_dim_size; // broadcasting dimension size
732 Index inner_dim_size; // inner dimensions size
733
734 // Block sizes and strides for the input block where all dimensions before
735 // `bcast_dim` are equal to `1`.
736 Dimensions input_block_sizes;
737 Dimensions input_block_strides;
738
739 // Block sizes and strides for blocks with extra dimensions and strides `0`.
740 BroadcastDimensions bcast_block_sizes;
741 BroadcastDimensions bcast_block_strides;
742 BroadcastDimensions bcast_input_strides;
743 };
744
745 struct BlockBroadcastingIteratorState {
746 Index size;
747 Index count;
748 Index output_stride;
749 Index output_span;
750 };
751
752 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlockBroadcastingParams
753 blockBroadcastingParams(TensorBlockDesc& desc) const {
754 BlockBroadcastingParams params;
755
756 params.input_dims = Dimensions(m_impl.dimensions());
757
758 // Output block sizes and strides.
759 params.output_dims = desc.dimensions();
760 params.output_strides = internal::strides<Layout>(params.output_dims);
761
762 // Find the broadcasting dimension (first dimension with output size smaller
763 // than the input size).
764 params.bcast_dim = 0;
765 params.bcast_dim_size = 1;
766 params.inner_dim_size = 1;
767
768 // Count the number of inner dimensions that have the same size in the block
769 // and in the broadcast expression.
770 params.inner_dim_count = 0;
771
772 for (int i = 0; i < NumDims; ++i) {
773 const int dim = IsColMajor ? i : NumDims - i - 1;
774
775 if (params.output_dims[dim] == m_dimensions[dim]) {
776 params.inner_dim_size *= params.output_dims[dim];
777 ++params.inner_dim_count;
778 continue;
779 }
780
781 // First non-matching dimension is the broadcasting dimension.
782 eigen_assert(params.output_dims[dim] < m_dimensions[dim]);
783 params.bcast_dim = dim;
784 params.bcast_dim_size = params.output_dims[dim];
785 break;
786 }
787
788 // Calculate the input block sizes used for reading from the input.
789 for (int i = 0; i < params.inner_dim_count; ++i) {
790 const int dim = IsColMajor ? i : NumDims - i - 1;
791 params.input_block_sizes[dim] = params.input_dims[dim];
792 }
793 for (int i = params.inner_dim_count; i < NumDims; ++i) {
794 const int dim = IsColMajor ? i : NumDims - i - 1;
795 params.input_block_sizes[dim] = 1;
796 }
797 params.input_block_strides =
798 internal::strides<Layout>(params.input_block_sizes);
799
800 // Broadcast with the 0-stride trick: create one extra dim for each
801 // broadcasted dim and set its input stride to 0.
802 //
803 // When ColMajor:
804 //
805 // - bcast_block_sizes:
806 // [d_0, b_0, d_1, b_1, ...]
807 //
808 // - bcast_block_strides:
809 // [output_block_strides[0], output_block_strides[0] * d_0,
810 // output_block_strides[1], output_block_strides[1] * d_1,
811 // ...]
812 //
813 // - bcast_input_strides:
814 // [input_block_strides[0], 0,
815 // input_block_strides[1], 0,
816 // ...].
817 //
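// Concrete instance (illustrative, assuming both dimensions are fully matching
// inner dimensions): for ColMajor input block sizes [2, 3] and broadcast
// factors [4, 5] (output block [8, 15]), the loop below produces
//   bcast_block_sizes   = [2, 4, 3, 5]
//   bcast_block_strides = [1, 2, 8, 24]
//   bcast_input_strides = [1, 0, 2, 0]
// so each input coefficient is re-read with stride 0 along the interleaved
// broadcast dimensions.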
818 for (int i = 0; i < params.inner_dim_count; ++i) {
819 const int dim = IsColMajor ? i : NumDims - i - 1;
820
821 const int copy_dim = IsColMajor ? 2 * i : 2 * NumDims - 2 * i - 1;
822 const int broadcast_dim = IsColMajor ? copy_dim + 1 : copy_dim - 1;
823
824 params.bcast_block_sizes[copy_dim] = params.input_dims[dim];
825 params.bcast_block_sizes[broadcast_dim] = m_broadcast[dim];
826 params.bcast_block_strides[copy_dim] = params.output_strides[dim];
827 params.bcast_block_strides[broadcast_dim] =
828 params.output_strides[dim] * params.input_dims[dim];
829 params.bcast_input_strides[copy_dim] = params.input_block_strides[dim];
830 params.bcast_input_strides[broadcast_dim] = 0;
831 }
832
833 for (int i = 2 * params.inner_dim_count; i < 2 * NumDims; ++i) {
834 const int dim = IsColMajor ? i : 2 * NumDims - i - 1;
835 params.bcast_block_sizes[dim] = 1;
836 params.bcast_block_strides[dim] = 0;
837 params.bcast_input_strides[dim] = 0;
838 }
839
840 return params;
841 }
842
843 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorBlock emptyBlock() const {
844 DSizes<Index, NumDims> dimensions;
845 for (int i = 0; i < NumDims; ++i) dimensions[i] = 0;
846 return TensorBlock(internal::TensorBlockKind::kView, NULL, dimensions);
847 }
848
849 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlockAlongBcastDim(
850 BlockBroadcastingParams params, Index bcast_offset,
851 TensorBlockScratch& scratch, ScalarNoConst* materialized_output,
852 ScalarNoConst** materialized_input,
853 size_t* materialized_input_size) const {
854 if (params.bcast_dim_size == 1) {
855 // We just need one block read using the values already set up above.
856 return BroadcastBlock(
857 params.input_block_sizes, params.input_block_strides,
858 params.bcast_block_sizes, params.bcast_block_strides,
859 params.bcast_input_strides, bcast_offset, 0, scratch,
860 materialized_output, materialized_input, materialized_input_size);
861
862 } else if (params.input_dims[params.bcast_dim] == 1) {
863 // Broadcast bcast dimension (< NumDims) by bcast_dim_size.
864 const int broadcast_bcast_dim =
865 IsColMajor ? 2 * params.inner_dim_count + 1
866 : 2 * NumDims - 2 * params.inner_dim_count - 2;
867
868 params.bcast_block_sizes[broadcast_bcast_dim] = params.bcast_dim_size;
869 params.bcast_input_strides[broadcast_bcast_dim] = 0;
870 params.bcast_block_strides[broadcast_bcast_dim] =
871 params.output_strides[params.bcast_dim];
872
873 return BroadcastBlock(
874 params.input_block_sizes, params.input_block_strides,
875 params.bcast_block_sizes, params.bcast_block_strides,
876 params.bcast_input_strides, bcast_offset, 0, scratch,
877 materialized_output, materialized_input, materialized_input_size);
878
879 } else {
880 // Keep track of the total number of coefficients written to the
881 // output block.
882 Index num_output_coeffs = 0;
883
884 // The general case. Let's denote the output block as
885 //
886 // x[..., a:a+bcast_dim_size, :, ..., :]
887 //
888 // where a:a+bcast_dim_size is a slice on the bcast_dim dimension
889 // (< NumDims). We need to split the a:a+bcast_dim_size into possibly 3
890 // sub-blocks:
891 //
892 // (1) a:b, where b is the smallest multiple of
893 // input_dims[bcast_dim_start] in [a, a+bcast_dim_size].
894 //
895 // (2) b:c, where c is the largest multiple of input_dims[bcast_dim_start]
896 // in [a, a+bcast_dim_size].
897 //
898 // (3) c:a+bcast_dim_size .
899 //
900 // Or, when b and c do not exist, we just need to process the whole block
901 // together.
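// Numeric example (illustrative): with input_dims[bcast_dim] == 4, a == 3 and
// bcast_dim_size == 10, we get b == 4 and c == 12, so the block is processed
// as a head slice 3:4, two full copies covering 4:12, and a tail slice 12:13.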
902
903 // Find a.
904 const Index bcast_dim_left_index =
905 bcast_offset / m_outputStrides[params.bcast_dim];
906
907 // Find b and c.
908 const Index input_bcast_dim_size = params.input_dims[params.bcast_dim];
909
910 // First multiple of input_bcast_dim_size at or after a. This is b when it is
911 // <= bcast_dim_left_index + bcast_dim_size.
912 const Index first_multiple =
913 divup<Index>(bcast_dim_left_index, input_bcast_dim_size) *
914 input_bcast_dim_size;
915
916 if (first_multiple <= bcast_dim_left_index + params.bcast_dim_size) {
917 // b exists, so does c. Find it.
918 const Index last_multiple =
919 (bcast_dim_left_index + params.bcast_dim_size) /
920 input_bcast_dim_size * input_bcast_dim_size;
921 const int copy_bcast_dim =
922 IsColMajor ? 2 * params.inner_dim_count
923 : 2 * NumDims - 2 * params.inner_dim_count - 1;
924 const int broadcast_bcast_dim =
925 IsColMajor ? 2 * params.inner_dim_count + 1
926 : 2 * NumDims - 2 * params.inner_dim_count - 2;
927
928 if (first_multiple > bcast_dim_left_index) {
929 const Index head_size = first_multiple - bcast_dim_left_index;
930 params.input_block_sizes[params.bcast_dim] = head_size;
931 params.bcast_block_sizes[copy_bcast_dim] = head_size;
932 params.bcast_input_strides[copy_bcast_dim] =
933 params.input_block_strides[params.bcast_dim];
934 params.bcast_block_strides[copy_bcast_dim] =
935 params.output_strides[params.bcast_dim];
936 params.bcast_block_sizes[broadcast_bcast_dim] = 1;
937 params.bcast_input_strides[broadcast_bcast_dim] = 0;
938 params.bcast_block_strides[broadcast_bcast_dim] =
939 params.output_strides[params.bcast_dim] *
940 params.input_dims[params.bcast_dim];
941
942 num_output_coeffs += BroadcastBlock(
943 params.input_block_sizes, params.input_block_strides,
944 params.bcast_block_sizes, params.bcast_block_strides,
945 params.bcast_input_strides, bcast_offset, 0, scratch,
946 materialized_output, materialized_input, materialized_input_size);
947 }
948 if (first_multiple < last_multiple) {
949 params.input_block_sizes[params.bcast_dim] = input_bcast_dim_size;
950 params.bcast_block_sizes[copy_bcast_dim] = input_bcast_dim_size;
951 params.bcast_input_strides[copy_bcast_dim] =
952 params.input_block_strides[params.bcast_dim];
953 params.bcast_block_strides[copy_bcast_dim] =
954 params.output_strides[params.bcast_dim];
955 params.bcast_block_sizes[broadcast_bcast_dim] =
956 (last_multiple - first_multiple) / input_bcast_dim_size;
957 params.bcast_input_strides[broadcast_bcast_dim] = 0;
958 params.bcast_block_strides[broadcast_bcast_dim] =
959 params.output_strides[params.bcast_dim] *
960 params.input_dims[params.bcast_dim];
961 const Index offset = (first_multiple - bcast_dim_left_index) *
962 m_outputStrides[params.bcast_dim];
963
964 num_output_coeffs += BroadcastBlock(
965 params.input_block_sizes, params.input_block_strides,
966 params.bcast_block_sizes, params.bcast_block_strides,
967 params.bcast_input_strides, bcast_offset, offset, scratch,
968 materialized_output, materialized_input, materialized_input_size);
969 }
970 if (last_multiple < bcast_dim_left_index + params.bcast_dim_size) {
971 const Index tail_size =
972 bcast_dim_left_index + params.bcast_dim_size - last_multiple;
973 params.input_block_sizes[params.bcast_dim] = tail_size;
974 params.bcast_block_sizes[copy_bcast_dim] = tail_size;
975 params.bcast_input_strides[copy_bcast_dim] =
976 params.input_block_strides[params.bcast_dim];
977 params.bcast_block_strides[copy_bcast_dim] =
978 params.output_strides[params.bcast_dim];
979 params.bcast_block_sizes[broadcast_bcast_dim] = 1;
980 params.bcast_input_strides[broadcast_bcast_dim] = 0;
981 params.bcast_block_strides[broadcast_bcast_dim] =
982 params.output_strides[params.bcast_dim] *
983 params.input_dims[params.bcast_dim];
984 const Index offset = (last_multiple - bcast_dim_left_index) *
985 m_outputStrides[params.bcast_dim];
986
987 num_output_coeffs += BroadcastBlock(
988 params.input_block_sizes, params.input_block_strides,
989 params.bcast_block_sizes, params.bcast_block_strides,
990 params.bcast_input_strides, bcast_offset, offset, scratch,
991 materialized_output, materialized_input, materialized_input_size);
992 }
993 } else {
994 // b and c do not exist.
995 const int copy_bcast_dim =
996 IsColMajor ? 2 * params.inner_dim_count
997 : 2 * NumDims - 2 * params.inner_dim_count - 1;
998 params.input_block_sizes[params.bcast_dim] = params.bcast_dim_size;
999 params.bcast_block_sizes[copy_bcast_dim] = params.bcast_dim_size;
1000 params.bcast_input_strides[copy_bcast_dim] =
1001 params.input_block_strides[params.bcast_dim];
1002 params.bcast_block_strides[copy_bcast_dim] =
1003 params.output_strides[params.bcast_dim];
1004
1005 num_output_coeffs += BroadcastBlock(
1006 params.input_block_sizes, params.input_block_strides,
1007 params.bcast_block_sizes, params.bcast_block_strides,
1008 params.bcast_input_strides, bcast_offset, 0, scratch,
1009 materialized_output, materialized_input, materialized_input_size);
1010 }
1011
1012 return num_output_coeffs;
1013 }
1014 }
1015
1016 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index BroadcastBlock(
1017 const Dimensions& input_block_sizes,
1018 const Dimensions& input_block_strides,
1019 const BroadcastDimensions& bcast_block_sizes,
1020 const BroadcastDimensions& bcast_block_strides,
1021 const BroadcastDimensions& bcast_input_strides, Index bcast_offset,
1022 Index offset, TensorBlockScratch& scratch,
1023 ScalarNoConst* materialized_output, ScalarNoConst** materialized_input,
1024 size_t* materialized_input_size) const {
1025 // ---------------------------------------------------------------------- //
1026 // Tensor block descriptor for reading block from the input.
1027 const Index input_offset = bcast_offset + offset;
1028 TensorBlockDesc input_desc(
1029 IsColMajor ? indexColMajor(input_offset) : indexRowMajor(input_offset),
1030 input_block_sizes);
1031
1032 ArgTensorBlock input_block = m_impl.block(input_desc, scratch);
1033
1034 // ---------------------------------------------------------------------- //
1035 // Materialize input block into a temporary memory buffer only if it's not
1036 // already available in the arg block.
1037 const ScalarNoConst* input_buffer = NULL;
1038
1039 if (input_block.data() != NULL) {
1040 // Input block already has raw data, there is no need to materialize it.
1041 input_buffer = input_block.data();
1042
1043 } else {
1044 // Otherwise we have to do block assignment into a temporary buffer.
1045
1046 // Maybe reuse previously allocated buffer, or allocate a new one with a
1047 // scratch allocator.
1048 const size_t input_total_size = input_block_sizes.TotalSize();
1049 if (*materialized_input == NULL ||
1050 *materialized_input_size < input_total_size) {
1051 *materialized_input_size = input_total_size;
1052 void* mem = scratch.allocate(*materialized_input_size * sizeof(Scalar));
1053 *materialized_input = static_cast<ScalarNoConst*>(mem);
1054 }
1055
1056 typedef internal::TensorBlockAssignment<
1057 ScalarNoConst, NumDims, typename ArgTensorBlock::XprType, Index>
1058 TensorBlockAssignment;
1059
1060 TensorBlockAssignment::Run(
1061 TensorBlockAssignment::target(input_block_sizes, input_block_strides,
1062 *materialized_input),
1063 input_block.expr());
1064
1065 input_buffer = *materialized_input;
1066 }
1067
1068 // ---------------------------------------------------------------------- //
1069 // Copy data from the materialized input block to the materialized output, using
1070 // the given broadcast strides (strides with zeroes).
1071 typedef internal::TensorBlockIO<ScalarNoConst, Index, 2 * NumDims, Layout>
1072 TensorBlockIO;
1073
1074 typename TensorBlockIO::Src src(bcast_input_strides, input_buffer);
1075 typename TensorBlockIO::Dst dst(bcast_block_sizes, bcast_block_strides,
1076 materialized_output + offset);
1077
1078 return TensorBlockIO::Copy(dst, src);
1079 }
1080
1081protected:
1082 const Device EIGEN_DEVICE_REF m_device;
1083 const typename internal::remove_reference<Broadcast>::type m_broadcast;
1084 Dimensions m_dimensions;
1085 array<Index, NumDims> m_outputStrides;
1086 array<Index, NumDims> m_inputStrides;
1087 TensorEvaluator<ArgType, Device> m_impl;
1088};
1089
1090
1091} // end namespace Eigen
1092
1093#endif // EIGEN_CXX11_TENSOR_TENSOR_BROADCASTING_H