TensorConvolution.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
5//
6// This Source Code Form is subject to the terms of the Mozilla
7// Public License v. 2.0. If a copy of the MPL was not distributed
8// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
11#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
12
13namespace Eigen {
14
15/** \class TensorConvolution
16 * \ingroup CXX11_Tensor_Module
17 *
18 * \brief Tensor convolution class.
19 *
20 *
21 */
22namespace internal {
23
24template <typename Index, typename InputDims, int NumKernelDims, int Layout>
25class IndexMapper {
26 public:
27 IndexMapper(const InputDims& input_dims, const array<Index, NumKernelDims>& kernel_dims,
28 const array<Index, NumKernelDims>& indices) {
29
30 array<Index, NumDims> dimensions = input_dims;
31 for (int i = 0; i < NumKernelDims; ++i) {
32 const Index index = indices[i];
33 const Index input_dim = input_dims[index];
34 const Index kernel_dim = kernel_dims[i];
35 const Index result_dim = input_dim - kernel_dim + 1;
36 dimensions[index] = result_dim;
37 }
38
39 array<Index, NumDims> inputStrides;
40 array<Index, NumDims> outputStrides;
41 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
42 inputStrides[0] = 1;
43 outputStrides[0] = 1;
44 for (int i = 1; i < NumDims; ++i) {
45 inputStrides[i] = inputStrides[i-1] * input_dims[i-1];
46 outputStrides[i] = outputStrides[i-1] * dimensions[i-1];
47 }
48 } else {
49 inputStrides[NumDims - 1] = 1;
50 outputStrides[NumDims - 1] = 1;
51 for (int i = static_cast<int>(NumDims) - 2; i >= 0; --i) {
52 inputStrides[i] = inputStrides[i + 1] * input_dims[i + 1];
53 outputStrides[i] = outputStrides[i + 1] * dimensions[i + 1];
54 }
55 }
56
57 array<Index, NumDims> gpuInputDimensions;
58 array<Index, NumDims> gpuOutputDimensions;
59 array<Index, NumDims> tmp = dimensions;
60 array<Index, NumDims> ordering;
61 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
62 ? 0
63 : NumDims - NumKernelDims;
64 for (int i = 0; i < NumKernelDims; ++i) {
65 const Index index = i + offset;
66 ordering[index] = indices[i];
67 tmp[indices[i]] = -1;
68 gpuInputDimensions[index] = input_dims[indices[i]];
69 gpuOutputDimensions[index] = dimensions[indices[i]];
70 }
71
72 int written = static_cast<int>(Layout) == static_cast<int>(ColMajor)
73 ? NumKernelDims
74 : 0;
75 for (int i = 0; i < NumDims; ++i) {
76 if (tmp[i] >= 0) {
77 ordering[written] = i;
78 gpuInputDimensions[written] = input_dims[i];
79 gpuOutputDimensions[written] = dimensions[i];
80 ++written;
81 }
82 }
83
84 for (int i = 0; i < NumDims; ++i) {
85 m_inputStrides[i] = inputStrides[ordering[i]];
86 m_outputStrides[i] = outputStrides[ordering[i]];
87 }
88
89 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
90 for (int i = 0; i < NumDims; ++i) {
91 if (i > NumKernelDims) {
92 m_gpuInputStrides[i] =
93 m_gpuInputStrides[i - 1] * gpuInputDimensions[i - 1];
94 m_gpuOutputStrides[i] =
95 m_gpuOutputStrides[i - 1] * gpuOutputDimensions[i - 1];
96 } else {
97 m_gpuInputStrides[i] = 1;
98 m_gpuOutputStrides[i] = 1;
99 }
100 }
101 } else {
102 for (int i = NumDims - 1; i >= 0; --i) {
103 if (static_cast<size_t>(i + 1) < offset) {
104 m_gpuInputStrides[i] =
105 m_gpuInputStrides[i + 1] * gpuInputDimensions[i + 1];
106 m_gpuOutputStrides[i] =
107 m_gpuOutputStrides[i + 1] * gpuOutputDimensions[i + 1];
108 } else {
109 m_gpuInputStrides[i] = 1;
110 m_gpuOutputStrides[i] = 1;
111 }
112 }
113 }
114 }
115
116 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputPlaneToTensorInputOffset(Index p) const {
117 Index inputIndex = 0;
118 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
119 for (int d = NumDims - 1; d > NumKernelDims; --d) {
120 const Index idx = p / m_gpuInputStrides[d];
121 inputIndex += idx * m_inputStrides[d];
122 p -= idx * m_gpuInputStrides[d];
123 }
124 inputIndex += p * m_inputStrides[NumKernelDims];
125 } else {
126 std::ptrdiff_t limit = 0;
127 if (NumKernelDims < NumDims) {
128 limit = NumDims - NumKernelDims - 1;
129 }
130 for (int d = 0; d < limit; ++d) {
131 const Index idx = p / m_gpuInputStrides[d];
132 inputIndex += idx * m_inputStrides[d];
133 p -= idx * m_gpuInputStrides[d];
134 }
135 inputIndex += p * m_inputStrides[limit];
136 }
137 return inputIndex;
138 }
139
140 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputPlaneToTensorOutputOffset(Index p) const {
141 Index outputIndex = 0;
142 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
143 for (int d = NumDims - 1; d > NumKernelDims; --d) {
144 const Index idx = p / m_gpuOutputStrides[d];
145 outputIndex += idx * m_outputStrides[d];
146 p -= idx * m_gpuOutputStrides[d];
147 }
148 outputIndex += p * m_outputStrides[NumKernelDims];
149 } else {
150 std::ptrdiff_t limit = 0;
151 if (NumKernelDims < NumDims) {
152 limit = NumDims - NumKernelDims - 1;
153 }
154 for (int d = 0; d < limit; ++d) {
155 const Index idx = p / m_gpuOutputStrides[d];
156 outputIndex += idx * m_outputStrides[d];
157 p -= idx * m_gpuOutputStrides[d];
158 }
159 outputIndex += p * m_outputStrides[limit];
160 }
161 return outputIndex;
162 }
163
164 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i) const {
165 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
166 ? 0
167 : NumDims - NumKernelDims;
168 return i * m_inputStrides[offset];
169 }
170
171 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i) const {
172 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
173 ? 0
174 : NumDims - NumKernelDims;
175 return i * m_outputStrides[offset];
176 }
177
178 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j) const {
179 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
180 ? 0
181 : NumDims - NumKernelDims;
182 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1];
183 }
184
185 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j) const {
186 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
187 ? 0
188 : NumDims - NumKernelDims;
189 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1];
190 }
191
192 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuInputKernelToTensorInputOffset(Index i, Index j, Index k) const {
193 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
194 ? 0
195 : NumDims - NumKernelDims;
196 return i * m_inputStrides[offset] + j * m_inputStrides[offset + 1] +
197 k * m_inputStrides[offset + 2];
198 }
199
200 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Index mapGpuOutputKernelToTensorOutputOffset(Index i, Index j, Index k) const {
201 const size_t offset = static_cast<int>(Layout) == static_cast<int>(ColMajor)
202 ? 0
203 : NumDims - NumKernelDims;
204 return i * m_outputStrides[offset] + j * m_outputStrides[offset + 1] +
205 k * m_outputStrides[offset + 2];
206 }
207
208 private:
209 static const int NumDims = internal::array_size<InputDims>::value;
210 array<Index, NumDims> m_inputStrides;
211 array<Index, NumDims> m_outputStrides;
212 array<Index, NumDims> m_gpuInputStrides;
213 array<Index, NumDims> m_gpuOutputStrides;
214};
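// Editorial example (a sketch, not part of the original file; all names and
// shapes below are illustrative). The internal IndexMapper splits a tensor into
// the convolved ("kernel") dimensions and the remaining "plane" dimensions so a
// GPU kernel can walk planes with a single flat index:
//
//   using Index = Eigen::DenseIndex;
//   Eigen::DSizes<Index, 3> input_dims(4, 10, 6);   // 4x10x6 input tensor
//   Eigen::array<Index, 1> kernel_dims = {{3}};     // 1D kernel of size 3
//   Eigen::array<Index, 1> indices = {{1}};         // convolve along dimension 1
//   Eigen::internal::IndexMapper<Index, Eigen::DSizes<Index, 3>, 1, Eigen::ColMajor>
//       mapper(input_dims, kernel_dims, indices);
//   // mapGpuInputPlaneToTensorInputOffset(p) turns a flat plane index p (over
//   // the 4x6 non-convolved dimensions) into an offset in the 4x10x6 input;
//   // mapGpuInputKernelToTensorInputOffset(i) adds the offset of position i
//   // along the convolved dimension.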
215
216
217
218template<typename Dimensions, typename InputXprType, typename KernelXprType>
219struct traits<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >
220{
221 // Type promotion to handle the case where the types of the lhs and the rhs are different.
222 typedef typename promote_storage_type<typename InputXprType::Scalar,
223 typename KernelXprType::Scalar>::ret Scalar;
224 typedef typename promote_storage_type<typename traits<InputXprType>::StorageKind,
225 typename traits<KernelXprType>::StorageKind>::ret StorageKind;
226 typedef typename promote_index_type<typename traits<InputXprType>::Index,
227 typename traits<KernelXprType>::Index>::type Index;
228 typedef typename InputXprType::Nested LhsNested;
229 typedef typename KernelXprType::Nested RhsNested;
230 typedef typename remove_reference<LhsNested>::type _LhsNested;
231 typedef typename remove_reference<RhsNested>::type _RhsNested;
232 static const int NumDimensions = traits<InputXprType>::NumDimensions;
233 static const int Layout = traits<InputXprType>::Layout;
234 typedef typename conditional<Pointer_type_promotion<typename InputXprType::Scalar, Scalar>::val,
235 typename traits<InputXprType>::PointerType, typename traits<KernelXprType>::PointerType>::type PointerType;
236
237 enum {
238 Flags = 0
239 };
240};
241
242template<typename Dimensions, typename InputXprType, typename KernelXprType>
243struct eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, Eigen::Dense>
244{
245 typedef const TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>& type;
246};
247
248template<typename Dimensions, typename InputXprType, typename KernelXprType>
249struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, typename eval<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> >::type>
250{
251 typedef TensorConvolutionOp<Dimensions, InputXprType, KernelXprType> type;
252};
253
254} // end namespace internal
255
256
257
258template<typename Indices, typename InputXprType, typename KernelXprType>
259class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors>
260{
261 public:
262 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar;
263 typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
264 typedef typename internal::promote_storage_type<typename InputXprType::CoeffReturnType,
265 typename KernelXprType::CoeffReturnType>::ret CoeffReturnType;
266 typedef typename Eigen::internal::nested<TensorConvolutionOp>::type Nested;
267 typedef typename Eigen::internal::traits<TensorConvolutionOp>::StorageKind StorageKind;
268 typedef typename Eigen::internal::traits<TensorConvolutionOp>::Index Index;
269
270 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorConvolutionOp(const InputXprType& input, const KernelXprType& kernel, const Indices& dims)
271 : m_input_xpr(input), m_kernel_xpr(kernel), m_indices(dims) {}
272
273 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
274 const Indices& indices() const { return m_indices; }
275
276 /** \returns the nested expressions */
277 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
278 const typename internal::remove_all<typename InputXprType::Nested>::type&
279 inputExpression() const { return m_input_xpr; }
280
281 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
282 const typename internal::remove_all<typename KernelXprType::Nested>::type&
283 kernelExpression() const { return m_kernel_xpr; }
284
285 protected:
286 typename InputXprType::Nested m_input_xpr;
287 typename KernelXprType::Nested m_kernel_xpr;
288 const Indices m_indices;
289};
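// Editorial example (a sketch, not part of the original file): TensorConvolutionOp
// is normally created through Tensor::convolve() rather than constructed directly.
// The names below are illustrative. Convolving a dimension of size N with a kernel
// of size K yields a dimension of size N - K + 1 ("valid" convolution, no padding):
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> input(20, 30);
//   Eigen::Tensor<float, 1> kernel(3);
//   input.setRandom();
//   kernel.setRandom();
//
//   Eigen::array<ptrdiff_t, 1> dims({0});  // convolve along the first dimension
//   Eigen::Tensor<float, 2> result = input.convolve(kernel, dims);
//   // result.dimension(0) == 18 (20 - 3 + 1), result.dimension(1) == 30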
290
291
292template<typename Indices, typename InputArgType, typename KernelArgType, typename Device>
293struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Device>
294{
295 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
296
297 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, Device>::Dimensions>::value;
298 static const int NumKernelDims = internal::array_size<Indices>::value;
299 typedef typename XprType::Index Index;
300 typedef DSizes<Index, NumDims> Dimensions;
301
302 typedef typename XprType::Scalar Scalar;
303 typedef typename XprType::CoeffReturnType CoeffReturnType;
304 typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
305 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
306 typedef StorageMemory<Scalar, Device> Storage;
307 typedef typename Storage::Type EvaluatorPointerType;
308
309 enum {
310 IsAligned = int(TensorEvaluator<InputArgType, Device>::IsAligned) & int(TensorEvaluator<KernelArgType, Device>::IsAligned),
311 PacketAccess = int(TensorEvaluator<InputArgType, Device>::PacketAccess) & int(TensorEvaluator<KernelArgType, Device>::PacketAccess),
312 BlockAccess = false,
313 PreferBlockAccess = false,
314 Layout = TensorEvaluator<InputArgType, Device>::Layout,
315 CoordAccess = false, // to be implemented
316 RawAccess = false
317 };
318
319 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
320 typedef internal::TensorBlockNotImplemented TensorBlock;
321 //===--------------------------------------------------------------------===//
322
323 EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device)
324 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_kernel(NULL), m_local_kernel(false), m_device(device)
325 {
326 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, Device>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
327
328 const typename TensorEvaluator<InputArgType, Device>::Dimensions& input_dims = m_inputImpl.dimensions();
329 const typename TensorEvaluator<KernelArgType, Device>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
330
331 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
332 m_inputStride[0] = 1;
333 for (int i = 1; i < NumDims; ++i) {
334 m_inputStride[i] = m_inputStride[i - 1] * input_dims[i - 1];
335 }
336 } else {
337 m_inputStride[NumDims - 1] = 1;
338 for (int i = NumDims - 2; i >= 0; --i) {
339 m_inputStride[i] = m_inputStride[i + 1] * input_dims[i + 1];
340 }
341 }
342
343 m_dimensions = m_inputImpl.dimensions();
344 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
345 for (int i = 0; i < NumKernelDims; ++i) {
346 const Index index = op.indices()[i];
347 const Index input_dim = input_dims[index];
348 const Index kernel_dim = kernel_dims[i];
349 const Index result_dim = input_dim - kernel_dim + 1;
350 m_dimensions[index] = result_dim;
351 if (i > 0) {
352 m_kernelStride[i] = m_kernelStride[i - 1] * kernel_dims[i - 1];
353 } else {
354 m_kernelStride[0] = 1;
355 }
356 m_indexStride[i] = m_inputStride[index];
357 }
358
359 m_outputStride[0] = 1;
360 for (int i = 1; i < NumDims; ++i) {
361 m_outputStride[i] = m_outputStride[i - 1] * m_dimensions[i - 1];
362 }
363 } else {
364 for (int i = NumKernelDims - 1; i >= 0; --i) {
365 const Index index = op.indices()[i];
366 const Index input_dim = input_dims[index];
367 const Index kernel_dim = kernel_dims[i];
368 const Index result_dim = input_dim - kernel_dim + 1;
369 m_dimensions[index] = result_dim;
370 if (i < NumKernelDims - 1) {
371 m_kernelStride[i] = m_kernelStride[i + 1] * kernel_dims[i + 1];
372 } else {
373 m_kernelStride[NumKernelDims - 1] = 1;
374 }
375 m_indexStride[i] = m_inputStride[index];
376 }
377
378 m_outputStride[NumDims - 1] = 1;
379 for (int i = NumDims - 2; i >= 0; --i) {
380 m_outputStride[i] = m_outputStride[i + 1] * m_dimensions[i + 1];
381 }
382 }
383 }
384
385 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; }
386
387 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar*) {
388 m_inputImpl.evalSubExprsIfNeeded(NULL);
389 preloadKernel();
390 return true;
391 }
392 EIGEN_STRONG_INLINE void cleanup() {
393 m_inputImpl.cleanup();
394 if (m_local_kernel) {
395 m_device.deallocate((void*)m_kernel);
396 m_local_kernel = false;
397 }
398 m_kernel = NULL;
399 }
400
401 void evalTo(typename XprType::Scalar* buffer) {
402 evalSubExprsIfNeeded(NULL);
403 for (int i = 0; i < dimensions().TotalSize(); ++i) {
404 buffer[i] += coeff(i);
405 }
406 cleanup();
407 }
408
409 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
410 {
411 CoeffReturnType result = CoeffReturnType(0);
412 convolve(firstInput(index), 0, NumKernelDims-1, result);
413 return result;
414 }
415
416 template<int LoadMode>
417 EIGEN_DEVICE_FUNC PacketReturnType packet(const Index index) const
418 {
419 Index indices[2] = {index, index+PacketSize-1};
420 Index startInputs[2] = {0, 0};
421 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
422 for (int i = NumDims - 1; i > 0; --i) {
423 const Index idx0 = indices[0] / m_outputStride[i];
424 const Index idx1 = indices[1] / m_outputStride[i];
425 startInputs[0] += idx0 * m_inputStride[i];
426 startInputs[1] += idx1 * m_inputStride[i];
427 indices[0] -= idx0 * m_outputStride[i];
428 indices[1] -= idx1 * m_outputStride[i];
429 }
430 } else {
431 for (int i = 0; i < NumDims - 1; ++i) {
432 const Index idx0 = indices[0] / m_outputStride[i];
433 const Index idx1 = indices[1] / m_outputStride[i];
434 startInputs[0] += idx0 * m_inputStride[i];
435 startInputs[1] += idx1 * m_inputStride[i];
436 indices[0] -= idx0 * m_outputStride[i];
437 indices[1] -= idx1 * m_outputStride[i];
438 }
439 }
440 startInputs[0] += indices[0];
441 startInputs[1] += indices[1];
442
443 if (startInputs[1]-startInputs[0] == PacketSize-1) {
444 PacketReturnType result = internal::pset1<PacketReturnType>(0);
445 convolvePacket(startInputs[0], 0, NumKernelDims-1, result);
446 return result;
447 } else {
448 EIGEN_ALIGN_MAX Scalar data[PacketSize];
449 data[0] = Scalar(0);
450 convolve(startInputs[0], 0, NumKernelDims-1, data[0]);
451 for (int i = 1; i < PacketSize-1; ++i) {
452 data[i] = Scalar(0);
453 convolve(firstInput(index+i), 0, NumKernelDims-1, data[i]);
454 }
455 data[PacketSize-1] = Scalar(0);
456 convolve(startInputs[1], 0, NumKernelDims-1, data[PacketSize-1]);
457 return internal::pload<PacketReturnType>(data);
458 }
459 }
460
461 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
462 costPerCoeff(bool vectorized) const {
463 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
464 // We ignore the use of fused multiply-add.
465 const double convolve_compute_cost =
466 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
467 const double firstIndex_compute_cost =
468 NumDims *
469 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
470 TensorOpCost::DivCost<Index>());
471 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
472 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
473 m_kernelImpl.costPerCoeff(vectorized) +
474 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
475 PacketSize));
476 }
477
478 EIGEN_DEVICE_FUNC EvaluatorPointerType data() const { return NULL; }
479
480 private:
481 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const {
482 Index startInput = 0;
483 if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
484 for (int i = NumDims - 1; i > 0; --i) {
485 const Index idx = index / m_outputStride[i];
486 startInput += idx * m_inputStride[i];
487 index -= idx * m_outputStride[i];
488 }
489 } else {
490 for (int i = 0; i < NumDims - 1; ++i) {
491 const Index idx = index / m_outputStride[i];
492 startInput += idx * m_inputStride[i];
493 index -= idx * m_outputStride[i];
494 }
495 }
496 startInput += index;
497 return startInput;
498 }
499
500 EIGEN_DEVICE_FUNC void convolve(Index firstIndex, Index firstKernel, int DimIndex, CoeffReturnType& accum) const {
501 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
502 const Index input = firstIndex + j * m_indexStride[DimIndex];
503 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
504 if (DimIndex > 0) {
505 convolve(input, kernel, DimIndex-1, accum);
506 } else {
507 accum += m_inputImpl.coeff(input) * m_kernel[kernel];
508 }
509 }
510 }
511
512 template <typename Packet>
513 EIGEN_DEVICE_FUNC void convolvePacket(Index firstIndex, Index firstKernel, int DimIndex, Packet& accum) const {
514 for (int j = 0; j < m_kernelImpl.dimensions()[DimIndex]; ++j) {
515 const Index input = firstIndex + j * m_indexStride[DimIndex];
516 const Index kernel = firstKernel + j * m_kernelStride[DimIndex];
517 if (DimIndex > 0) {
518 convolvePacket(input, kernel, DimIndex-1, accum);
519 } else {
520 accum = internal::pmadd<Packet>(m_inputImpl.template packet<Unaligned>(input), internal::pset1<Packet>(m_kernel[kernel]), accum);
521 }
522 }
523 }
524
525 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
526 // Don't make a local copy of the kernel unless we have to (i.e. it's an
527 // expression that needs to be evaluated)
528 const Scalar* in_place = m_kernelImpl.data();
529 if (in_place) {
530 m_kernel = in_place;
531 m_local_kernel = false;
532 } else {
533 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
534 Scalar* local = (Scalar*)m_device.allocate_temp(kernel_sz);
535 typedef TensorEvalToOp<const KernelArgType> EvalTo;
536 EvalTo evalToTmp(local, m_kernelArg);
537 const bool Vectorize = internal::IsVectorizable<Device, KernelArgType>::value;
538 internal::TensorExecutor<const EvalTo, Device, Vectorize>::run(evalToTmp, m_device);
539
540 m_kernel = local;
541 m_local_kernel = true;
542 }
543 }
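  // Editorial example (a sketch, names such as `input` and `dims` are illustrative):
  // preloadKernel() only copies the kernel when the kernel expression has no
  // addressable storage of its own.
  //
  //   Eigen::Tensor<float, 1> k1(3), k2(3);
  //   // k1 is a plain tensor: m_kernelImpl.data() is non-null, no copy is made.
  //   auto conv_a = input.convolve(k1, dims);
  //   // k1 + k2 is an unevaluated expression: data() returns null, so the
  //   // evaluator materializes the sum into a temporary buffer and releases it
  //   // in cleanup().
  //   auto conv_b = input.convolve(k1 + k2, dims);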
544
545 array<Index, NumDims> m_inputStride;
546 array<Index, NumDims> m_outputStride;
547
548 array<Index, NumKernelDims> m_indexStride;
549 array<Index, NumKernelDims> m_kernelStride;
550 TensorEvaluator<InputArgType, Device> m_inputImpl;
551 TensorEvaluator<KernelArgType, Device> m_kernelImpl;
552 Dimensions m_dimensions;
553
554 KernelArgType m_kernelArg;
555 const Scalar* m_kernel;
556 bool m_local_kernel;
557 const Device EIGEN_DEVICE_REF m_device;
558};
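// Editorial example (a sketch, not part of the original file): for a 1D kernel the
// convolve() recursion in the evaluator above reduces to the reference loop below,
// which is what each output coefficient computes:
//
//   #include <vector>
//
//   // "Valid" 1D convolution: out[i] = sum_k in[i + k] * ker[k]
//   std::vector<float> convolve1d(const std::vector<float>& in,
//                                 const std::vector<float>& ker) {
//     std::vector<float> out(in.size() - ker.size() + 1, 0.0f);
//     for (std::size_t i = 0; i < out.size(); ++i)
//       for (std::size_t k = 0; k < ker.size(); ++k)
//         out[i] += in[i + k] * ker[k];
//     return out;
//   }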
559
560
561
562
563// Use an optimized implementation of the evaluation code for GPUs whenever possible.
564#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC)
565
566template <int StaticKernelSize>
567struct GetKernelSize {
568 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int /*kernelSize*/) const {
569 return StaticKernelSize;
570 }
571};
572template <>
573struct GetKernelSize<Dynamic> {
574 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator() (const int kernelSize) const {
575 return kernelSize;
576 }
577};
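// Editorial example (a sketch): GetKernelSize lets the GPU kernels below be
// instantiated for a compile-time kernel size (so the inner loops can be fully
// unrolled) while still supporting arbitrary sizes at runtime:
//
//   GetKernelSize<4>()(7);        // returns 4: the static size wins
//   GetKernelSize<Dynamic>()(7);  // returns 7: falls back to the runtime value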
578
579template <typename InputEvaluator, typename Index, typename InputDims,
580 int StaticKernelSize>
581__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
582 InputEvaluator eval,
583 const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
584 indexMapper,
585 const float* __restrict kernel, const int numPlanes, const int numX,
586 const int maxX, const int kernelSize, float* buffer) {
587#if defined(EIGEN_HIPCC)
588 HIP_DYNAMIC_SHARED(float, s)
589#else
590 extern __shared__ float s[];
591#endif
592
593 const int first_x = blockIdx.x * maxX;
594 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
595 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSize>()(kernelSize);
596 const int num_x_output = last_x - first_x + 1;
597
598 const int first_plane = blockIdx.y * blockDim.y;
599 const int plane_stride = blockDim.y * gridDim.y;
600
601 for (int p = first_plane + threadIdx.y; p < numPlanes; p += plane_stride) {
602 // Load inputs to shared memory
603 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
604 const int plane_kernel_offset = threadIdx.y * num_x_input;
605 #pragma unroll
606 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
607 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x);
608 s[i + plane_kernel_offset] = eval.coeff(tensor_index);
609 }
610
611 __syncthreads();
612
613 // Compute the convolution
614 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
615
616 #pragma unroll
617 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
618 const int kernel_offset = plane_kernel_offset + i;
619 float result = 0.0f;
620 #pragma unroll
621 for (int k = 0; k < GetKernelSize<StaticKernelSize>()(kernelSize); ++k) {
622 result += s[k + kernel_offset] * kernel[k];
623 }
624 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x);
625 buffer[tensor_index] = result;
626 }
627 __syncthreads();
628 }
629};
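// Editorial note (worked example with illustrative numbers): each block of
// EigenConvolutionKernel1D covers maxX output columns and stages the inputs it
// needs in shared memory. With numX = 100, maxX = 64 and kernelSize = 5, block 0
// computes outputs [0, 63] and loads 64 + 5 - 1 = 68 inputs per plane, while
// block 1 computes outputs [64, 99] and loads 36 + 5 - 1 = 40 inputs per plane.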
630
631template <typename InputEvaluator, typename Index, typename InputDims,
632 int StaticKernelSizeX, int StaticKernelSizeY>
633__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
634 InputEvaluator eval,
635 const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
636 indexMapper,
637 const float* __restrict kernel, const int numPlanes, const int numX,
638 const int maxX, const int numY, const int maxY, const int kernelSizeX,
639 const int kernelSizeY, float* buffer) {
640#if defined(EIGEN_HIPCC)
641 HIP_DYNAMIC_SHARED(float, s)
642#else
643 extern __shared__ float s[];
644#endif
645
646 const int first_x = blockIdx.x * maxX;
647 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
648 const int num_x_input = last_x - first_x + GetKernelSize<StaticKernelSizeX>()(kernelSizeX);
649 const int num_x_output = last_x - first_x + 1;
650
651 const int first_y = blockIdx.y * maxY;
652 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
653 const int num_y_input = last_y - first_y + GetKernelSize<StaticKernelSizeY>()(kernelSizeY);
654 const int num_y_output = last_y - first_y + 1;
655
656 const int first_plane = blockIdx.z * blockDim.z;
657 const int plane_stride = blockDim.z * gridDim.z;
658
659 for (int p = first_plane + threadIdx.z; p < numPlanes; p += plane_stride) {
660
661 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
662 const int plane_kernel_offset = threadIdx.z * num_y_input;
663
664 // Load inputs to shared memory
665 #pragma unroll
666 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
667 const int input_offset = num_x_input * (j + plane_kernel_offset);
668 #pragma unroll
669 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
670 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y);
671 s[i + input_offset] = eval.coeff(tensor_index);
672 }
673 }
674
675 __syncthreads();
676
677 // Convolution
678 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
679
680 #pragma unroll
681 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
682 #pragma unroll
683 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
684 float result = 0.0f;
685 #pragma unroll
686 for (int l = 0; l < GetKernelSize<StaticKernelSizeY>()(kernelSizeY); ++l) {
687 const int kernel_offset = kernelSizeX * l;
688 const int input_offset = i + num_x_input * (j + l + plane_kernel_offset);
689 #pragma unroll
690 for (int k = 0; k < GetKernelSize<StaticKernelSizeX>()(kernelSizeX); ++k) {
691 result += s[k + input_offset] * kernel[k + kernel_offset];
692 }
693 }
694 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y);
695 buffer[tensor_index] = result;
696 }
697 }
698
699 __syncthreads();
700 }
701};
702
703template <typename InputEvaluator, typename Index, typename InputDims>
704__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
705 InputEvaluator eval,
706 const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
707 indexMapper,
708 const float* __restrict kernel, const size_t numPlanes, const size_t numX,
709 const size_t maxX, const size_t numY, const size_t maxY, const size_t numZ,
710 const size_t maxZ, const size_t kernelSizeX, const size_t kernelSizeY,
711 const size_t kernelSizeZ, float* buffer) {
712#if defined(EIGEN_HIPCC)
713 HIP_DYNAMIC_SHARED(float, s)
714#else
715 extern __shared__ float s[];
716#endif
717
718 // Load inputs to shared memory
719 const int first_x = blockIdx.x * maxX;
720 const int last_x = (first_x + maxX < numX ? first_x + maxX : numX) - 1;
721 const int num_x_input = last_x - first_x + kernelSizeX;
722
723 const int first_y = blockIdx.y * maxY;
724 const int last_y = (first_y + maxY < numY ? first_y + maxY : numY) - 1;
725 const int num_y_input = last_y - first_y + kernelSizeY;
726
727 const int first_z = blockIdx.z * maxZ;
728 const int last_z = (first_z + maxZ < numZ ? first_z + maxZ : numZ) - 1;
729 const int num_z_input = last_z - first_z + kernelSizeZ;
730
731 for (int p = 0; p < numPlanes; ++p) {
732
733 const int plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
734 const int plane_kernel_offset = 0;
735
736 for (int k = threadIdx.z; k < num_z_input; k += blockDim.z) {
737 for (int j = threadIdx.y; j < num_y_input; j += blockDim.y) {
738 for (int i = threadIdx.x; i < num_x_input; i += blockDim.x) {
739 const int tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i+first_x, j+first_y, k+first_z);
740 s[i + num_x_input * (j + num_y_input * (k + plane_kernel_offset))] = eval.coeff(tensor_index);
741 }
742 }
743 }
744
745 __syncthreads();
746
747 // Convolution
748 const int num_z_output = last_z - first_z + 1;
749 const int num_y_output = last_y - first_y + 1;
750 const int num_x_output = last_x - first_x + 1;
751 const int plane_output_offset = indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p);
752
753 for (int k = threadIdx.z; k < num_z_output; k += blockDim.z) {
754 for (int j = threadIdx.y; j < num_y_output; j += blockDim.y) {
755 for (int i = threadIdx.x; i < num_x_output; i += blockDim.x) {
756 float result = 0.0f;
757 for (int n = 0; n < kernelSizeZ; ++n) {
758 for (int m = 0; m < kernelSizeY; ++m) {
759 for (int l = 0; l < kernelSizeX; ++l) {
760 result += s[i + l + num_x_input * (j + m + num_y_input * (k + n + plane_kernel_offset))] * kernel[l + kernelSizeX * (m + kernelSizeY * n)];
761 }
762 }
763 }
764 const int tensor_index = plane_output_offset + indexMapper.mapGpuOutputKernelToTensorOutputOffset(i+first_x, j+first_y, k+first_z);
765 buffer[tensor_index] = result;
766 }
767 }
768 }
769 __syncthreads();
770 }
771};
772
773
774
775template<typename Indices, typename InputArgType, typename KernelArgType>
776struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, GpuDevice>
777{
778 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
779
780 static const int NumDims = internal::array_size<typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions>::value;
781 static const int NumKernelDims = internal::array_size<Indices>::value;
782 typedef typename XprType::Index Index;
783 typedef DSizes<Index, NumDims> Dimensions;
784 typedef typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions KernelDimensions;
785
786 enum {
787 IsAligned = TensorEvaluator<InputArgType, GpuDevice>::IsAligned & TensorEvaluator<KernelArgType, GpuDevice>::IsAligned,
788 PacketAccess = false,
789 BlockAccess = false,
790 PreferBlockAccess = false,
791 Layout = TensorEvaluator<InputArgType, GpuDevice>::Layout,
792 CoordAccess = false, // to be implemented
793 RawAccess = false
794 };
795
796 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
797 typedef internal::TensorBlockNotImplemented TensorBlock;
798 //===--------------------------------------------------------------------===//
799
800 TensorEvaluator(const XprType& op, const GpuDevice& device)
801 : m_inputImpl(op.inputExpression(), device), m_kernelImpl(op.kernelExpression(), device), m_kernelArg(op.kernelExpression()), m_indices(op.indices()), m_buf(NULL), m_kernel(NULL), m_local_kernel(false), m_device(device)
802 {
803 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, GpuDevice>::Layout) == static_cast<int>(TensorEvaluator<KernelArgType, GpuDevice>::Layout)), YOU_MADE_A_PROGRAMMING_MISTAKE);
804
805 const typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions& input_dims = m_inputImpl.dimensions();
806 const typename TensorEvaluator<KernelArgType, GpuDevice>::Dimensions& kernel_dims = m_kernelImpl.dimensions();
807
808 m_dimensions = m_inputImpl.dimensions();
809 for (int i = 0; i < NumKernelDims; ++i) {
810 const Index index = op.indices()[i];
811 const Index input_dim = input_dims[index];
812 const Index kernel_dim = kernel_dims[i];
813 const Index result_dim = input_dim - kernel_dim + 1;
814 m_dimensions[index] = result_dim;
815 }
816 }
817
818 typedef typename XprType::CoeffReturnType CoeffReturnType;
819 typedef typename PacketType<CoeffReturnType, GpuDevice>::type PacketReturnType;
820 typedef typename InputArgType::Scalar Scalar;
821 static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
822
823 EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_dimensions; }
824
825 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
826 preloadKernel();
827 m_inputImpl.evalSubExprsIfNeeded(NULL);
828 if (data) {
829 executeEval(data);
830 return false;
831 } else {
832 m_buf = (Scalar*)m_device.allocate(dimensions().TotalSize() * sizeof(Scalar));
833 executeEval(m_buf);
834 return true;
835 }
836 }
837
838 EIGEN_STRONG_INLINE void cleanup() {
839 m_inputImpl.cleanup();
840 if (m_buf) {
841 m_device.deallocate(m_buf);
842 m_buf = NULL;
843 }
844 if (m_local_kernel) {
845 m_device.deallocate((void*)m_kernel);
846 m_local_kernel = false;
847 }
848 m_kernel = NULL;
849 }
850
851 EIGEN_STRONG_INLINE void preloadKernel() {
852 // Don't make a local copy of the kernel unless we have to (i.e. it's an
853 // expression that needs to be evaluated)
854 const Scalar* in_place = m_kernelImpl.data();
855 if (in_place) {
856 m_kernel = in_place;
857 m_local_kernel = false;
858 } else {
859 size_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
860 Scalar* local = (Scalar*)m_device.allocate(kernel_sz);
861 typedef TensorEvalToOp<const KernelArgType> EvalTo;
862 EvalTo evalToTmp(local, m_kernelArg);
863 const bool PacketAccess = internal::IsVectorizable<GpuDevice, KernelArgType>::value;
864 internal::TensorExecutor<const EvalTo, GpuDevice, PacketAccess>::run(evalToTmp, m_device);
865
866 m_kernel = local;
867 m_local_kernel = true;
868 }
869 }
870
871 static unsigned int ceil(unsigned int num, unsigned int denom) {
872 const unsigned int rounded_toward_zero = num / denom;
873 if (num > rounded_toward_zero * denom) {
874 return rounded_toward_zero + 1;
875 }
876 return rounded_toward_zero;
877 }
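  // Editorial example (a sketch): ceil() is integer division rounded up and is
  // used below to size the launch grid, e.g.
  //
  //   ceil(1000, 128);  // == 8 blocks of 128 threads to cover 1000 outputs
  //   ceil(1024, 128);  // == 8, no extra block when the division is exact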
878
879 void executeEval(Scalar* data) const {
880 typedef typename TensorEvaluator<InputArgType, GpuDevice>::Dimensions InputDims;
881
882 const int maxSharedMem = m_device.sharedMemPerBlock();
883 const int maxThreadsPerBlock = m_device.maxGpuThreadsPerBlock();
884 const int maxBlocksPerProcessor = m_device.maxGpuThreadsPerMultiProcessor() / maxThreadsPerBlock;
885 const int numMultiProcessors = m_device.getNumGpuMultiProcessors();
886 const int warpSize = 32;
887
888 switch (NumKernelDims) {
889 case 1: {
890 const int kernel_size = m_kernelImpl.dimensions().TotalSize();
891
892 const int numX = dimensions()[m_indices[0]];
893 const int numP = dimensions().TotalSize() / numX;
894 int maxX;
895 dim3 block_size;
896
897 const int single_stride_dim =
898 static_cast<int>(Layout) == static_cast<int>(ColMajor)
899 ? 0
900 : m_inputImpl.dimensions().rank() - 1;
901 if (m_indices[0] == single_stride_dim) {
902 // Maximize the reuse
903 const int inner_dim = ((maxSharedMem / (sizeof(Scalar)) - kernel_size + 1 + 31) / 32) * 32;
904 maxX = numext::mini<int>(inner_dim, numX);
905 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size - 1 + maxX) * sizeof(Scalar)), numP);
906 block_size.x = numext::mini(maxThreadsPerBlock, maxX);
907 block_size.y = numext::mini<int>(maxThreadsPerBlock / block_size.x, maxP);
908 }
909 else {
910 // Read as much as possible along the innermost dimension, that is the plane
911 const int inner_dim = maxSharedMem / ((warpSize + kernel_size) * sizeof(Scalar));
912 const int maxP = numext::mini<int>(inner_dim, numP);
913 maxX = numext::mini<int>(maxSharedMem / (inner_dim * sizeof(Scalar)) - kernel_size + 1, numX);
914
915 block_size.x = numext::mini(warpSize, maxX);
916 block_size.y = numext::mini<int>(maxThreadsPerBlock/block_size.x, maxP);
917 }
918
919 const int shared_mem = block_size.y * (maxX + kernel_size - 1) * sizeof(Scalar);
920 gpu_assert(shared_mem <= maxSharedMem);
921
922 const int num_x_blocks = ceil(numX, maxX);
923 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
924 const int num_y_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks);
925
926 dim3 num_blocks(num_x_blocks, numext::mini<int>(num_y_blocks, ceil(numP, block_size.y)));
927
928
929 //cout << "launching 1D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " maxX: " << maxX << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
930
931 const array<Index, 1> indices(m_indices[0]);
932 const array<Index, 1> kernel_dims(m_kernelImpl.dimensions()[0]);
933 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(
934 m_inputImpl.dimensions(), kernel_dims, indices);
935 switch(kernel_size) {
936 case 4: {
937 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 4, data);
938 break;
939 }
940 case 7: {
941 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, 7, data);
942 break;
943 }
944 default: {
945 LAUNCH_GPU_KERNEL((EigenConvolutionKernel1D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, kernel_size, data);
946 }
947 }
948 break;
949 }
950
951 case 2: {
952 const int idxX =
953 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1;
954 const int idxY =
955 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0;
956 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
957 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
958
959 const int numX = dimensions()[m_indices[idxX]];
960 const int numY = dimensions()[m_indices[idxY]];
961 const int numP = dimensions().TotalSize() / (numX*numY);
962
963 const float scaling_factor = sqrtf(static_cast<float>(maxSharedMem) / (sizeof(Scalar) * kernel_size_y * kernel_size_x));
964
965 // Snap maxX to warp size
966 int inner_dim = ((static_cast<int>(scaling_factor * kernel_size_x) - kernel_size_x + 1 + 32) / 32) * 32;
967 const int maxX = numext::mini<int>(inner_dim, numX);
968 const int maxY = numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1)) - kernel_size_y + 1, numY);
969 const int maxP = numext::mini<int>(maxSharedMem / ((kernel_size_x - 1 + maxX) * (kernel_size_y - 1 + maxY) * sizeof(Scalar)), numP);
970
971 dim3 block_size;
972 block_size.x = numext::mini(1024, maxX);
973 block_size.y = numext::mini<int>(1024/block_size.x, maxY);
974 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxP);
975
976 const int shared_mem = block_size.z * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * sizeof(Scalar);
977 gpu_assert(shared_mem <= maxSharedMem);
978
979 const int num_x_blocks = ceil(numX, maxX);
980 const int num_y_blocks = ceil(numY, maxY);
981 const int blocksPerProcessor = numext::mini(maxBlocksPerProcessor, maxSharedMem / shared_mem);
982 const int num_z_blocks = ceil(numMultiProcessors * blocksPerProcessor, num_x_blocks * num_y_blocks);
983
984 dim3 num_blocks(num_x_blocks, num_y_blocks, numext::mini<int>(num_z_blocks, ceil(numP, block_size.z)));
985
986
987 //cout << "launching 2D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " maxX: " << maxX << " maxY: " << maxY << " maxP: " << maxP << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
988
989 const array<Index, 2> indices(m_indices[idxX], m_indices[idxY]);
990 const array<Index, 2> kernel_dims(m_kernelImpl.dimensions()[idxX],
991 m_kernelImpl.dimensions()[idxY]);
992 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(
993 m_inputImpl.dimensions(), kernel_dims, indices);
994 switch (kernel_size_x) {
995 case 4: {
996 switch (kernel_size_y) {
997 case 7: {
998 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, 7>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, 7, data);
999 break;
1000 }
1001 default: {
1002 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 4, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 4, kernel_size_y, data);
1003 break;
1004 }
1005 }
1006 break;
1007 }
1008 case 7: {
1009 switch (kernel_size_y) {
1010 case 4: {
1011 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, 4>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, 4, data);
1012 break;
1013 }
1014 default: {
1015 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, 7, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, 7, kernel_size_y, data);
1016 break;
1017 }
1018 }
1019 break;
1020 }
1021 default: {
1022 LAUNCH_GPU_KERNEL((EigenConvolutionKernel2D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims, Dynamic, Dynamic>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, kernel_size_x, kernel_size_y, data);
1023 break;
1024 }
1025 }
1026 break;
1027 }
1028
1029 case 3: {
1030 const int idxX =
1031 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2;
1032 const int idxY =
1033 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1;
1034 const int idxZ =
1035 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0;
1036
1037 const int kernel_size_x = m_kernelImpl.dimensions()[idxX];
1038 const int kernel_size_y = m_kernelImpl.dimensions()[idxY];
1039 const int kernel_size_z = m_kernelImpl.dimensions()[idxZ];
1040
1041 const int numX = dimensions()[m_indices[idxX]];
1042 const int numY = dimensions()[m_indices[idxY]];
1043 const int numZ = dimensions()[m_indices[idxZ]];
1044 const int numP = dimensions().TotalSize() / (numX*numY*numZ);
1045
1046 const int maxX = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * kernel_size_y * kernel_size_z) - kernel_size_x + 1, numX));
1047 const int maxY = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * kernel_size_z) - kernel_size_y + 1, numY));
1048 const int maxZ = numext::mini<int>(128, numext::mini<int>(maxSharedMem / (sizeof(Scalar) * (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1)) - kernel_size_z + 1, numZ));
1049
1050 dim3 block_size;
1051 block_size.x = numext::mini(32, maxX);
1052 block_size.y = numext::mini(32, maxY);
1053 block_size.z = numext::mini<int>(1024/(block_size.x*block_size.y), maxZ);
1054 dim3 num_blocks(ceil(numX, maxX), ceil(numY, maxY), ceil(numZ, maxZ));
1055
1056 const int shared_mem = (maxX + kernel_size_x - 1) * (maxY + kernel_size_y - 1) * (maxZ + kernel_size_z - 1) * sizeof(Scalar);
1057 gpu_assert(shared_mem <= maxSharedMem);
1058
1059 //cout << "launching 3D kernel with block_size.x: " << block_size.x << " block_size.y: " << block_size.y << " block_size.z: " << block_size.z << " num_blocks.x: " << num_blocks.x << " num_blocks.y: " << num_blocks.y << " num_blocks.z: " << num_blocks.z << " shared_mem: " << shared_mem << " in stream " << m_device.stream() << endl;
1060 const array<Index, 3> indices(m_indices[idxX], m_indices[idxY],
1061 m_indices[idxZ]);
1062 const array<Index, 3> kernel_dims(m_kernelImpl.dimensions()[idxX],
1063 m_kernelImpl.dimensions()[idxY],
1064 m_kernelImpl.dimensions()[idxZ]);
1065 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(
1066 m_inputImpl.dimensions(), kernel_dims, indices);
1067
1068 LAUNCH_GPU_KERNEL((EigenConvolutionKernel3D<TensorEvaluator<InputArgType, GpuDevice>, Index, InputDims>), num_blocks, block_size, shared_mem, m_device, m_inputImpl, indexMapper, m_kernel, numP, numX, maxX, numY, maxY, numZ, maxZ, kernel_size_x, kernel_size_y, kernel_size_z, data);
1069 break;
1070 }
1071
1072 default: {
1073 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3), THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
1074 }
1075 }
1076 }
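  // Editorial note (worked example with illustrative numbers): for the 1D case
  // above with kernel_size = 7, maxX = 128 and block_size.y = 4 planes per block,
  // each block stages block_size.y * (maxX + kernel_size - 1) = 4 * 134 = 536
  // floats (2144 bytes) in shared memory; executeEval() asserts that this stays
  // below sharedMemPerBlock() before launching the kernel.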
1077
1078 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const
1079 {
1080 eigen_assert(m_buf);
1081 eigen_assert(index < m_dimensions.TotalSize());
1082 return m_buf[index];
1083 }
1084
1085 template<int LoadMode>
1086 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const
1087 {
1088 eigen_assert(m_buf);
1089 eigen_assert(index < m_dimensions.TotalSize());
1090 return internal::ploadt<PacketReturnType, LoadMode>(m_buf+index);
1091 }
1092
1093 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
1094 costPerCoeff(bool vectorized) const {
1095 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
1096 // model.
1097 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
1098 // We ignore the use of fused multiply-add.
1099 const double convolve_compute_cost =
1100 TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
1101 const double firstIndex_compute_cost =
1102 NumDims *
1103 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() +
1104 TensorOpCost::DivCost<Index>());
1105 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
1106 kernel_size * (m_inputImpl.costPerCoeff(vectorized) +
1107 m_kernelImpl.costPerCoeff(vectorized) +
1108 TensorOpCost(0, 0, convolve_compute_cost, vectorized,
1109 PacketSize));
1110 }
1111
1112 private:
1113 // No assignment (copies are needed by the kernels)
1114 TensorEvaluator& operator = (const TensorEvaluator&);
1115
1116 TensorEvaluator<InputArgType, GpuDevice> m_inputImpl;
1117 TensorEvaluator<KernelArgType, GpuDevice> m_kernelImpl;
1118 KernelArgType m_kernelArg;
1119 Indices m_indices;
1120 Dimensions m_dimensions;
1121 Scalar* m_buf;
1122 const Scalar* m_kernel;
1123 bool m_local_kernel;
1124
1125 const GpuDevice& m_device;
1126};
1127#endif
1128
1129
1130} // end namespace Eigen
1131
1132#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H