Please, help us to better know about our user community by answering the following short survey: https://forms.gle/wpyrxWi18ox9Z5ae9
 
Loading...
Searching...
No Matches
TensorConvolutionSycl.h
1// This file is part of Eigen, a lightweight C++ template library
2// for linear algebra.
3//
4// Mehdi Goli Codeplay Software Ltd.
5// Ralph Potter Codeplay Software Ltd.
6// Luke Iwanski Codeplay Software Ltd.
7// Contact: <eigen@codeplay.com>
8// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
9
10//
11// This Source Code Form is subject to the terms of the Mozilla
12// Public License v. 2.0. If a copy of the MPL was not distributed
13// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
14
15#ifndef EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
16#define EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_SYCL_H
17
18namespace Eigen {
19
28enum class convolution_type { CONV1D, CONV2D, CONV3D };
29template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
30 typename Kernel_accessor, typename Buffer_accessor, convolution_type Conv_Dim>
31struct EigenConvolutionKernel;
32template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
33 typename Kernel_accessor, typename Buffer_accessor>
34struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
35 Buffer_accessor, convolution_type::CONV1D> {
36 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
37 Local_accessor;
38 Local_accessor local_acc;
39 Evaluator device_evaluator;
40 Kernel_accessor kernel_filter;
41 Buffer_accessor buffer_acc;
42 internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper;
43 const size_t kernelSize;
44 const cl::sycl::range<2> input_range;
45 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
46 Buffer_accessor buffer_acc_,
47 internal::IndexMapper<Index, InputDims, 1, Evaluator::Layout> indexMapper_,
48 const size_t kernelSize_, const cl::sycl::range<2> input_range_)
49 : local_acc(local_acc_),
50 device_evaluator(device_evaluator_),
51 kernel_filter(kernel_filter_),
52 buffer_acc(buffer_acc_),
53 indexMapper(indexMapper_),
54 kernelSize(kernelSize_),
55 input_range(input_range_) {}
56
57 template <typename BooleanDim2>
58 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim2 boolean_check) {
59 return (boolean_check[0] && boolean_check[1]);
60 }
61 void operator()(cl::sycl::nd_item<2> itemID) {
62 auto buffer_ptr = buffer_acc.get_pointer();
63 auto kernel_ptr = kernel_filter.get_pointer();
64 // the required row to be calculated for the for each plane in shered memory
65 const size_t num_input = (itemID.get_local_range()[0] + kernelSize - 1);
66 const size_t plane_kernel_offset = itemID.get_local_id(1) * num_input;
67 const size_t input_offset = itemID.get_group(0) * itemID.get_local_range()[0];
68 const size_t plane_tensor_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(1));
70 for (size_t i = itemID.get_local_id(0); i < num_input; i += itemID.get_local_range()[0]) {
71 const size_t local_index = i + plane_kernel_offset;
72 const size_t tensor_index =
73 plane_tensor_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(i + input_offset);
74
75 local_acc[local_index] =
76 (((i + input_offset) < (input_range[0] + kernelSize - 1)) && itemID.get_global_id(1) < input_range[1])
77 ? device_evaluator.coeff(tensor_index)
78 : CoeffReturnType(0);
79 }
80
81 itemID.barrier(cl::sycl::access::fence_space::local_space);
82
83 // calculate the convolution // output start x
84 const size_t first_output_start = itemID.get_group(0) * (itemID.get_local_range()[0]);
85 if (boundary_check(itemID.get_global_id() < input_range)) {
86 CoeffReturnType result = static_cast<CoeffReturnType>(0);
87 const size_t index = plane_kernel_offset + itemID.get_local_id(0);
88 for (size_t k = 0; k < kernelSize; ++k) {
89 result += (local_acc[k + index] * kernel_ptr[k]);
90 }
91 const size_t tensor_index =
92 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(1)) +
93 indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + first_output_start);
94 buffer_ptr[tensor_index] = result;
95 }
96 }
97};
98
99template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
100 typename Kernel_accessor, typename Buffer_accessor>
101struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
102 Buffer_accessor, convolution_type::CONV2D> {
103 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
104 Local_accessor;
105 Local_accessor local_acc;
106 Evaluator device_evaluator;
107 Kernel_accessor kernel_filter;
108 Buffer_accessor buffer_acc;
109 internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper;
110 const cl::sycl::range<2> kernel_size;
111 const cl::sycl::range<3> input_range;
112 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
113 Buffer_accessor buffer_acc_,
114 internal::IndexMapper<Index, InputDims, 2, Evaluator::Layout> indexMapper_,
115 const cl::sycl::range<2> kernel_size_, const cl::sycl::range<3> input_range_)
116 : local_acc(local_acc_),
117 device_evaluator(device_evaluator_),
118 kernel_filter(kernel_filter_),
119 buffer_acc(buffer_acc_),
120 indexMapper(indexMapper_),
121 kernel_size(kernel_size_),
122 input_range(input_range_) {}
123 template <typename BooleanDim3>
124 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
125 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
126 }
127
128 void operator()(cl::sycl::nd_item<3> itemID) {
129 auto buffer_ptr = buffer_acc.get_pointer();
130 auto kernel_ptr = kernel_filter.get_pointer();
131 // the required row to be calculated for the for each plane in shered memory
132 const auto num_input = cl::sycl::range<2>{
133 (cl::sycl::range<2>(itemID.get_local_range()[0], itemID.get_local_range()[1]) + kernel_size - 1)};
134
135 const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(itemID.get_global_id(2));
136 const size_t plane_kernel_offset = itemID.get_local_id(2) * num_input[1];
137
138 const auto input_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
139 itemID.get_group(1) * itemID.get_local_range()[1]};
140
141 // fill the local memory
142 bool in_range_dim2 = itemID.get_global_id(2) < input_range[2];
143 for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
144 const size_t local_input_offset = num_input[0] * (j + plane_kernel_offset);
145 bool in_range_dim1 = ((j + input_offset[1]) < (input_range[1] + kernel_size[1] - 1));
146 for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
147 const size_t local_index = i + local_input_offset;
148 const size_t tensor_index = plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
149 i + input_offset[0], j + input_offset[1]);
150 local_acc[local_index] = (((i + input_offset[0]) < (input_range[0] + kernel_size[0] - 1)) &&
151 in_range_dim1 && in_range_dim2)
152 ? device_evaluator.coeff(tensor_index)
153 : CoeffReturnType(0);
154 }
155 }
156
157 itemID.barrier(cl::sycl::access::fence_space::local_space);
158
159 // output offset start for each thread
160 const auto output_offset = cl::sycl::range<2>{itemID.get_group(0) * itemID.get_local_range()[0],
161 itemID.get_group(1) * itemID.get_local_range()[1]};
162
163 if (boundary_check(itemID.get_global_id() < input_range)) {
164 CoeffReturnType result = static_cast<CoeffReturnType>(0);
165
166 for (size_t j = 0; j < kernel_size[1]; j++) {
167 size_t kernel_offset = kernel_size[0] * j;
168 const size_t index =
169 (num_input[0] * (plane_kernel_offset + j + itemID.get_local_id(1))) + itemID.get_local_id(0);
170 for (size_t i = 0; i < kernel_size[0]; i++) {
171 result += (local_acc[i + index] * kernel_ptr[i + kernel_offset]);
172 }
173 }
174 const size_t tensor_index =
175 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(itemID.get_global_id(2)) +
176 indexMapper.mapGpuOutputKernelToTensorOutputOffset(itemID.get_local_id(0) + output_offset[0],
177 itemID.get_local_id(1) + output_offset[1]);
178
179 buffer_ptr[tensor_index] = result;
180 }
181 }
182};
183
184template <typename Evaluator, typename CoeffReturnType, typename KernelType, typename Index, typename InputDims,
185 typename Kernel_accessor, typename Buffer_accessor>
186struct EigenConvolutionKernel<Evaluator, CoeffReturnType, KernelType, Index, InputDims, Kernel_accessor,
187 Buffer_accessor, convolution_type::CONV3D> {
188 typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
189 Local_accessor;
190 Local_accessor local_acc;
191 Evaluator device_evaluator;
192 Kernel_accessor kernel_filter;
193 Buffer_accessor buffer_acc;
194 internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper;
195 const cl::sycl::range<3> kernel_size;
196 const cl::sycl::range<3> input_range;
197 const size_t numP;
198
199 EigenConvolutionKernel(Local_accessor local_acc_, Evaluator device_evaluator_, Kernel_accessor kernel_filter_,
200 Buffer_accessor buffer_acc_,
201 internal::IndexMapper<Index, InputDims, 3, Evaluator::Layout> indexMapper_,
202 const cl::sycl::range<3> kernel_size_, const cl::sycl::range<3> input_range_,
203 const size_t numP_)
204 : local_acc(local_acc_),
205 device_evaluator(device_evaluator_),
206 kernel_filter(kernel_filter_),
207 buffer_acc(buffer_acc_),
208 indexMapper(indexMapper_),
209 kernel_size(kernel_size_),
210 input_range(input_range_),
211 numP(numP_) {}
212 template <typename BooleanDim3>
213 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool boundary_check(const BooleanDim3 boolean_check) {
214 return (boolean_check[0] && boolean_check[1] && boolean_check[2]);
215 }
216 void operator()(cl::sycl::nd_item<3> itemID) {
217 auto buffer_ptr = buffer_acc.get_pointer();
218 auto kernel_ptr = kernel_filter.get_pointer();
219 const auto num_input = cl::sycl::range<3>{itemID.get_local_range() + kernel_size - 1};
220
221 const auto input_offset = cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range()};
222
223 const auto output_offset =
224 cl::sycl::range<3>{itemID.get_group().get_id() * itemID.get_local_range() + itemID.get_local_id()};
225
226 for (size_t p = 0; p < numP; p++) {
228 const size_t plane_input_offset = indexMapper.mapGpuInputPlaneToTensorInputOffset(p);
229 for (size_t k = itemID.get_local_id(2); k < num_input[2]; k += itemID.get_local_range()[2]) {
230 size_t local_index_dim2 = num_input[0] * num_input[1] * k;
231 bool cond_k_dim = (k + input_offset[2] < (input_range[2] + kernel_size[2] - 1));
232 for (size_t j = itemID.get_local_id(1); j < num_input[1]; j += itemID.get_local_range()[1]) {
233 bool cond_j_dim = cond_k_dim && (j + input_offset[1] < (input_range[1] + kernel_size[1] - 1));
234 size_t local_index_dim1 = (num_input[0] * j) + local_index_dim2;
235 for (size_t i = itemID.get_local_id(0); i < num_input[0]; i += itemID.get_local_range()[0]) {
236 bool conds = cond_j_dim && (i + input_offset[0] < (input_range[0] + kernel_size[0] - 1));
237 const size_t local_index = local_index_dim1 + i;
238 const size_t tensor_index =
239 plane_input_offset + indexMapper.mapGpuInputKernelToTensorInputOffset(
240 i + input_offset[0], j + input_offset[1], k + input_offset[2]);
241 local_acc[local_index] = conds ? device_evaluator.coeff(tensor_index) : CoeffReturnType(0);
242 }
243 }
244 }
245 itemID.barrier(cl::sycl::access::fence_space::local_space);
246
247 // calculate the convolution
248
249 if (boundary_check(itemID.get_global_id() < input_range)) {
250 CoeffReturnType result = static_cast<CoeffReturnType>(0);
251 for (size_t k = 0; k < kernel_size[2]; k++) {
252 for (size_t j = 0; j < kernel_size[1]; j++) {
253 for (size_t i = 0; i < kernel_size[0]; i++) {
254 const size_t kernel_index = i + kernel_size[0] * (j + kernel_size[1] * k);
255 const size_t local_index =
256 ((i + itemID.get_local_id(0)) +
257 num_input[0] * ((j + itemID.get_local_id(1)) + num_input[1] * (k + itemID.get_local_id(2))));
258
259 result += (local_acc[local_index] * kernel_ptr[kernel_index]);
260 }
261 }
262 }
263 const size_t tensor_index =
264 indexMapper.mapGpuOutputPlaneToTensorOutputOffset(p) +
265 indexMapper.mapGpuOutputKernelToTensorOutputOffset(output_offset[0], output_offset[1], output_offset[2]);
266 buffer_ptr[tensor_index] = result;
267 }
268
269 itemID.barrier(cl::sycl::access::fence_space::local_space);
270 }
271 }
272};
273
274template <typename Indices, typename InputArgType, typename KernelArgType>
275struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
276 typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
277
278 static const int NumDims =
279 internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
280 static const int NumKernelDims = internal::array_size<Indices>::value;
281 typedef typename XprType::Index Index;
282 typedef DSizes<Index, NumDims> Dimensions;
283 typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
284 typedef const Eigen::SyclDevice Device;
285 typedef typename XprType::CoeffReturnType CoeffReturnType;
286 typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
287 typedef typename InputArgType::Scalar Scalar;
288 static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
289 typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
290 typedef typename Storage::Type EvaluatorPointerType;
291 typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
292
293 enum {
294 IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
295 TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
296 PacketAccess = false,
297 BlockAccess = false,
298 PreferBlockAccess = false,
299 Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
300 CoordAccess = false, // to be implemented
301 RawAccess = false
302 };
303
304 //===- Tensor block evaluation strategy (see TensorBlock.h) -------------===//
305 typedef internal::TensorBlockNotImplemented TensorBlock;
306 //===--------------------------------------------------------------------===//
307
308 TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
309 : m_inputImpl(op.inputExpression(), device),
310 m_kernelArg(op.kernelExpression()),
311 m_kernelImpl(op.kernelExpression(), device),
312 m_indices(op.indices()),
313 m_buf(NULL),
314 m_kernel(NULL),
315 m_local_kernel(false),
316 m_device(device) {
317 EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
318 static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
319 YOU_MADE_A_PROGRAMMING_MISTAKE);
320
321 const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
322 const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
323 m_kernelImpl.dimensions();
324
325 m_dimensions = m_inputImpl.dimensions();
326 for (int i = 0; i < NumKernelDims; ++i) {
327 const Index index = op.indices()[i];
328 const Index input_dim = input_dims[index];
329 const Index kernel_dim = kernel_dims[i];
330 const Index result_dim = input_dim - kernel_dim + 1;
331 m_dimensions[index] = result_dim;
332 }
333 }
334
335 EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
336
337 EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
338 preloadKernel();
339 m_inputImpl.evalSubExprsIfNeeded(NULL);
340 if (data) {
341 executeEval(data);
342 return false;
343 } else {
344 m_buf = (EvaluatorPointerType)m_device.get(
345 (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
346 executeEval(m_buf);
347 return true;
348 }
349 }
350
351 EIGEN_STRONG_INLINE void cleanup() {
352 m_inputImpl.cleanup();
353 if (m_buf) {
354 m_device.deallocate_temp(m_buf);
355 m_buf = NULL;
356 }
357 if (m_local_kernel) {
358 m_device.deallocate_temp(m_kernel);
359 m_local_kernel = false;
360 }
361 m_kernel = NULL;
362 }
364 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
366 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
367
368 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
369 // Don't make a local copy of the kernel unless we have to (i.e. it's an
370 // expression that needs to be evaluated)
371 typename KernelStorage::Type in_place = m_kernelImpl.data();
372 if (in_place) {
373 m_kernel = in_place;
374 m_local_kernel = false;
375 } else {
376 ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
377 EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
378 typedef TensorEvalToOp<const KernelArgType> EvalTo;
379 EvalTo evalToTmp(m_device.get(local), m_kernelArg);
380 const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
381 internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
382 m_kernel = local;
383 m_local_kernel = true;
384 }
385 }
386
387 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
388 typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
389 typedef typename InputEvaluator::Dimensions InputDims;
390 switch (NumKernelDims) {
391 case 1: {
392 const size_t numX = dimensions()[m_indices[0]];
393 const size_t numP = dimensions().TotalSize() / numX;
394 const auto input_dim = std::array<size_t, 2>{numX, numP};
395 auto global_range = cl::sycl::range<2>{};
396 auto local_range = cl::sycl::range<2>{};
397 const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();
398
399 m_device.parallel_for_setup(input_dim, global_range, local_range);
400 const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
401 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
402 const array<Index, 1> indices{{m_indices[0]}};
403 const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
404 internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
405
406 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
407 typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
408 ConvKernel;
409
410 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
411 m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
412 indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
413 break;
414 }
415
416 case 2: {
417 auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
418 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
419 auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
420 (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
421 const size_t numX = dimensions()[m_indices[kernel_index[0]]];
422 const size_t numY = dimensions()[m_indices[kernel_index[1]]];
423 const size_t numP = dimensions().TotalSize() / (numX * numY);
424 auto input_dim = std::array<size_t, 3>{numX, numY, numP};
425
426 auto global_range = cl::sycl::range<3>{};
427 auto local_range = cl::sycl::range<3>{};
428
429 m_device.parallel_for_setup(input_dim, global_range, local_range);
430
431 const size_t local_memory_size =
432 (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
433 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
434 const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
435 const array<Index, 2> kernel_dims{
436 {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
437 internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
438 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
439 typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
440 ConvKernel;
441 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
442 m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
443 indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
444 break;
445 }
446
447 case 3: {
448 auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
449 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
450 static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};
451
452 auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
453 (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
454 (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
455
456 const size_t numX = dimensions()[m_indices[kernel_index[0]]];
457 const size_t numY = dimensions()[m_indices[kernel_index[1]]];
458 const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
459 auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
460 const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);
461
462 const array<Index, 3> indices{
463 {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
464 const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
465 m_kernelImpl.dimensions()[kernel_index[1]],
466 m_kernelImpl.dimensions()[kernel_index[2]]}};
467
468 internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
469
470 auto global_range = cl::sycl::range<3>{};
471 auto local_range = cl::sycl::range<3>{};
472
473 m_device.parallel_for_setup(input_dim, global_range, local_range);
474 auto local_memory_range = (local_range + kernel_size - 1);
475 const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
476
477 gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
478 typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
479 typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
480 ConvKernel;
481 m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
482 m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
483 indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
484 break;
485 }
486
487 default: {
488 EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
489 THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
490 }
491 }
492 }
493
494 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
495 eigen_assert(m_buf != NULL);
496 eigen_assert(index < m_dimensions.TotalSize());
497 return m_buf[index];
498 }
499
500 template <int LoadMode>
501 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
502 eigen_assert(m_buf != NULL);
503 eigen_assert(index < m_dimensions.TotalSize());
504 return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
505 }
506
507 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
508 // TODO(rmlarsen): FIXME: For now, this is just a copy of the CPU cost
509 // model.
510 const double kernel_size = m_kernelImpl.dimensions().TotalSize();
511 // We ignore the use of fused multiply-add.
512 const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
513 const double firstIndex_compute_cost =
514 NumDims *
515 (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
516 return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
517 kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
518 TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
519 }
520 // binding placeholder accessors to a command group handler for SYCL
521 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
522 m_kernelImpl.bind(cgh);
523 m_inputImpl.bind(cgh);
524 m_buf.bind(cgh);
525 m_kernel.bind(cgh);
526 }
527
528 private:
529 // No assignment (copies are needed by the kernels)
530 TensorEvaluator &operator=(const TensorEvaluator &);
531 TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
532 KernelArgType m_kernelArg;
533 TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
534 Indices m_indices;
535 Dimensions m_dimensions;
536 EvaluatorPointerType m_buf;
537 typename KernelStorage::Type m_kernel;
538 bool m_local_kernel;
539 const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
540}; // namespace Eigen
541
542} // end namespace Eigen
543
544#endif // EIGEN_CXX11_TENSOR_TENSOR_CONVOLUTION_H
Namespace containing all symbols from the Eigen library.
EIGEN_DEFAULT_DENSE_INDEX_TYPE Index