TensorExecutor.h
// This file is part of Eigen, a lightweight C++ template library
// for linear algebra.
//
// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
//
// This Source Code Form is subject to the terms of the Mozilla
// Public License v. 2.0. If a copy of the MPL was not distributed
// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {

namespace internal {

// TODO(ezhulenev): Add specializations for all other types of Tensor ops.

template<typename Expression>
struct ExpressionHasTensorBroadcastingOp {
  enum { value = false };
};

template<typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorAssignOp<LhsXprType, RhsXprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<RhsXprType>::value };
};

template<typename UnaryOp, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseUnaryOp<UnaryOp, XprType> > {
  enum { value = ExpressionHasTensorBroadcastingOp<XprType>::value };
};

template<typename BinaryOp, typename LhsXprType, typename RhsXprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType> > {
  enum {
    value = ExpressionHasTensorBroadcastingOp<LhsXprType>::value ||
            ExpressionHasTensorBroadcastingOp<RhsXprType>::value
  };
};

template<typename Broadcast, typename XprType>
struct ExpressionHasTensorBroadcastingOp<
    const TensorBroadcastingOp<Broadcast, XprType> > {
  enum { value = true };
};

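// Illustrative note (not part of the original header): this trait walks the
// expression tree from the root TensorAssignOp through cwise nodes and reports
// whether a TensorBroadcastingOp appears anywhere on the right-hand side. A
// hedged sketch of how it would evaluate for a hypothetical assignment
// `dst = src.broadcast(b) * scale`:
//
//   // Xpr is roughly:
//   //   const TensorAssignOp<Dst,
//   //       const TensorCwiseBinaryOp<scalar_product_op<...>,
//   //           const TensorBroadcastingOp<Bcast, Src>, const Scale> >
//   // so ExpressionHasTensorBroadcastingOp<Xpr>::value == true: the binary-op
//   // specialization ORs the results for both operands, and the broadcasting
//   // specialization terminates the recursion with `true`.
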
// -------------------------------------------------------------------------- //

template <typename Expression, typename Device, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorExecutor {
 public:
  typedef typename Expression::Index StorageIndex;

  // Including `unsupported/Eigen/CXX11/Tensor` in different translation units
  // with/without `EIGEN_USE_THREADS` or `EIGEN_USE_GPU` is a potential ODR
  // violation. If this template is instantiated with a non-default device, it
  // means that this header file was included without defining
  // `EIGEN_USE_THREADS`, `EIGEN_USE_GPU` or `EIGEN_USE_SYCL`.
  static_assert(std::is_same<Device, DefaultDevice>::value,
                "Default executor instantiated with non-default device. "
                "You must #define EIGEN_USE_THREADS, EIGEN_USE_GPU or "
                "EIGEN_USE_SYCL before including Eigen headers.");

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      for (StorageIndex i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

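// Usage sketch (illustrative, not part of the original header): the executor
// is normally reached indirectly through tensor assignment rather than being
// called by hand. With no device macros defined, an ordinary assignment such
// as the one below is evaluated by the default single-threaded executor above:
//
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
//   a.setRandom();
//   b.setRandom();
//   c = a + b;  // internally builds a TensorAssignOp and runs TensorExecutor
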
template <typename Expression, typename Device, typename DoneCallback,
          bool Vectorizable, TiledEvaluation Tiling>
class TensorAsyncExecutor {};

template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, /*Vectorizable=*/true,
                     /*Tiling=*/TiledEvaluation::Off> {
 public:
  typedef typename Expression::Index StorageIndex;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(
      const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<
          Expression, DefaultDevice>::PacketReturnType>::size;

      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on unrolling: if evaluating the expression is expensive, the
      // compiler should not unroll the loop at the expense of inlining.
      const StorageIndex UnrolledSize =
          (size / (4 * PacketSize)) * 4 * PacketSize;
      for (StorageIndex i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const StorageIndex VectorizedSize = (size / PacketSize) * PacketSize;
      for (StorageIndex i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (StorageIndex i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};

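// Worked example (illustrative, not part of the original header): assuming
// float data with PacketSize == 8 (e.g. AVX) and size == 1000, the loop
// bounds above are
//
//   UnrolledSize   = (1000 / 32) * 32 = 992   -> 31 iterations of 4 packets each
//   VectorizedSize = (1000 /  8) *  8 = 1000  -> one extra packet at index 992
//   scalar tail    = none, since 1000 is a multiple of the packet size
//
// so 992 coefficients are handled by the unrolled packet loop, the remaining 8
// by the plain packet loop, and no scalar tail is left over.
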
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, DefaultDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  typedef TensorEvaluator<Expression, DefaultDevice> Evaluator;
  typedef typename traits<Expression>::Index StorageIndex;

  static const int NumDims = traits<Expression>::NumDimensions;

  EIGEN_DEVICE_FUNC
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const DefaultDevice& device = DefaultDevice()) {
    typedef TensorBlockMapper<NumDims, Evaluator::Layout, StorageIndex>
        TensorBlockMapper;

    typedef internal::TensorBlockDescriptor<NumDims, StorageIndex>
        TensorBlockDesc;
    typedef internal::TensorBlockScratchAllocator<DefaultDevice>
        TensorBlockScratch;

    Evaluator evaluator(expr, device);

    // TODO(ezhulenev): Do not use tiling for small tensors?
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);

    if (needs_assign) {
      // Query expression tree for desired block size/shape.
      const TensorBlockResourceRequirements requirements =
          evaluator.getResourceRequirements();

      const TensorBlockMapper block_mapper(
          typename TensorBlockDesc::Dimensions(evaluator.dimensions()),
          requirements);

      // Share scratch memory allocator between all blocks.
      TensorBlockScratch scratch(device);

      const StorageIndex total_block_count = block_mapper.blockCount();
      for (StorageIndex i = 0; i < total_block_count; ++i) {
        TensorBlockDesc desc = block_mapper.blockDescriptor(i);
        evaluator.evalBlock(desc, scratch);
        scratch.reset();
      }
    }
    evaluator.cleanup();
  }
};

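// Illustrative note (not part of the original header): for a hypothetical
// 1024 x 1024 expression whose evaluator requests blocks of roughly 128 x 128
// coefficients, the block mapper would split the output into
// (1024 / 128) * (1024 / 128) = 64 blocks, and the loop above evaluates them
// one at a time. The scratch allocator is shared across blocks, and
// scratch.reset() is called between blocks so that per-block temporary
// allocations can be reused. The actual block shape is whatever
// TensorBlockMapper derives from getResourceRequirements(), so the 128 x 128
// figure is only an assumed example.
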
#ifdef EIGEN_USE_THREADS

template <typename TensorBlockMapper>
struct TensorExecutorTilingContext {
  TensorExecutorTilingContext() = default;
  TensorExecutorTilingContext(const TensorBlockMapper& b_mapper,
                              const TensorOpCost& b_cost, size_t b_aligned_size)
      : block_mapper(b_mapper),
        cost(b_cost),
        aligned_blocksize(b_aligned_size) {}

  TensorBlockMapper block_mapper;  // navigate through blocks
  TensorOpCost cost;               // cost of computing a single block
  size_t aligned_blocksize;        // block size after memory alignment
};

// Computes block evaluation parameters and the aligned size of the temporary
// memory buffer needed for blocks. See TensorExecutor/TensorAsyncExecutor
// (Tiling=On) below.
template <typename Evaluator, typename TensorBlockMapper, bool Vectorizable>
TensorExecutorTilingContext<TensorBlockMapper> GetTensorExecutorTilingContext(
    const Evaluator& evaluator) {
  // Query expression tree for desired block size/shape.
  TensorBlockResourceRequirements requirements =
      evaluator.getResourceRequirements();

  // Update target block size based on cost model.
  double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize(
      1, requirements.cost_per_coeff);
  requirements.size = static_cast<size_t>(1.0 / taskSize);

  TensorBlockMapper block_mapper(
      typename TensorBlockMapper::Dimensions(evaluator.dimensions()),
      requirements);

  size_t block_size = block_mapper.blockTotalSize();
  const size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1);
  const size_t aligned_blocksize =
      align *
      divup<size_t>(block_size * sizeof(typename Evaluator::Scalar), align);

  return {block_mapper, requirements.cost_per_coeff * block_size,
          aligned_blocksize};
}

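// Worked example (illustrative, not part of the original header): assuming
// float blocks of block_size == 1000 coefficients and
// EIGEN_MAX_ALIGN_BYTES == 64, the aligned scratch size per block is
//
//   aligned_blocksize = 64 * divup(1000 * 4, 64) = 64 * divup(4000, 64)
//                     = 64 * 63 = 4032 bytes,
//
// i.e. the raw 4000-byte block buffer rounded up to the next multiple of the
// maximum alignment. The returned cost is cost_per_coeff scaled by the number
// of coefficients in one block, which parallelFor later uses to decide how
// many blocks to hand to each task.
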
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    for (StorageIndex i = firstIdx; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) { return size; }
};

template <typename Evaluator, typename StorageIndex>
struct EvalRange<Evaluator, StorageIndex, /*Vectorizable*/ true> {
  static const int PacketSize =
      unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const StorageIndex firstIdx,
                  const StorageIndex lastIdx) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(lastIdx >= firstIdx);
    StorageIndex i = firstIdx;
    if (lastIdx - firstIdx >= PacketSize) {
      eigen_assert(firstIdx % PacketSize == 0);
      StorageIndex last_chunk_offset = lastIdx - 4 * PacketSize;
      // Give the compiler a strong opportunity to unroll the loop, but don't
      // insist on unrolling: if evaluating the expression is expensive, the
      // compiler should not unroll the loop at the expense of inlining.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (StorageIndex j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = lastIdx - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < lastIdx; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static StorageIndex alignBlockSize(StorageIndex size) {
    // Align block size to packet size and account for unrolling in run above.
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};

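// Worked example (illustrative, not part of the original header): assuming
// PacketSize == 8, alignBlockSize behaves as follows:
//
//   size = 200: 200 >= 16 * 8, so round up to a multiple of 32:
//               (200 + 31) & ~31 = 224
//   size = 100: 100 <  16 * 8, so round up only to a multiple of 8:
//               (100 + 7) & ~7 = 104
//
// The 16 * PacketSize threshold keeps the rounding overhead below roughly 25%
// of the requested block size while still letting run() use its 4-packet
// unrolled loop on larger blocks.
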
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;

    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const StorageIndex size = array_prod(evaluator.dimensions());
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange::alignBlockSize,
                         [&evaluator](StorageIndex firstIdx, StorageIndex lastIdx) {
                           EvalRange::run(&evaluator, firstIdx, lastIdx);
                         });
    }
    evaluator.cleanup();
  }
};

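// Usage sketch (illustrative, not part of the original header): with
// EIGEN_USE_THREADS defined before including the Tensor module, evaluating an
// expression through a ThreadPoolDevice dispatches to the executor above,
// which splits the flat index range across the pool via parallelFor:
//
//   #define EIGEN_USE_THREADS
//   #include <unsupported/Eigen/CXX11/Tensor>
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> a(512, 512), b(512, 512), c(512, 512);
//   a.setRandom();
//   b.setRandom();
//   c.device(device) = a * a + b;  // evaluated in parallel on the pool
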
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable,
                     /*Tiling=*/TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType>
      TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const ThreadPoolDevice& device) {
    Evaluator evaluator(expr, device);

    const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
    if (needs_assign) {
      const TilingContext tiling =
          internal::GetTensorExecutorTilingContext<Evaluator, BlockMapper,
                                                   Vectorizable>(evaluator);

      auto eval_block = [&device, &evaluator, &tiling](IndexType firstBlockIdx,
                                                       IndexType lastBlockIdx) {
        TensorBlockScratch scratch(device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc = tiling.block_mapper.blockDescriptor(block_idx);
          evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(device);
        TensorBlockDesc desc(0, tiling.block_mapper.blockDimensions());
        evaluator.evalBlock(desc, scratch);
      } else {
        device.parallelFor(tiling.block_mapper.blockCount(), tiling.cost,
                           eval_block);
      }
    }
    evaluator.cleanup();
  }
};

template <typename Expression, typename DoneCallback, bool Vectorizable,
          TiledEvaluation Tiling>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx, &device](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      typedef EvalRange<Evaluator, StorageIndex, Vectorizable> EvalRange;
      const StorageIndex size = array_prod(ctx->evaluator.dimensions());
      device.parallelForAsync(
          size, ctx->evaluator.costPerCoeff(Vectorizable),
          EvalRange::alignBlockSize,
          [ctx](StorageIndex firstIdx, StorageIndex lastIdx) {
            EvalRange::run(&ctx->evaluator, firstIdx, lastIdx);
          },
          [ctx]() { delete ctx; });
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : evaluator(expr, thread_pool), on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    Evaluator evaluator;

   private:
    DoneCallback on_done;
  };
};

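// Usage sketch (illustrative, not part of the original header): the async
// executor is driven through the asynchronous device API, assuming the
// matching TensorAsyncDevice entry point (the two-argument overload of
// TensorBase::device) is available in this version of Eigen:
//
//   Eigen::ThreadPool pool(4);
//   Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);
//   Eigen::Tensor<float, 2> a(512, 512), c(512, 512);
//   a.setRandom();
//   Eigen::Barrier done(1);
//   c.device(device, [&done]() { done.Notify(); }) = a.square();
//   done.Wait();  // runAsync() invokes the callback after cleanup()
//
// The heap-allocated TensorAsyncExecutorContext keeps the evaluator alive
// until the last parallelForAsync task finishes; its destructor runs
// cleanup() and then the user-provided done callback.
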
template <typename Expression, typename DoneCallback, bool Vectorizable>
class TensorAsyncExecutor<Expression, ThreadPoolDevice, DoneCallback,
                          Vectorizable, /*Tileable*/ TiledEvaluation::On> {
 public:
  typedef typename traits<Expression>::Index IndexType;
  typedef typename traits<Expression>::Scalar Scalar;
  typedef typename remove_const<Scalar>::type ScalarNoConst;

  static const int NumDims = traits<Expression>::NumDimensions;

  typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
  typedef TensorBlockMapper<NumDims, Evaluator::Layout, IndexType> BlockMapper;
  typedef TensorExecutorTilingContext<BlockMapper> TilingContext;

  typedef internal::TensorBlockDescriptor<NumDims, IndexType> TensorBlockDesc;
  typedef internal::TensorBlockScratchAllocator<ThreadPoolDevice>
      TensorBlockScratch;

  static EIGEN_STRONG_INLINE void runAsync(const Expression& expr,
                                           const ThreadPoolDevice& device,
                                           DoneCallback done) {
    TensorAsyncExecutorContext* const ctx =
        new TensorAsyncExecutorContext(expr, device, std::move(done));

    const auto on_eval_subexprs = [ctx](bool need_assign) -> void {
      if (!need_assign) {
        delete ctx;
        return;
      }

      ctx->tiling = internal::GetTensorExecutorTilingContext<
          Evaluator, BlockMapper, Vectorizable>(ctx->evaluator);

      auto eval_block = [ctx](IndexType firstBlockIdx, IndexType lastBlockIdx) {
        TensorBlockScratch scratch(ctx->device);

        for (IndexType block_idx = firstBlockIdx; block_idx < lastBlockIdx;
             ++block_idx) {
          TensorBlockDesc desc =
              ctx->tiling.block_mapper.blockDescriptor(block_idx);
          ctx->evaluator.evalBlock(desc, scratch);
          scratch.reset();
        }
      };

      // Evaluate small expressions directly as a single block.
      if (ctx->tiling.block_mapper.blockCount() == 1) {
        TensorBlockScratch scratch(ctx->device);
        TensorBlockDesc desc(0, ctx->tiling.block_mapper.blockDimensions());
        ctx->evaluator.evalBlock(desc, scratch);
        delete ctx;
      } else {
        ctx->device.parallelForAsync(ctx->tiling.block_mapper.blockCount(),
                                     ctx->tiling.cost, eval_block,
                                     [ctx]() { delete ctx; });
      }
    };

    ctx->evaluator.evalSubExprsIfNeededAsync(nullptr, on_eval_subexprs);
  }

 private:
  struct TensorAsyncExecutorContext {
    TensorAsyncExecutorContext(const Expression& expr,
                               const ThreadPoolDevice& thread_pool,
                               DoneCallback done)
        : device(thread_pool),
          evaluator(expr, thread_pool),
          on_done(std::move(done)) {}

    ~TensorAsyncExecutorContext() {
      evaluator.cleanup();
      on_done();
    }

    const ThreadPoolDevice& device;
    Evaluator evaluator;
    TilingContext tiling;

   private:
    DoneCallback on_done;
  };
};

#endif  // EIGEN_USE_THREADS

// GPU: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index StorageIndex;
  static void run(const Expression& expr, const GpuDevice& device);
};

#if defined(EIGEN_GPUCC)
template <typename Evaluator, typename StorageIndex, bool Vectorizable>
struct EigenMetaKernelEval {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    for (StorageIndex i = firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
struct EigenMetaKernelEval<Evaluator, StorageIndex, true> {
  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, StorageIndex firstIdx, StorageIndex lastIdx, StorageIndex step_size) {
    const StorageIndex PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const StorageIndex vectorized_size = (lastIdx / PacketSize) * PacketSize;
    const StorageIndex vectorized_step_size = step_size * PacketSize;

    // Use the vector path
    for (StorageIndex i = firstIdx * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (StorageIndex i = vectorized_size + firstIdx; i < lastIdx; i += step_size) {
      eval.evalScalar(i);
    }
  }
};

template <typename Evaluator, typename StorageIndex>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, StorageIndex size) {

  const StorageIndex first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const StorageIndex step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, StorageIndex, vectorizable>::run(eval, first_index, size, step_size);
}

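// Illustrative note (not part of the original header): EigenMetaKernel uses a
// grid-stride loop. For an assumed launch of 160 blocks of 1024 threads over a
// tensor of 1,000,000 coefficients (matching the worked launch-configuration
// example further below), thread t starts at flat index t and then advances by
// step_size = 160 * 1024 = 163,840 each iteration, so every thread evaluates
// 6 or 7 coefficients. In the vectorized specialization the same pattern runs
// over packet indices (starting at first_index * PacketSize with stride
// step_size * PacketSize), and the final scalar loop picks up the remaining
// fewer-than-PacketSize coefficients.
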
/*static*/
template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
EIGEN_STRONG_INLINE void TensorExecutor<Expression, GpuDevice, Vectorizable, Tiling>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(nullptr);
  if (needs_assign) {

    const int block_size = device.maxGpuThreadsPerBlock();
    const int max_blocks = device.getNumGpuMultiProcessors() *
                           device.maxGpuThreadsPerMultiProcessor() / block_size;
    const StorageIndex size = array_prod(evaluator.dimensions());
    // Create at least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_GPU_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, StorageIndex>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}

#endif  // EIGEN_GPUCC
#endif  // EIGEN_USE_GPU

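// Worked example (illustrative, not part of the original header): on an
// assumed device with maxGpuThreadsPerBlock() == 1024, 80 multiprocessors and
// maxGpuThreadsPerMultiProcessor() == 2048, the launch configuration for a
// tensor of size == 1,000,000 would be
//
//   block_size = 1024
//   max_blocks = 80 * 2048 / 1024 = 160
//   num_blocks = max(min(160, divup(1,000,000, 1024)), 1)
//              = max(min(160, 977), 1) = 160
//
// and for size == 0 the outer max(..., 1) still launches a single block, whose
// grid-stride loop in EigenMetaKernel then exits immediately.
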
// SYCL Executor policy
#ifdef EIGEN_USE_SYCL

template <typename Evaluator>
struct ExecExprFunctorKernel {
  typedef typename Evaluator::Index Index;
  Evaluator evaluator;
  const Index range;
  template <typename Scratch>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE ExecExprFunctorKernel(
      const Scratch, Evaluator evaluator_, const Index range_)
      : evaluator(evaluator_), range(range_) {}

  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void operator()(
      cl::sycl::nd_item<1> itemID) {
    compute(itemID);
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<!is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    Index total_threads = itemID.get_global_range(0);

    for (Index i = gId; i < range; i += total_threads) {
      evaluator.evalScalar(i);
    }
  }
  template <bool is_vec = Evaluator::PacketAccess>
  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE typename std::enable_if<is_vec>::type
  compute(const cl::sycl::nd_item<1>& itemID) {
    const Index vectorizedRange =
        (range / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index gId = static_cast<Index>(itemID.get_global_linear_id());
    const Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    const Index start = Evaluator::PacketSize * gId;
    for (Index i = start; i < vectorizedRange; i += step) {
      evaluator.evalPacket(i);
    }
    gId += vectorizedRange;
    for (Index i = gId; i < range; i += itemID.get_global_range(0)) {
      evaluator.evalScalar(i);
    }
  }
};

template <typename Expression, bool Vectorizable, TiledEvaluation Tiling>
class TensorExecutor<Expression, Eigen::SyclDevice, Vectorizable, Tiling> {
 public:
  typedef typename Expression::Index Index;
  static EIGEN_STRONG_INLINE void run(const Expression& expr,
                                      const Eigen::SyclDevice& dev) {
    typedef Eigen::TensorEvaluator<Expression, Eigen::SyclDevice> Evaluator;
    Evaluator evaluator(expr, dev);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      Index range, GRange, tileSize;
      Index total_size = ::Eigen::internal::array_prod(evaluator.dimensions());
      total_size = (total_size == 0) ? 1 : total_size;
      const int PacketSize =
          Eigen::PacketType<typename Evaluator::CoeffReturnType,
                            Eigen::SyclDevice>::size;
      Index vectorizable_threads = static_cast<Index>(total_size / PacketSize);
      dev.parallel_for_setup(vectorizable_threads, tileSize, range, GRange);
      range = total_size;

      dev.template nullary_kernel_launcher<
          typename Evaluator::CoeffReturnType,
          ExecExprFunctorKernel<Evaluator> >(
          evaluator,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange),
                                cl::sycl::range<1>(tileSize)),
          Index(1), range);
    }
    evaluator.cleanup();
  }
};

#endif

}  // end namespace internal

}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H