#ifndef UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP
#define UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP

namespace Eigen {
namespace TensorSycl {
namespace internal {
template <typename Op, typename CoeffReturnType, typename Index, bool Vectorizable>
struct OpDefiner {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, Vectorizable>::PacketReturnType PacketReturnType;
  typedef Op type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Op &op) { return op; }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &) {
    return accumulator;
  }
};
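
// For the mean reducer the accumulation is delegated to a plain SumReducer;
// finalise_op then divides the accumulated value by the number of reduced
// coefficients (scalar division below, packet division in the vectorised
// specialisation that follows).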
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, false> {
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType finalise_op(const CoeffReturnType &accumulator,
                                                                           const Index &scale) {
    ::Eigen::internal::scalar_quotient_op<CoeffReturnType> quotient_op;
    return quotient_op(accumulator, CoeffReturnType(scale));
  }
};
template <typename CoeffReturnType, typename Index>
struct OpDefiner<Eigen::internal::MeanReducer<CoeffReturnType>, CoeffReturnType, Index, true> {
  typedef typename Vectorise<CoeffReturnType, Eigen::SyclDevice, true>::PacketReturnType PacketReturnType;
  typedef Eigen::internal::SumReducer<CoeffReturnType> type;
  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE type get_op(Eigen::internal::MeanReducer<CoeffReturnType> &) {
    return type();
  }

  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType finalise_op(const PacketReturnType &accumulator,
                                                                            const Index &scale) {
    return ::Eigen::internal::pdiv(accumulator, ::Eigen::internal::pset1<PacketReturnType>(CoeffReturnType(scale)));
  }
};
template <typename CoeffReturnType, typename OpType, typename InputAccessor, typename OutputAccessor, typename Index,
          Index local_range>
struct SecondStepFullReducer {
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  typedef OpDefiner<OpType, CoeffReturnType, Index, true> OpDef;
  typedef typename OpDef::type Op;
  LocalAccessor scratch;
  InputAccessor aI;
  OutputAccessor outAcc;
  Op op;
  SecondStepFullReducer(LocalAccessor scratch_, InputAccessor aI_, OutputAccessor outAcc_, OpType op_)
      : scratch(scratch_), aI(aI_), outAcc(outAcc_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    const Index localid = itemID.get_local_id(0);
    auto aInPtr = aI.get_pointer() + localid;
    auto aOutPtr = outAcc.get_pointer();
    CoeffReturnType *scratchptr = scratch.get_pointer();
    CoeffReturnType accumulator = *aInPtr;

    scratchptr[localid] = op.finalize(accumulator);
    for (Index offset = itemID.get_local_range(0) / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratchptr[localid + offset], &accumulator);
        scratchptr[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) *aOutPtr = op.finalize(accumulator);
  }
};
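
// First phase of the full reduction. Each work-item accumulates a strided
// slice of the input (using packets when both the reducer and the input
// support packet access), the work-group tree-reduces the per-thread results
// in local memory, and one partial result per group is written out for
// SecondStepFullReducer to combine.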
template <typename Evaluator, typename OpType, typename Evaluator::Index local_range>
class FullReductionKernelFunctor {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, typename Evaluator::CoeffReturnType, Index,
                    (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
      OpDef;

  typedef typename OpDef::type Op;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::PacketReturnType PacketReturnType;
  typedef
      typename ::Eigen::internal::conditional<(Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess),
                                              PacketReturnType, CoeffReturnType>::type OutType;
  typedef cl::sycl::accessor<OutType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      LocalAccessor;
  LocalAccessor scratch;
  Evaluator evaluator;
  EvaluatorPointerType final_output;
  Index rng;
  Op op;

  FullReductionKernelFunctor(LocalAccessor scratch_, Evaluator evaluator_, EvaluatorPointerType final_output_,
                             Index rng_, OpType op_)
      : scratch(scratch_), evaluator(evaluator_), final_output(final_output_), rng(rng_), op(OpDef::get_op(op_)) {}

  void operator()(cl::sycl::nd_item<1> itemID) { compute_reduction(itemID); }
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index VectorizedRange = (rng / Evaluator::PacketSize) * Evaluator::PacketSize;
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    Index step = Evaluator::PacketSize * itemID.get_global_range(0);
    Index start = Evaluator::PacketSize * globalid;
    // Vectorised part of the input: reduce PacketSize coefficients at a time.
    PacketReturnType packetAccumulator = op.template initializePacket<PacketReturnType>();
    for (Index i = start; i < VectorizedRange; i += step) {
      op.template reducePacket<PacketReturnType>(evaluator.impl().template packet<Unaligned>(i), &packetAccumulator);
    }
    globalid += VectorizedRange;
    // Scalar tail: remaining coefficients are widened to a packet before reduction.
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.template reducePacket<PacketReturnType>(
          ::Eigen::TensorSycl::internal::PacketWrapper<PacketReturnType, Evaluator::PacketSize>::convert_to_packet_type(
              evaluator.impl().coeff(i), op.initialize()),
          &packetAccumulator);
    }
    scratch[localid] = packetAccumulator =
        OpDef::finalise_op(op.template finalizePacket<PacketReturnType>(packetAccumulator), rng);
    // Tree reduction in local memory; local_range is guaranteed to be a power of 2.
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.template reducePacket<PacketReturnType>(scratch[localid + offset], &packetAccumulator);
        scratch[localid] = op.template finalizePacket<PacketReturnType>(packetAccumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] =
          op.finalizeBoth(op.initialize(), op.template finalizePacket<PacketReturnType>(packetAccumulator));
    }
  }
  template <bool Vect = (Evaluator::ReducerTraits::PacketAccess & Evaluator::InputPacketAccess)>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE typename ::Eigen::internal::enable_if<!Vect>::type compute_reduction(
      const cl::sycl::nd_item<1> &itemID) {
    auto output_ptr = final_output.get_pointer();
    Index globalid = itemID.get_global_id(0);
    Index localid = itemID.get_local_id(0);
    CoeffReturnType accumulator = op.initialize();
    for (Index i = globalid; i < rng; i += itemID.get_global_range(0)) {
      op.reduce(evaluator.impl().coeff(i), &accumulator);
    }
    scratch[localid] = accumulator = OpDef::finalise_op(op.finalize(accumulator), rng);

    // Tree reduction in local memory; local_range is guaranteed to be a power of 2.
    for (Index offset = local_range / 2; offset > 0; offset /= 2) {
      itemID.barrier(cl::sycl::access::fence_space::local_space);
      if (localid < offset) {
        op.reduce(scratch[localid + offset], &accumulator);
        scratch[localid] = op.finalize(accumulator);
      }
    }
    if (localid == 0) {
      output_ptr[itemID.get_group(0)] = op.finalize(accumulator);
    }
  }
};
template <typename Evaluator, typename OpType>
class GenericNondeterministicReducer {
 public:
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  template <typename Scratch>
  GenericNondeterministicReducer(Scratch, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType functor_,
                                 Index range_, Index num_values_to_reduce_)
      : evaluator(evaluator_),
        output_accessor(output_accessor_),
        functor(OpDef::get_op(functor_)),
        range(range_),
        num_values_to_reduce(num_values_to_reduce_) {}

  void operator()(cl::sycl::nd_item<1> itemID) {
    auto output_accessor_ptr = output_accessor.get_pointer();
    Index globalid = static_cast<Index>(itemID.get_global_linear_id());
    if (globalid < range) {
      CoeffReturnType accum = functor.initialize();
      Eigen::internal::GenericDimReducer<Evaluator::NumReducedDims - 1, Evaluator, Op>::reduce(
          evaluator, evaluator.firstInput(globalid), functor, &accum);
      output_accessor_ptr[globalid] = OpDef::finalise_op(functor.finalize(accum), num_values_to_reduce);
    }
  }

 private:
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op functor;
  Index range;
  Index num_values_to_reduce;
};
enum class reduction_dim { inner_most, outer_most };
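
// First phase of a partial (per-dimension) reduction. Each work-group owns a
// tile of LocalThreadSizeP preserved coefficients by LocalThreadSizeR reduced
// coefficients; when BC is set, every scratch row is padded by one element.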
template <typename Evaluator, typename OpType, typename PannelParameters, reduction_dim rt>
struct PartialReductionKernel {
  typedef typename Evaluator::CoeffReturnType CoeffReturnType;
  typedef typename Evaluator::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Evaluator::Index Index;
  typedef OpDefiner<OpType, CoeffReturnType, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<CoeffReturnType, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAcc;
  ScratchAcc scratch;
  Evaluator evaluator;
  EvaluatorPointerType output_accessor;
  Op op;
  const Index preserve_elements_num_groups;
  const Index reduce_elements_num_groups;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  PartialReductionKernel(ScratchAcc scratch_, Evaluator evaluator_, EvaluatorPointerType output_accessor_, OpType op_,
                         const Index preserve_elements_num_groups_, const Index reduce_elements_num_groups_,
                         const Index num_coeffs_to_preserve_, const Index num_coeffs_to_reduce_)
      : scratch(scratch_),
        evaluator(evaluator_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        preserve_elements_num_groups(preserve_elements_num_groups_),
        reduce_elements_num_groups(reduce_elements_num_groups_),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void element_wise_reduce(Index globalRId, Index globalPId,
                                                                 CoeffReturnType &accumulator) {
    if (globalPId >= num_coeffs_to_preserve) {
      return;
    }
    Index global_offset = rt == reduction_dim::outer_most ? globalPId + (globalRId * num_coeffs_to_preserve)
                                                          : globalRId + (globalPId * num_coeffs_to_reduce);
    Index localOffset = globalRId;

    const Index per_thread_local_stride = PannelParameters::LocalThreadSizeR * reduce_elements_num_groups;
    const Index per_thread_global_stride =
        rt == reduction_dim::outer_most ? num_coeffs_to_preserve * per_thread_local_stride : per_thread_local_stride;
    for (Index i = globalRId; i < num_coeffs_to_reduce; i += per_thread_local_stride) {
      op.reduce(evaluator.impl().coeff(global_offset), &accumulator);
      localOffset += per_thread_local_stride;
      global_offset += per_thread_global_stride;
    }
  }
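
  // operator() maps the linear local id onto a (preserve, reduce) coordinate,
  // accumulates along the reduction dimension via element_wise_reduce, and
  // then tree-reduces the per-thread results across the reduce dimension of
  // the local-memory tile.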
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index linearLocalThreadId = itemID.get_local_id(0);
    Index pLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId % PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId / PannelParameters::LocalThreadSizeR;
    Index rLocalThreadId = rt == reduction_dim::outer_most ? linearLocalThreadId / PannelParameters::LocalThreadSizeP
                                                           : linearLocalThreadId % PannelParameters::LocalThreadSizeR;
    const Index pGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) % preserve_elements_num_groups
                                                           : itemID.get_group(0) / reduce_elements_num_groups;
    const Index rGroupId = rt == reduction_dim::outer_most ? itemID.get_group(0) / preserve_elements_num_groups
                                                           : itemID.get_group(0) % reduce_elements_num_groups;

    Index globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    const Index globalRId = rGroupId * PannelParameters::LocalThreadSizeR + rLocalThreadId;
    auto scratchPtr = scratch.get_pointer().get();
    auto outPtr =
        output_accessor.get_pointer() + (reduce_elements_num_groups > 1 ? rGroupId * num_coeffs_to_preserve : 0);
    CoeffReturnType accumulator = op.initialize();

    element_wise_reduce(globalRId, globalPId, accumulator);

    accumulator = OpDef::finalise_op(op.finalize(accumulator), num_coeffs_to_reduce);
    scratchPtr[pLocalThreadId + rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)] =
        accumulator;
    if (rt == reduction_dim::inner_most) {
      pLocalThreadId = linearLocalThreadId % PannelParameters::LocalThreadSizeP;
      rLocalThreadId = linearLocalThreadId / PannelParameters::LocalThreadSizeP;
      globalPId = pGroupId * PannelParameters::LocalThreadSizeP + pLocalThreadId;
    }

    auto out_scratch_ptr =
        scratchPtr + (pLocalThreadId + (rLocalThreadId * (PannelParameters::LocalThreadSizeP + PannelParameters::BC)));
    itemID.barrier(cl::sycl::access::fence_space::local_space);
    if (rt == reduction_dim::inner_most) {
      accumulator = *out_scratch_ptr;
    }
    // LocalThreadSizeR is always a power of 2.
    for (Index offset = PannelParameters::LocalThreadSizeR >> 1; offset > 0; offset >>= 1) {
      if (rLocalThreadId < offset) {
        op.reduce(out_scratch_ptr[(PannelParameters::LocalThreadSizeP + PannelParameters::BC) * offset], &accumulator);
        *out_scratch_ptr = op.finalize(accumulator);
      }
      itemID.barrier(cl::sycl::access::fence_space::local_space);
    }

    if (rLocalThreadId == 0 && (globalPId < num_coeffs_to_preserve)) {
      outPtr[globalPId] = op.finalize(accumulator);
    }
  }
};
template <typename OutScalar, typename Index, typename InputAccessor, typename OutputAccessor, typename OpType>
struct SecondStepPartialReduction {
  typedef OpDefiner<OpType, OutScalar, Index, false> OpDef;
  typedef typename OpDef::type Op;
  typedef cl::sycl::accessor<OutScalar, 1, cl::sycl::access::mode::read_write, cl::sycl::access::target::local>
      ScratchAccessor;
  InputAccessor input_accessor;
  OutputAccessor output_accessor;
  Op op;
  const Index num_coeffs_to_preserve;
  const Index num_coeffs_to_reduce;

  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE SecondStepPartialReduction(ScratchAccessor, InputAccessor input_accessor_,
                                                                   OutputAccessor output_accessor_, OpType op_,
                                                                   const Index num_coeffs_to_preserve_,
                                                                   const Index num_coeffs_to_reduce_)
      : input_accessor(input_accessor_),
        output_accessor(output_accessor_),
        op(OpDef::get_op(op_)),
        num_coeffs_to_preserve(num_coeffs_to_preserve_),
        num_coeffs_to_reduce(num_coeffs_to_reduce_) {}
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void operator()(cl::sycl::nd_item<1> itemID) {
    const Index globalId = itemID.get_global_id(0);

    if (globalId >= num_coeffs_to_preserve) return;

    auto in_ptr = input_accessor.get_pointer() + globalId;

    OutScalar accumulator = op.initialize();
    // Each thread walks down one preserved column of the partial-results matrix.
    for (Index i = 0; i < num_coeffs_to_reduce; i++) {
      op.reduce(*in_ptr, &accumulator);
      in_ptr += num_coeffs_to_preserve;
    }
    output_accessor.get_pointer()[globalId] = op.finalize(accumulator);
  }
};
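
// Compile-time tile configuration shared by PartialReductionKernel and
// PartialReducerLauncher.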
template <typename Index, Index LTP, Index LTR, bool BC_>
struct ReductionPannel {
  static EIGEN_CONSTEXPR Index LocalThreadSizeP = LTP;
  static EIGEN_CONSTEXPR Index LocalThreadSizeR = LTR;
  static EIGEN_CONSTEXPR bool BC = BC_;
};
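
// Host-side launcher for partial reductions. When the reduction dimension is
// large enough, the work is split over rNumGroups work-groups per preserved
// panel and a SecondStepPartialReduction kernel folds the per-group partial
// results; otherwise a single kernel writes the final values directly.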
template <typename Self, typename Op, TensorSycl::internal::reduction_dim rt>
struct PartialReducerLauncher {
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::Storage Storage;
  typedef typename Self::Index Index;
  typedef ReductionPannel<typename Self::Index, EIGEN_SYCL_LOCAL_THREAD_DIM0, EIGEN_SYCL_LOCAL_THREAD_DIM1, true>
      PannelParameters;

  typedef PartialReductionKernel<Self, Op, PannelParameters, rt> SyclReducerKerneType;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType output,
                  Index num_coeffs_to_reduce, Index num_coeffs_to_preserve) {
    Index roundUpP = roundUp(num_coeffs_to_preserve, PannelParameters::LocalThreadSizeP);

    static_assert(!((PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR) &
                    (PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");

    EIGEN_CONSTEXPR Index localRange = PannelParameters::LocalThreadSizeP * PannelParameters::LocalThreadSizeR;

    const Index reductionPerThread = 64;
    Index cu = dev.getPowerOfTwo(dev.getNumSyclMultiProcessors(), true);
    const Index pNumGroups = roundUpP / PannelParameters::LocalThreadSizeP;
    Index rGroups = (cu + pNumGroups - 1) / pNumGroups;
    const Index rNumGroups = num_coeffs_to_reduce > reductionPerThread * localRange ? std::min(rGroups, localRange) : 1;
    const Index globalRange = pNumGroups * rNumGroups * localRange;

    EIGEN_CONSTEXPR Index scratchSize =
        PannelParameters::LocalThreadSizeR * (PannelParameters::LocalThreadSizeP + PannelParameters::BC);
    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(globalRange), cl::sycl::range<1>(localRange));
    if (rNumGroups > 1) {
      CoeffReturnType *temp_pointer = static_cast<CoeffReturnType *>(
          dev.allocate_temp(num_coeffs_to_preserve * rNumGroups * sizeof(CoeffReturnType)));
      EvaluatorPointerType temp_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, temp_accessor, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);

      typedef SecondStepPartialReduction<CoeffReturnType, Index, EvaluatorPointerType, EvaluatorPointerType, Op>
          SecondStepPartialReductionKernel;

      dev.template unary_kernel_launcher<CoeffReturnType, SecondStepPartialReductionKernel>(
          temp_accessor, output,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(pNumGroups * localRange), cl::sycl::range<1>(localRange)), Index(1),
          reducer, num_coeffs_to_preserve, rNumGroups);

      self.device().deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<CoeffReturnType, SyclReducerKerneType>(
          self, output, thread_range, scratchSize, reducer, pNumGroups, rNumGroups, num_coeffs_to_preserve,
          num_coeffs_to_reduce);
    }
    return false;
  }
};
}  // namespace internal
}  // namespace TensorSycl

namespace internal {

template <typename Self, typename Op, bool Vectorizable>
struct FullReducer<Self, Op, Eigen::SyclDevice, Vectorizable> {
  typedef typename Self::CoeffReturnType CoeffReturnType;
  typedef typename Self::EvaluatorPointerType EvaluatorPointerType;
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;
  static EIGEN_CONSTEXPR int PacketSize = Self::PacketAccess ? Self::PacketSize : 1;
  static void run(const Self &self, Op &reducer, const Eigen::SyclDevice &dev, EvaluatorPointerType data) {
    typedef typename conditional<Self::PacketAccess, typename Self::PacketReturnType, CoeffReturnType>::type OutType;
    static_assert(!((EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1) &
                    (EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1 - 1)),
                  "The Local thread size must be a power of 2 for the reduction "
                  "operation");
    EIGEN_CONSTEXPR Index local_range = EIGEN_SYCL_LOCAL_THREAD_DIM0 * EIGEN_SYCL_LOCAL_THREAD_DIM1;

    typename Self::Index inputSize = self.impl().dimensions().TotalSize();
    // The number of work-groups is chosen so that each thread reduces roughly
    // reductionPerThread coefficients, and is capped at local_range so that the
    // partial results always fit into a single second-step work-group.
    const Index reductionPerThread = 2048;
    Index reductionGroup = dev.getPowerOfTwo(
        (inputSize + (reductionPerThread * local_range - 1)) / (reductionPerThread * local_range), true);
    const Index num_work_group = std::min(reductionGroup, local_range);
    const Index global_range = num_work_group * local_range;

    auto thread_range = cl::sycl::nd_range<1>(cl::sycl::range<1>(global_range), cl::sycl::range<1>(local_range));
    typedef TensorSycl::internal::FullReductionKernelFunctor<Self, Op, local_range> reduction_kernel_t;
    if (num_work_group > 1) {
      CoeffReturnType *temp_pointer =
          static_cast<CoeffReturnType *>(dev.allocate_temp(num_work_group * sizeof(CoeffReturnType)));
      typename Self::EvaluatorPointerType tmp_global_accessor = dev.get(temp_pointer);
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, tmp_global_accessor, thread_range,
                                                                      local_range, inputSize, reducer);

      typedef TensorSycl::internal::SecondStepFullReducer<CoeffReturnType, Op, EvaluatorPointerType,
                                                          EvaluatorPointerType, Index, local_range>
          GenericRKernel;
      dev.template unary_kernel_launcher<CoeffReturnType, GenericRKernel>(
          tmp_global_accessor, data,
          cl::sycl::nd_range<1>(cl::sycl::range<1>(num_work_group), cl::sycl::range<1>(num_work_group)),
          num_work_group, reducer);

      dev.deallocate_temp(temp_pointer);
    } else {
      dev.template unary_kernel_launcher<OutType, reduction_kernel_t>(self, data, thread_range, local_range, inputSize,
                                                                      reducer);
    }
  }
};
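
// Illustrative sketch (not part of the header): how a full reduction reaches
// the FullReducer specialisation above from the public Tensor API. Variable
// names such as `queueInterface`, `in_gpu` and `out_gpu` are hypothetical.
//
//   auto devices = Eigen::get_sycl_supported_devices();
//   Eigen::QueueInterface queueInterface(devices[0]);
//   Eigen::SyclDevice sycl_device(&queueInterface);
//
//   Eigen::Tensor<float, 2> in(256, 256);
//   in.setRandom();
//   float *gpu_in = static_cast<float *>(sycl_device.allocate(in.size() * sizeof(float)));
//   float *gpu_out = static_cast<float *>(sycl_device.allocate(sizeof(float)));
//   sycl_device.memcpyHostToDevice(gpu_in, in.data(), in.size() * sizeof(float));
//
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> in_gpu(gpu_in, 256, 256);
//   Eigen::TensorMap<Eigen::Tensor<float, 0>> out_gpu(gpu_out);
//   out_gpu.device(sycl_device) = in_gpu.sum();  // dispatches to FullReducer
//
//   float result;
//   sycl_device.memcpyDeviceToHost(&result, gpu_out, sizeof(float));
//   sycl_device.deallocate(gpu_in);
//   sycl_device.deallocate(gpu_out);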
template <typename Self, typename Op>
struct OuterReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::outer_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
template <typename Self, typename Op>
struct InnerReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = true;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_coeffs_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    return ::Eigen::TensorSycl::internal::PartialReducerLauncher<
        Self, Op, ::Eigen::TensorSycl::internal::reduction_dim::inner_most>::run(self, reducer, dev, output,
                                                                                 num_coeffs_to_reduce,
                                                                                 num_coeffs_to_preserve);
  }
};
template <typename Self, typename Op>
struct GenericReducer<Self, Op, Eigen::SyclDevice> {
  static EIGEN_CONSTEXPR bool HasOptimizedImplementation = false;

  static bool run(const Self &self, const Op &reducer, const Eigen::SyclDevice &dev,
                  typename Self::EvaluatorPointerType output, typename Self::Index num_values_to_reduce,
                  typename Self::Index num_coeffs_to_preserve) {
    typename Self::Index range, GRange, tileSize;
    dev.parallel_for_setup(num_coeffs_to_preserve, tileSize, range, GRange);

    dev.template unary_kernel_launcher<typename Self::CoeffReturnType,
                                       TensorSycl::internal::GenericNondeterministicReducer<Self, Op>>(
        self, output, cl::sycl::nd_range<1>(cl::sycl::range<1>(GRange), cl::sycl::range<1>(tileSize)), Index(1),
        reducer, range, (num_values_to_reduce != 0) ? num_values_to_reduce : static_cast<Index>(1));
    return false;
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // UNSUPPORTED_EIGEN_CXX11_SRC_TENSOR_TENSOR_REDUCTION_SYCL_HPP